Merge pull request #61 from chrisps/canary_experimental
performance improvements, kernel fixes, cpu accuracy improvements

commit 0b013fdc6b

@@ -979,7 +979,12 @@ void EmulatorWindow::ToggleDisplayConfigDialog() {
}

void EmulatorWindow::ToggleControllerVibration() {
  emulator()->input_system()->ToggleVibration();
  auto input_sys = emulator()->input_system();
  if (input_sys) {
    auto input_lock = input_sys->lock();

    input_sys->ToggleVibration();
  }
}

void EmulatorWindow::ShowCompatibility() {

@@ -91,7 +91,7 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
}

bool XmaContext::Work() {
  std::lock_guard<std::mutex> lock(lock_);
  std::lock_guard<xe_mutex> lock(lock_);
  if (!is_allocated() || !is_enabled()) {
    return false;
  }

@@ -106,7 +106,7 @@ bool XmaContext::Work() {
}

void XmaContext::Enable() {
  std::lock_guard<std::mutex> lock(lock_);
  std::lock_guard<xe_mutex> lock(lock_);

  auto context_ptr = memory()->TranslateVirtual(guest_ptr());
  XMA_CONTEXT_DATA data(context_ptr);

@@ -134,7 +134,7 @@ bool XmaContext::Block(bool poll) {
}

void XmaContext::Clear() {
  std::lock_guard<std::mutex> lock(lock_);
  std::lock_guard<xe_mutex> lock(lock_);
  XELOGAPU("XmaContext: reset context {}", id());

  auto context_ptr = memory()->TranslateVirtual(guest_ptr());

@@ -151,14 +151,14 @@ void XmaContext::Clear() {
}

void XmaContext::Disable() {
  std::lock_guard<std::mutex> lock(lock_);
  std::lock_guard<xe_mutex> lock(lock_);
  XELOGAPU("XmaContext: disabling context {}", id());
  set_is_enabled(false);
}

void XmaContext::Release() {
  // Lock it in case the decoder thread is working on it now.
  std::lock_guard<std::mutex> lock(lock_);
  std::lock_guard<xe_mutex> lock(lock_);
  assert_true(is_allocated_ == true);

  set_is_allocated(false);

@@ -200,7 +200,7 @@ class XmaContext {

  uint32_t id_ = 0;
  uint32_t guest_ptr_ = 0;
  std::mutex lock_;
  xe_mutex lock_;
  bool is_allocated_ = false;
  bool is_enabled_ = false;
  // bool is_dirty_ = true;

@@ -15,6 +15,13 @@

#include "xenia/base/assert.h"
#include "xenia/base/math.h"
#include "xenia/base/mutex.h"

#if defined(_WIN32)
#include "xenia/base/platform_win.h"
#endif

DEFINE_bool(clock_no_scaling, false,
            "Disable scaling code. Time management and locking is bypassed. "

@@ -42,8 +49,13 @@ std::pair<uint64_t, uint64_t> guest_tick_ratio_ = std::make_pair(1, 1);
uint64_t last_guest_tick_count_ = 0;
// Last sampled host tick count.
uint64_t last_host_tick_count_ = Clock::QueryHostTickCount();

using tick_mutex_type = xe_unlikely_mutex;

// Mutex to ensure last_host_tick_count_ and last_guest_tick_count_ are in sync
std::mutex tick_mutex_;
// std::mutex tick_mutex_;
static tick_mutex_type tick_mutex_;

void RecomputeGuestTickScalar() {
  // Create a rational number with numerator (first) and denominator (second)

@@ -61,7 +73,7 @@ void RecomputeGuestTickScalar() {
  // Keep this a rational calculation and reduce the fraction
  reduce_fraction(frac);

  std::lock_guard<std::mutex> lock(tick_mutex_);
  std::lock_guard<tick_mutex_type> lock(tick_mutex_);
  guest_tick_ratio_ = frac;
}

@@ -75,7 +87,7 @@ uint64_t UpdateGuestClock() {
    return host_tick_count * guest_tick_ratio_.first / guest_tick_ratio_.second;
  }

  std::unique_lock<std::mutex> lock(tick_mutex_, std::defer_lock);
  std::unique_lock<tick_mutex_type> lock(tick_mutex_, std::defer_lock);
  if (lock.try_lock()) {
    // Translate host tick count to guest tick count.
    uint64_t host_tick_delta = host_tick_count > last_host_tick_count_

@@ -107,7 +119,6 @@ inline uint64_t QueryGuestSystemTimeOffset() {

  return guest_tick_count * numerator / denominator;
}

uint64_t Clock::QueryHostTickFrequency() {
#if XE_CLOCK_RAW_AVAILABLE
  if (cvars::clock_source_raw) {

@@ -137,7 +148,7 @@ void Clock::set_guest_time_scalar(double scalar) {
}

std::pair<uint64_t, uint64_t> Clock::guest_tick_ratio() {
  std::lock_guard<std::mutex> lock(tick_mutex_);
  std::lock_guard<tick_mutex_type> lock(tick_mutex_);
  return guest_tick_ratio_;
}

@@ -159,6 +170,7 @@ uint64_t Clock::QueryGuestTickCount() {
  return guest_tick_count;
}

uint64_t* Clock::GetGuestTickCountPointer() { return &last_guest_tick_count_; }
uint64_t Clock::QueryGuestSystemTime() {
  if (cvars::clock_no_scaling) {
    return Clock::QueryHostSystemTime();

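A note on the rational scaling above: keeping the guest:host tick ratio as a reduced numerator/denominator pair rather than a floating-point scalar avoids cumulative rounding drift when host ticks are translated to guest ticks. A minimal standalone sketch of the same idea, with hypothetical names (the real logic lives in RecomputeGuestTickScalar() and UpdateGuestClock() above):

#include <cstdint>
#include <numeric>
#include <utility>

// Reduce guest_freq/host_freq to lowest terms, mirroring reduce_fraction().
std::pair<uint64_t, uint64_t> MakeTickRatio(uint64_t guest_freq,
                                            uint64_t host_freq) {
  uint64_t g = std::gcd(guest_freq, host_freq);
  return {guest_freq / g, host_freq / g};
}

// Multiply before dividing so truncation error stays below one guest tick.
// (Overflow handling is ignored here; the real code works on tick deltas.)
uint64_t HostTicksToGuestTicks(uint64_t host_ticks,
                               const std::pair<uint64_t, uint64_t>& ratio) {
  return host_ticks * ratio.first / ratio.second;
}
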
@@ -33,11 +33,15 @@ class Clock {
  // Either from platform suplied time source or from hardware directly.
  static uint64_t host_tick_frequency_platform();
#if XE_CLOCK_RAW_AVAILABLE
  XE_NOINLINE
  static uint64_t host_tick_frequency_raw();
#endif
  // Host tick count. Generally QueryHostTickCount() should be used.
  static uint64_t host_tick_count_platform();
#if XE_CLOCK_RAW_AVAILABLE
  //chrispy: the way msvc was ordering the branches was causing rdtsc to be speculatively executed each time
  //the branch history was lost
  XE_NOINLINE
  static uint64_t host_tick_count_raw();
#endif

@@ -70,6 +74,8 @@ class Clock {
  // Queries the current guest tick count, accounting for frequency adjustment
  // and scaling.
  static uint64_t QueryGuestTickCount();

  static uint64_t* GetGuestTickCountPointer();
  // Queries the guest time, in FILETIME format, accounting for scaling.
  static uint64_t QueryGuestSystemTime();
  // Queries the milliseconds since the guest began, accounting for scaling.

@@ -12,7 +12,17 @@
#include "xenia/base/platform_win.h"

namespace xe {
#if XE_USE_KUSER_SHARED == 1
uint64_t Clock::host_tick_frequency_platform() { return 10000000ULL; }

uint64_t Clock::host_tick_count_platform() {
  return *reinterpret_cast<volatile uint64_t*>(GetKUserSharedSystemTime());
}
uint64_t Clock::QueryHostSystemTime() {
  return *reinterpret_cast<volatile uint64_t*>(GetKUserSharedSystemTime());
}

#else
uint64_t Clock::host_tick_frequency_platform() {
  LARGE_INTEGER frequency;
  QueryPerformanceFrequency(&frequency);

@@ -27,13 +37,13 @@ uint64_t Clock::host_tick_count_platform() {
  }
  return time;
}

uint64_t Clock::QueryHostSystemTime() {
  FILETIME t;
  GetSystemTimeAsFileTime(&t);
  return (uint64_t(t.dwHighDateTime) << 32) | t.dwLowDateTime;
}

#endif
uint64_t Clock::QueryHostUptimeMillis() {
  return host_tick_count_platform() * 1000 / host_tick_frequency_platform();
}

@@ -41,10 +41,14 @@
    "\n" \
    "Set the cvar 'clock_source_raw' to 'false'.");

namespace xe {
// Getting the TSC frequency can be a bit tricky. This method here only works on
// Intel as it seems. There is no easy way to get the frequency outside of ring0
// on AMD, so we fail gracefully if not possible.
XE_NOINLINE
uint64_t Clock::host_tick_frequency_raw() {
  uint32_t eax, ebx, ecx, edx;

@@ -71,6 +75,8 @@ uint64_t Clock::host_tick_frequency_raw() {
    return 0;
  }

  if (max_cpuid >= 0x15) {
    // 15H Get TSC/Crystal ratio and Crystal Hz.
    xe_cpu_cpuid(0x15, eax, ebx, ecx, edx);

@@ -92,10 +98,11 @@ uint64_t Clock::host_tick_frequency_raw() {
    return cpu_base_freq;
  }

  CLOCK_FATAL("The clock frequency could not be determined.");
  return 0;
}

XE_NOINLINE
uint64_t Clock::host_tick_count_raw() { return xe_cpu_rdtsc(); }

} // namespace xe

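For context on the 15H path in the hunk above: CPUID leaf 0x15 reports the TSC/crystal ratio in EAX (denominator) and EBX (numerator) and the crystal frequency in ECX, so the TSC rate is crystal_hz * EBX / EAX. A hedged sketch using MSVC's __cpuid in place of the project's xe_cpu_cpuid wrapper; any of the three fields may read as zero, in which case the caller must fall back, as the real function does:

#include <intrin.h>
#include <cstdint>

uint64_t QueryTscHzViaCpuid15() {
  int regs[4];  // eax, ebx, ecx, edx
  __cpuid(regs, 0x15);
  uint32_t denominator = static_cast<uint32_t>(regs[0]);
  uint32_t numerator = static_cast<uint32_t>(regs[1]);
  uint32_t crystal_hz = static_cast<uint32_t>(regs[2]);
  if (!denominator || !numerator || !crystal_hz) {
    return 0;  // not reported; caller must use another method
  }
  return static_cast<uint64_t>(crystal_hz) * numerator / denominator;
}
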
@@ -19,7 +19,7 @@ namespace xe {

// TODO(Triang3l): Set the default depending on the actual subsystem. Currently
// it inhibits message boxes.
static bool has_console_attached_ = true;
static bool has_console_attached_ = false;

bool has_console_attached() { return has_console_attached_; }

@@ -78,17 +78,25 @@ std::pair<char*, size_t> GetThreadBuffer();
void AppendLogLine(LogLevel log_level, const char prefix_char, size_t written);

} // namespace internal

// Appends a line to the log with {fmt}-style formatting.
template <typename... Args>
void AppendLogLineFormat(LogLevel log_level, const char prefix_char,
XE_NOINLINE XE_COLD static void AppendLogLineFormat_Impl(LogLevel log_level,
                                                         const char prefix_char,
                                                         const char* format,
                                                         const Args&... args) {
  auto target = internal::GetThreadBuffer();
  auto result = fmt::format_to_n(target.first, target.second, format, args...);
  internal::AppendLogLine(log_level, prefix_char, result.size);
}

// Appends a line to the log with {fmt}-style formatting.
//chrispy: inline the initial check, outline the append. the append should happen rarely for end users
template <typename... Args>
XE_FORCEINLINE static void AppendLogLineFormat(LogLevel log_level, const char prefix_char,
                                               const char* format, const Args&... args) {
  if (!internal::ShouldLog(log_level)) {
    return;
  }
  auto target = internal::GetThreadBuffer();
  auto result = fmt::format_to_n(target.first, target.second, format, args...);
  internal::AppendLogLine(log_level, prefix_char, result.size);
  AppendLogLineFormat_Impl(log_level, prefix_char, format, args...);
}

// Appends a line to the log.

@@ -98,18 +106,19 @@ void AppendLogLine(LogLevel log_level, const char prefix_char,
} // namespace logging

// Logs a fatal error and aborts the program.
void FatalError(const std::string_view str);
[[noreturn]] void FatalError(const std::string_view str);

} // namespace xe

#if XE_OPTION_ENABLE_LOGGING

template <typename... Args>
void XELOGE(const char* format, const Args&... args) {
XE_COLD void XELOGE(const char* format, const Args&... args) {
  xe::logging::AppendLogLineFormat(xe::LogLevel::Error, '!', format, args...);
}

template <typename... Args>
XE_COLD
void XELOGW(const char* format, const Args&... args) {
  xe::logging::AppendLogLineFormat(xe::LogLevel::Warning, 'w', format, args...);
}

@@ -131,12 +140,12 @@ void XELOGCPU(const char* format, const Args&... args) {

template <typename... Args>
void XELOGAPU(const char* format, const Args&... args) {
  xe::logging::AppendLogLineFormat(xe::LogLevel::Info, 'A', format, args...);
  xe::logging::AppendLogLineFormat(xe::LogLevel::Debug, 'A', format, args...);
}

template <typename... Args>
void XELOGGPU(const char* format, const Args&... args) {
  xe::logging::AppendLogLineFormat(xe::LogLevel::Info, 'G', format, args...);
  xe::logging::AppendLogLineFormat(xe::LogLevel::Debug, 'G', format, args...);
}

template <typename... Args>

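The shape of the logging change above, in isolation: the level check stays inlined at every call site while the formatting and append work is outlined, so the common "level filtered out" case costs only a predictable branch. A minimal sketch with made-up names; the real header marks the wrapper XE_FORCEINLINE and the outlined body XE_NOINLINE XE_COLD:

#include <cstdio>

namespace sketch {
// Outlined slow path; reached rarely for end users.
static void AppendSlowPath(const char* msg) { std::fputs(msg, stderr); }

// Inlined hot path: cheap check, then a call into the outlined body.
inline void AppendIfEnabled(bool should_log, const char* msg) {
  if (!should_log) {
    return;
  }
  AppendSlowPath(msg);
}
}  // namespace sketch
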
@@ -376,6 +376,29 @@ template <int N>
int64_t m128_i64(const __m128& v) {
  return m128_i64<N>(_mm_castps_pd(v));
}
/*
  std::min/max float has handling for nans, where if either argument is nan the first argument is returned

  minss/maxss are different, if either argument is nan the second operand to the instruction is returned
  this is problematic because we have no assurances from the compiler on the argument ordering

  so only use in places where nan handling is not needed
*/
static float xe_minf(float x, float y) {
  return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y)));
}
static float xe_maxf(float x, float y) {
  return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y)));
}
static float xe_rcpf(float den) {
  return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den)));
}

#else
static float xe_minf(float x, float y) { return std::min<float>(x, y); }
static float xe_maxf(float x, float y) { return std::max<float>(x, y); }
static float xe_rcpf(float den) { return 1.0f / den; }
#endif

// Similar to the C++ implementation of XMConvertFloatToHalf and

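A quick demonstration of the NaN asymmetry the comment block above warns about: minss returns its second operand when either input is NaN, while std::min returns its first argument, so xe_minf and std::min disagree exactly when a NaN is involved. Standalone check, assumes an SSE host:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <xmmintrin.h>

int main() {
  float nan = std::nanf("");
  // minss(nan, 1.0f) -> second operand -> 1.0f
  float via_minss =
      _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(nan), _mm_set_ss(1.0f)));
  // std::min(nan, 1.0f) -> first argument -> nan
  float via_std = std::min(nan, 1.0f);
  std::printf("minss: %f  std::min: %f\n", via_minss, via_std);
  return 0;
}
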
@@ -467,6 +467,259 @@ constexpr inline fourcc_t make_fourcc(const std::string_view fourcc) {
  return make_fourcc(fourcc[0], fourcc[1], fourcc[2], fourcc[3]);
}

// chrispy::todo:use for command stream vector, resize happens a ton and has to
// call memset
template <size_t sz>
class FixedVMemVector {
  static_assert((sz & 65535) == 0,
                "Always give fixed_vmem_vector a size divisible by 65536 to "
                "avoid wasting memory on windows");

  uint8_t* data_;
  size_t nbytes_;

 public:
  FixedVMemVector()
      : data_((uint8_t*)memory::AllocFixed(
            nullptr, sz, memory::AllocationType::kReserveCommit,
            memory::PageAccess::kReadWrite)),
        nbytes_(0) {}
  ~FixedVMemVector() {
    if (data_) {
      memory::DeallocFixed(data_, sz, memory::DeallocationType::kRelease);
      data_ = nullptr;
    }
    nbytes_ = 0;
  }

  uint8_t* data() const { return data_; }
  size_t size() const { return nbytes_; }

  void resize(size_t newsize) {
    nbytes_ = newsize;
    xenia_assert(newsize < sz);
  }
  size_t alloc() const { return sz; }

  void clear() {
    resize(0);  // todo:maybe zero out
  }
  void reserve(size_t size) { xenia_assert(size < sz); }
};
// software prefetches/cache operations
namespace swcache {
/*
    warning, prefetchw's current behavior is not consistent across msvc and
    clang, for clang it will only compile to prefetchw if the set architecture
    supports it, for msvc however it will unconditionally compile to prefetchw!
    so prefetchw support is still in process

    only use these if you're absolutely certain you know what you're doing;
    you can easily tank performance through misuse CPUS have excellent automatic
    prefetchers that can predict patterns, but in situations where memory
    accesses are super unpredictable and follow no pattern you can make use of
    them

    another scenario where it can be handy is when crossing page boundaries,
    as many automatic prefetchers do not allow their streams to cross pages (no
    idea what this means for huge pages)

    I believe software prefetches do not kick off an automatic prefetcher
    stream, so you can't just prefetch one line of the data you're about to
    access and be fine, you need to go all the way

    prefetchnta is implementation dependent, and that makes its use a bit
    limited. For intel cpus, i believe it only prefetches the line into one way
    of the L3

    for amd cpus, it marks the line as requiring immediate eviction, the
    next time an entry is needed in the set it resides in it will be evicted. ms
    does dumb shit for memcpy, like looping over the contents of the source
    buffer and doing prefetchnta on them, likely evicting some of the data they
    just prefetched by the end of the buffer, and probably messing up data that
    was already in the cache

    another warning for these: this bypasses what i think is called
    "critical word load", the data will always become available starting from the
    very beginning of the line instead of from the piece that is needed

    L1I cache is not prefetchable, however likely all cpus can fulfill
    requests for the L1I from L2, so prefetchL2 on instructions should be fine

    todo: clwb, clflush
*/
#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1

XE_FORCEINLINE
static void PrefetchW(const void* addr) { __builtin_prefetch(addr, 1, 0); }
XE_FORCEINLINE
static void PrefetchNTA(const void* addr) { __builtin_prefetch(addr, 0, 0); }
XE_FORCEINLINE
static void PrefetchL3(const void* addr) { __builtin_prefetch(addr, 0, 1); }
XE_FORCEINLINE
static void PrefetchL2(const void* addr) { __builtin_prefetch(addr, 0, 2); }
XE_FORCEINLINE
static void PrefetchL1(const void* addr) { __builtin_prefetch(addr, 0, 3); }
#elif XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1
XE_FORCEINLINE
static void PrefetchW(const void* addr) { _m_prefetchw(addr); }

XE_FORCEINLINE
static void PrefetchNTA(const void* addr) {
  _mm_prefetch((const char*)addr, _MM_HINT_NTA);
}
XE_FORCEINLINE
static void PrefetchL3(const void* addr) {
  _mm_prefetch((const char*)addr, _MM_HINT_T2);
}
XE_FORCEINLINE
static void PrefetchL2(const void* addr) {
  _mm_prefetch((const char*)addr, _MM_HINT_T1);
}
XE_FORCEINLINE
static void PrefetchL1(const void* addr) {
  _mm_prefetch((const char*)addr, _MM_HINT_T0);
}

#else
XE_FORCEINLINE
static void PrefetchW(const void* addr) {}

XE_FORCEINLINE
static void PrefetchNTA(const void* addr) {}
XE_FORCEINLINE
static void PrefetchL3(const void* addr) {}
XE_FORCEINLINE
static void PrefetchL2(const void* addr) {}
XE_FORCEINLINE
static void PrefetchL1(const void* addr) {}

#endif

enum class PrefetchTag { Write, Nontemporal, Level3, Level2, Level1 };

template <PrefetchTag tag>
static void Prefetch(const void* addr) {
  static_assert(false, "Unknown tag");
}

template <>
static void Prefetch<PrefetchTag::Write>(const void* addr) {
  PrefetchW(addr);
}
template <>
static void Prefetch<PrefetchTag::Nontemporal>(const void* addr) {
  PrefetchNTA(addr);
}
template <>
static void Prefetch<PrefetchTag::Level3>(const void* addr) {
  PrefetchL3(addr);
}
template <>
static void Prefetch<PrefetchTag::Level2>(const void* addr) {
  PrefetchL2(addr);
}
template <>
static void Prefetch<PrefetchTag::Level1>(const void* addr) {
  PrefetchL1(addr);
}
// todo: does aarch64 have streaming stores/loads?

/*
    non-temporal stores/loads

    the stores allow cacheable memory to behave like write-combining memory.
    on the first nt store to a line, an intermediate buffer will be
    allocated by the cpu for stores that come after. once the entire contents of
    the line have been written the intermediate buffer will be transmitted to
    memory

    the written line will not be cached and if it is in the cache it will be
    invalidated from all levels of the hierarchy

    the cpu in this case does not have to read line from memory when we
    first write to it if it is not anywhere in the cache, so we use half the
    memory bandwidth using these stores

    non-temporal loads are... loads, but they dont use the cache. you need
    to manually insert memory barriers (_ReadWriteBarrier, ReadBarrier, etc, do
    not use any barriers that generate actual code) if on msvc to prevent it from
    moving the load of the data to just before the use of the data (immediately
    requiring the memory to be available = big stall)

*/

#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL == 0
#define XE_MSVC_REORDER_BARRIER _ReadWriteBarrier

#else
// if the compiler actually has pipelining for instructions we dont need a
// barrier
#define XE_MSVC_REORDER_BARRIER() static_cast<void>(0)
#endif
#if XE_ARCH_AMD64 == 1

XE_FORCEINLINE
static void WriteLineNT(void* destination, const void* source) {
  assert((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0);
  __m256i low = _mm256_loadu_si256((const __m256i*)source);
  __m256i high = _mm256_loadu_si256(&((const __m256i*)source)[1]);
  XE_MSVC_REORDER_BARRIER();
  _mm256_stream_si256((__m256i*)destination, low);
  _mm256_stream_si256(&((__m256i*)destination)[1], high);
}

XE_FORCEINLINE
static void ReadLineNT(void* destination, const void* source) {
  assert((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0);
  __m256i low = _mm256_stream_load_si256((const __m256i*)source);
  __m256i high = _mm256_stream_load_si256(&((const __m256i*)source)[1]);
  XE_MSVC_REORDER_BARRIER();
  _mm256_storeu_si256((__m256i*)destination, low);
  _mm256_storeu_si256(&((__m256i*)destination)[1], high);
}

XE_FORCEINLINE
static void WriteFence() { _mm_sfence(); }
XE_FORCEINLINE
static void ReadFence() { _mm_lfence(); }
XE_FORCEINLINE
static void ReadWriteFence() { _mm_mfence(); }
#else

XE_FORCEINLINE
static void WriteLineNT(void* destination, const void* source) {
  assert((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0);
  memcpy(destination, source, 64);
}

XE_FORCEINLINE
static void ReadLineNT(void* destination, const void* source) {
  assert((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0);
  memcpy(destination, source, 64);
}
XE_FORCEINLINE
static void WriteFence() {}
XE_FORCEINLINE
static void ReadFence() {}
XE_FORCEINLINE
static void ReadWriteFence() {}
#endif
} // namespace swcache
} // namespace xe

#endif // XENIA_BASE_MEMORY_H_

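How the swcache helpers above are intended to be combined, as a hedged sketch: stream a large 64-byte-aligned buffer with WriteLineNT so the destination never lands in the cache, then issue WriteFence before publishing a pointer or flag to another thread. Everything other than the swcache functions themselves is hypothetical:

#include <cstddef>
#include <cstdint>
#include "xenia/base/memory.h"

// byte_count must be a multiple of 64 and destination 64-byte aligned,
// matching the asserts inside WriteLineNT.
void StreamCopy(uint8_t* destination, const uint8_t* source,
                size_t byte_count) {
  for (size_t i = 0; i < byte_count; i += 64) {
    xe::swcache::WriteLineNT(destination + i, source + i);
  }
  // Non-temporal stores are weakly ordered; fence before making the data
  // visible to readers.
  xe::swcache::WriteFence();
}
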
@@ -8,11 +8,79 @@
 */

#include "xenia/base/mutex.h"
#if XE_PLATFORM_WIN32 == 1
#include "xenia/base/platform_win.h"
#endif

namespace xe {
#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX == 1
// default spincount for entercriticalsection is insane on windows, 0x20007D0i64
// (33556432 times!!) when a lock is highly contended performance degrades
// sharply on some processors todo: perhaps we should have a set of optional
// jobs that processors can do instead of spinning, for instance, sorting a list
// so we have better locality later or something
#define XE_CRIT_SPINCOUNT 128
/*
  chrispy: todo, if a thread exits before releasing the global mutex we need to
  check this and release the mutex one way to do this is by using FlsAlloc and
  PFLS_CALLBACK_FUNCTION, which gets called with the fiber local data when a
  thread exits
*/
thread_local unsigned global_mutex_depth = 0;
static CRITICAL_SECTION* global_critical_section(xe_global_mutex* mutex) {
  return reinterpret_cast<CRITICAL_SECTION*>(mutex);
}

xe_global_mutex::xe_global_mutex() {
  InitializeCriticalSectionEx(global_critical_section(this), XE_CRIT_SPINCOUNT,
                              CRITICAL_SECTION_NO_DEBUG_INFO);
}
xe_global_mutex ::~xe_global_mutex() {
  DeleteCriticalSection(global_critical_section(this));
}
void xe_global_mutex::lock() {
  if (global_mutex_depth) {
  } else {
    EnterCriticalSection(global_critical_section(this));
  }
  global_mutex_depth++;
}
void xe_global_mutex::unlock() {
  if (--global_mutex_depth == 0) {
    LeaveCriticalSection(global_critical_section(this));
  }
}
bool xe_global_mutex::try_lock() {
  if (global_mutex_depth) {
    ++global_mutex_depth;
    return true;
  } else {
    BOOL success = TryEnterCriticalSection(global_critical_section(this));
    if (success) {
      ++global_mutex_depth;
    }
    return success;
  }
}

CRITICAL_SECTION* fast_crit(xe_fast_mutex* mutex) {
  return reinterpret_cast<CRITICAL_SECTION*>(mutex);
}
xe_fast_mutex::xe_fast_mutex() {
  InitializeCriticalSectionEx(fast_crit(this), XE_CRIT_SPINCOUNT,
                              CRITICAL_SECTION_NO_DEBUG_INFO);
}
xe_fast_mutex::~xe_fast_mutex() { DeleteCriticalSection(fast_crit(this)); }

void xe_fast_mutex::lock() { EnterCriticalSection(fast_crit(this)); }
void xe_fast_mutex::unlock() { LeaveCriticalSection(fast_crit(this)); }
bool xe_fast_mutex::try_lock() {
  return TryEnterCriticalSection(fast_crit(this));
}
#endif
// chrispy: moved this out of body of function to eliminate the initialization
// guards
static std::recursive_mutex global_mutex;
std::recursive_mutex& global_critical_region::mutex() { return global_mutex; }
static global_mutex_type global_mutex;
global_mutex_type& global_critical_region::mutex() { return global_mutex; }

} // namespace xe

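What the thread_local global_mutex_depth counter above buys, shown as a sketch: re-locking the global mutex on the same thread only bumps the counter, so nested guards never re-enter the CRITICAL_SECTION and never self-deadlock. Hypothetical call sites:

#include <mutex>
#include "xenia/base/mutex.h"

static void Inner(xe::global_mutex_type& m) {
  // Same thread, depth 1 -> 2: no EnterCriticalSection call is made.
  std::lock_guard<xe::global_mutex_type> inner_lock(m);
}

static void Outer(xe::global_mutex_type& m) {
  // Depth 0 -> 1: this is the only acquisition that touches the OS object.
  std::lock_guard<xe::global_mutex_type> outer_lock(m);
  Inner(m);
}
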
@@ -9,11 +9,90 @@

#ifndef XENIA_BASE_MUTEX_H_
#define XENIA_BASE_MUTEX_H_

#include <mutex>
#include "platform.h"

#define XE_ENABLE_FAST_WIN32_MUTEX 1
namespace xe {

#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX == 1
/*
  must conform to
  BasicLockable:https://en.cppreference.com/w/cpp/named_req/BasicLockable as
  well as Lockable: https://en.cppreference.com/w/cpp/named_req/Lockable

  this emulates a recursive mutex, except with far less overhead
*/

class alignas(4096) xe_global_mutex {
  char detail[64];

 public:
  xe_global_mutex();
  ~xe_global_mutex();

  void lock();
  void unlock();
  bool try_lock();
};
using global_mutex_type = xe_global_mutex;

class alignas(64) xe_fast_mutex {
  char detail[64];

 public:
  xe_fast_mutex();
  ~xe_fast_mutex();

  void lock();
  void unlock();
  bool try_lock();
};
// a mutex that is extremely unlikely to ever be locked
// use for race conditions that have extremely remote odds of happening
class xe_unlikely_mutex {
  std::atomic<uint32_t> mut;
  bool _tryget() {
    uint32_t lock_expected = 0;
    return mut.compare_exchange_strong(lock_expected, 1);
  }

 public:
  xe_unlikely_mutex() : mut(0) {}
  ~xe_unlikely_mutex() { mut = 0; }

  void lock() {
    uint32_t lock_expected = 0;

    if (XE_LIKELY(_tryget())) {
      return;
    } else {
      do {
        // chrispy: warning, if no SMT, mm_pause does nothing...
#if XE_ARCH_AMD64 == 1
        _mm_pause();
#endif

      } while (!_tryget());
    }
  }
  void unlock() { mut.exchange(0); }
  bool try_lock() { return _tryget(); }
};
using xe_mutex = xe_fast_mutex;
#else
using global_mutex_type = std::recursive_mutex;
using xe_mutex = std::mutex;
using xe_unlikely_mutex = std::mutex;
#endif
struct null_mutex {
 public:
  static void lock() {}
  static void unlock() {}
  static bool try_lock() { return true; }
};

using global_unique_lock_type = std::unique_lock<global_mutex_type>;
// The global critical region mutex singleton.
// This must guard any operation that may suspend threads or be sensitive to
// being suspended such as global table locks and such.

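Because xe_unlikely_mutex implements lock/unlock/try_lock it satisfies Lockable and drops into the standard guards unchanged; contention just spins on the single atomic word, which is why it is only meant for paths that are almost never contended. A small hypothetical usage sketch:

#include <cstdint>
#include <mutex>
#include "xenia/base/mutex.h"

static xe::xe_unlikely_mutex rarely_contended_lock;
static uint64_t rarely_touched_state = 0;

void TouchRareState(uint64_t value) {
  std::lock_guard<xe::xe_unlikely_mutex> guard(rarely_contended_lock);
  rarely_touched_state = value;
}
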
@@ -54,30 +133,30 @@ namespace xe {
// };
class global_critical_region {
 public:
  static std::recursive_mutex& mutex();
  static global_mutex_type& mutex();

  // Acquires a lock on the global critical section.
  // Use this when keeping an instance is not possible. Otherwise, prefer
  // to keep an instance of global_critical_region near the members requiring
  // it to keep things readable.
  static std::unique_lock<std::recursive_mutex> AcquireDirect() {
    return std::unique_lock<std::recursive_mutex>(mutex());
  static global_unique_lock_type AcquireDirect() {
    return global_unique_lock_type(mutex());
  }

  // Acquires a lock on the global critical section.
  inline std::unique_lock<std::recursive_mutex> Acquire() {
    return std::unique_lock<std::recursive_mutex>(mutex());
  inline global_unique_lock_type Acquire() {
    return global_unique_lock_type(mutex());
  }

  // Acquires a deferred lock on the global critical section.
  inline std::unique_lock<std::recursive_mutex> AcquireDeferred() {
    return std::unique_lock<std::recursive_mutex>(mutex(), std::defer_lock);
  inline global_unique_lock_type AcquireDeferred() {
    return global_unique_lock_type(mutex(), std::defer_lock);
  }

  // Tries to acquire a lock on the glboal critical section.
  // Check owns_lock() to see if the lock was successfully acquired.
  inline std::unique_lock<std::recursive_mutex> TryAcquire() {
    return std::unique_lock<std::recursive_mutex>(mutex(), std::try_to_lock);
  inline global_unique_lock_type TryAcquire() {
    return global_unique_lock_type(mutex(), std::try_to_lock);
  }
};

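Call sites of global_critical_region are unaffected by the type swap above, since Acquire() now returns global_unique_lock_type instead of std::unique_lock<std::recursive_mutex> but is used the same way. A representative, hypothetical member-usage sketch:

#include <cstdint>
#include "xenia/base/mutex.h"

class ThreadTable {
 public:
  void Register(uint32_t thread_id) {
    auto global_lock = global_critical_region_.Acquire();
    last_registered_thread_id_ = thread_id;
    // ... mutate any state that suspend-sensitive code also touches ...
  }

 private:
  xe::global_critical_region global_critical_region_;
  uint32_t last_registered_thread_id_ = 0;
};
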
@@ -122,6 +122,7 @@
#define XE_COLD __attribute__((cold))
#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)

#else
#define XE_FORCEINLINE inline
#define XE_NOINLINE

@@ -129,6 +130,24 @@
#define XE_LIKELY(...) (!!(__VA_ARGS__))
#define XE_UNLIKELY(...) (!!(__VA_ARGS__))
#endif
// only use __restrict if MSVC, for clang/gcc we can use -fstrict-aliasing which
// acts as __restrict across the board todo: __restrict is part of the type
// system, we might actually have to still emit it on clang and gcc
#if XE_COMPILER_CLANG_CL == 0 && XE_COMPILER_MSVC == 1

#define XE_RESTRICT __restrict
#else
#define XE_RESTRICT
#endif

#if XE_ARCH_AMD64 == 1
#define XE_HOST_CACHE_LINE_SIZE 64
#elif XE_ARCH_ARM64 == 1
#define XE_HOST_CACHE_LINE_SIZE 64
#else

#error unknown cache line size for unknown architecture!
#endif

namespace xe {

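Where XE_RESTRICT is meant to be applied, per the comment above (it expands to __restrict only under MSVC): on pointer parameters the author knows never alias, so the optimizer can vectorize without runtime overlap checks. Hypothetical helper, not from the tree:

#include <cstddef>
#include "xenia/base/platform.h"

void AddArrays(float* XE_RESTRICT dst, const float* XE_RESTRICT a,
               const float* XE_RESTRICT b, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    dst[i] = a[i] + b[i];
  }
}
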
@@ -34,31 +34,48 @@
#undef DeleteFile
#undef GetFirstChild

#define XE_USE_NTDLL_FUNCTIONS 1
#if XE_USE_NTDLL_FUNCTIONS==1
#define XE_USE_NTDLL_FUNCTIONS 1
//chrispy: disabling this for now, more research needs to be done imo, although it does work very well on my machine
//
#define XE_USE_KUSER_SHARED 0
#if XE_USE_NTDLL_FUNCTIONS == 1
/*
  ntdll versions of functions often skip through a lot of extra garbage in KernelBase
  ntdll versions of functions often skip through a lot of extra garbage in
  KernelBase
*/
#define XE_NTDLL_IMPORT(name, cls, clsvar) \
  static class cls { \
   public: \
    FARPROC fn;\
    cls() : fn(nullptr) {\
      auto ntdll = GetModuleHandleA("ntdll.dll");\
      if (ntdll) { \
        fn = GetProcAddress(ntdll, #name );\
      }\
    } \
    template <typename TRet = void, typename... TArgs> \
    inline TRet invoke(TArgs... args) {\
      return reinterpret_cast<NTSYSAPI TRet(NTAPI*)(TArgs...)>(fn)(args...);\
    }\
    inline operator bool() const {\
      return fn!=nullptr;\
    }\
#define XE_NTDLL_IMPORT(name, cls, clsvar) \
  static class cls { \
   public: \
    FARPROC fn; \
    cls() : fn(nullptr) { \
      auto ntdll = GetModuleHandleA("ntdll.dll"); \
      if (ntdll) { \
        fn = GetProcAddress(ntdll, #name); \
      } \
    } \
    template <typename TRet = void, typename... TArgs> \
    inline TRet invoke(TArgs... args) { \
      return reinterpret_cast<NTSYSAPI TRet(NTAPI*)(TArgs...)>(fn)(args...); \
    } \
    inline operator bool() const { return fn != nullptr; } \
  } clsvar
#else
#define XE_NTDLL_IMPORT(name, cls, clsvar) static constexpr bool clsvar = false

#endif
#if XE_USE_KUSER_SHARED==1
// KUSER_SHARED
struct __declspec(align(4)) _KSYSTEM_TIME {
  unsigned int LowPart;
  int High1Time;
  int High2Time;
};

static constexpr size_t KSUER_SHARED_SYSTEMTIME_OFFSET = 0x14;
static unsigned char* KUserShared() { return (unsigned char*)0x7FFE0000ULL; }
static volatile _KSYSTEM_TIME* GetKUserSharedSystemTime() {
  return reinterpret_cast<volatile _KSYSTEM_TIME*>(
      KUserShared() + KSUER_SHARED_SYSTEMTIME_OFFSET);
}
#endif
#endif // XENIA_BASE_PLATFORM_WIN_H_

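A hypothetical use of the XE_NTDLL_IMPORT helper above: bind one ntdll export at static-initialization time and call through it, falling back when the symbol is missing. NtYieldExecution is a real ntdll export taking no arguments; whether xenia binds this particular function is not implied here.

#include "xenia/base/platform_win.h"

XE_NTDLL_IMPORT(NtYieldExecution, cls_NtYieldExecution, NtYieldExecutionPointer);

void YieldViaNtdllIfAvailable() {
  if (NtYieldExecutionPointer) {             // operator bool(): symbol resolved
    NtYieldExecutionPointer.invoke<long>();  // NTSTATUS returned as a long
  } else {
    SwitchToThread();                        // ordinary Win32 fallback
  }
}
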
@@ -8,46 +8,52 @@
 */

#include "xenia/base/ring_buffer.h"

#include <algorithm>
#include <cstring>

namespace xe {

RingBuffer::RingBuffer(uint8_t* buffer, size_t capacity)
    : buffer_(buffer), capacity_(capacity) {}
    : buffer_(buffer),
      capacity_(static_cast<ring_size_t>(capacity)),
      read_offset_(0),
      write_offset_(0) {}

void RingBuffer::AdvanceRead(size_t count) {
void RingBuffer::AdvanceRead(size_t _count) {
  ring_size_t count = static_cast<ring_size_t>(_count);
  if (read_offset_ + count < capacity_) {
    read_offset_ += count;
  } else {
    size_t left_half = capacity_ - read_offset_;
    size_t right_half = count - left_half;
    ring_size_t left_half = capacity_ - read_offset_;
    ring_size_t right_half = count - left_half;
    read_offset_ = right_half;
  }
}

void RingBuffer::AdvanceWrite(size_t count) {
void RingBuffer::AdvanceWrite(size_t _count) {
  ring_size_t count = static_cast<ring_size_t>(_count);

  if (write_offset_ + count < capacity_) {
    write_offset_ += count;
  } else {
    size_t left_half = capacity_ - write_offset_;
    size_t right_half = count - left_half;
    ring_size_t left_half = capacity_ - write_offset_;
    ring_size_t right_half = count - left_half;
    write_offset_ = right_half;
  }
}

RingBuffer::ReadRange RingBuffer::BeginRead(size_t count) {
  count = std::min(count, capacity_);
RingBuffer::ReadRange RingBuffer::BeginRead(size_t _count) {
  ring_size_t count =
      std::min<ring_size_t>(static_cast<ring_size_t>(_count), capacity_);
  if (!count) {
    return {0};
  }
  if (read_offset_ + count < capacity_) {
    return {buffer_ + read_offset_, count, nullptr, 0};
    return {buffer_ + read_offset_, nullptr, count, 0};
  } else {
    size_t left_half = capacity_ - read_offset_;
    size_t right_half = count - left_half;
    return {buffer_ + read_offset_, left_half, buffer_, right_half};
    ring_size_t left_half = capacity_ - read_offset_;
    ring_size_t right_half = count - left_half;
    return {buffer_ + read_offset_, buffer_, left_half, right_half};
  }
}

@@ -59,7 +65,8 @@ void RingBuffer::EndRead(ReadRange read_range) {
  }
}

size_t RingBuffer::Read(uint8_t* buffer, size_t count) {
size_t RingBuffer::Read(uint8_t* buffer, size_t _count) {
  ring_size_t count = static_cast<ring_size_t>(_count);
  count = std::min(count, capacity_);
  if (!count) {
    return 0;

@@ -69,7 +76,7 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t count) {
  if (read_offset_ < write_offset_) {
    assert_true(read_offset_ + count <= write_offset_);
  } else if (read_offset_ + count >= capacity_) {
    size_t left_half = capacity_ - read_offset_;
    ring_size_t left_half = capacity_ - read_offset_;
    assert_true(count - left_half <= write_offset_);
  }

@@ -77,8 +84,8 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t count) {
    std::memcpy(buffer, buffer_ + read_offset_, count);
    read_offset_ += count;
  } else {
    size_t left_half = capacity_ - read_offset_;
    size_t right_half = count - left_half;
    ring_size_t left_half = capacity_ - read_offset_;
    ring_size_t right_half = count - left_half;
    std::memcpy(buffer, buffer_ + read_offset_, left_half);
    std::memcpy(buffer + left_half, buffer_, right_half);
    read_offset_ = right_half;

@@ -87,7 +94,8 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t count) {
  return count;
}

size_t RingBuffer::Write(const uint8_t* buffer, size_t count) {
size_t RingBuffer::Write(const uint8_t* buffer, size_t _count) {
  ring_size_t count = static_cast<ring_size_t>(_count);
  count = std::min(count, capacity_);
  if (!count) {
    return 0;

@@ -105,8 +113,8 @@ size_t RingBuffer::Write(const uint8_t* buffer, size_t count) {
    std::memcpy(buffer_ + write_offset_, buffer, count);
    write_offset_ += count;
  } else {
    size_t left_half = capacity_ - write_offset_;
    size_t right_half = count - left_half;
    ring_size_t left_half = capacity_ - write_offset_;
    ring_size_t right_half = count - left_half;
    std::memcpy(buffer_ + write_offset_, buffer, left_half);
    std::memcpy(buffer_, buffer + left_half, right_half);
    write_offset_ = right_half;

@@ -17,6 +17,8 @@

#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"

namespace xe {
/*

@@ -39,18 +41,24 @@ namespace xe {
  that the registers no longer need the rex prefix, shrinking the generated
  code a bit.. like i said, every bit helps in this class
*/
using ring_size_t = uint32_t;
class RingBuffer {
 public:
  RingBuffer(uint8_t* buffer, size_t capacity);

  uint8_t* buffer() const { return buffer_; }
  size_t capacity() const { return capacity_; }
  ring_size_t capacity() const { return capacity_; }
  bool empty() const { return read_offset_ == write_offset_; }

  size_t read_offset() const { return read_offset_; }
  uintptr_t read_ptr() const { return uintptr_t(buffer_) + read_offset_; }
  ring_size_t read_offset() const { return read_offset_; }
  uintptr_t read_ptr() const {
    return uintptr_t(buffer_) + static_cast<uintptr_t>(read_offset_);
  }

  // todo: offset/ capacity_ is probably always 1 when its over, just check and
  // subtract instead
  void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; }
  size_t read_count() const {
  ring_size_t read_count() const {
// chrispy: these branches are unpredictable
#if 0
    if (read_offset_ == write_offset_) {

@@ -61,14 +69,14 @@ class RingBuffer {
      return (capacity_ - read_offset_) + write_offset_;
    }
#else
    size_t read_offs = read_offset_;
    size_t write_offs = write_offset_;
    size_t cap = capacity_;
    ring_size_t read_offs = read_offset_;
    ring_size_t write_offs = write_offset_;
    ring_size_t cap = capacity_;

    size_t offset_delta = write_offs - read_offs;
    size_t wrap_read_count = (cap - read_offs) + write_offs;
    ring_size_t offset_delta = write_offs - read_offs;
    ring_size_t wrap_read_count = (cap - read_offs) + write_offs;

    size_t comparison_value = read_offs <= write_offs;
    ring_size_t comparison_value = read_offs <= write_offs;
#if 0
    size_t selector =
        static_cast<size_t>(-static_cast<ptrdiff_t>(comparison_value));

@@ -89,10 +97,12 @@ class RingBuffer {
#endif
  }

  size_t write_offset() const { return write_offset_; }
  ring_size_t write_offset() const { return write_offset_; }
  uintptr_t write_ptr() const { return uintptr_t(buffer_) + write_offset_; }
  void set_write_offset(size_t offset) { write_offset_ = offset % capacity_; }
  size_t write_count() const {
  void set_write_offset(size_t offset) {
    write_offset_ = static_cast<ring_size_t>(offset) % capacity_;
  }
  ring_size_t write_count() const {
    if (read_offset_ == write_offset_) {
      return capacity_;
    } else if (write_offset_ < read_offset_) {

@@ -107,13 +117,35 @@ class RingBuffer {

  struct ReadRange {
    const uint8_t* first;
    size_t first_length;

    const uint8_t* second;
    size_t second_length;
    ring_size_t first_length;
    ring_size_t second_length;
  };
  ReadRange BeginRead(size_t count);
  void EndRead(ReadRange read_range);

  /*
    BeginRead, but if there is a second Range it will prefetch all lines of it

    this does not prefetch the first range, because software prefetching can do that faster than we can
  */
  template <swcache::PrefetchTag tag>
  XE_FORCEINLINE ReadRange BeginPrefetchedRead(size_t count) {
    ReadRange range = BeginRead(count);

    if (range.second) {
      ring_size_t numlines =
          xe::align<ring_size_t>(range.second_length, XE_HOST_CACHE_LINE_SIZE) /
          XE_HOST_CACHE_LINE_SIZE;
      //chrispy: maybe unroll?
      for (ring_size_t i = 0; i < numlines; ++i) {
        swcache::Prefetch<tag>(range.second + (i * XE_HOST_CACHE_LINE_SIZE));
      }
    }
    return range;
  }

  size_t Read(uint8_t* buffer, size_t count);
  template <typename T>
  size_t Read(T* buffer, size_t count) {

@@ -156,29 +188,29 @@ class RingBuffer {

 private:
  uint8_t* buffer_ = nullptr;
  size_t capacity_ = 0;
  size_t read_offset_ = 0;
  size_t write_offset_ = 0;
  ring_size_t capacity_ = 0;
  ring_size_t read_offset_ = 0;
  ring_size_t write_offset_ = 0;
};

template <>
inline uint32_t RingBuffer::ReadAndSwap<uint32_t>() {
  size_t read_offset = this->read_offset_;
  ring_size_t read_offset = this->read_offset_;
  xenia_assert(this->capacity_ >= 4);

  size_t next_read_offset = read_offset + 4;
#if 0
  ring_size_t next_read_offset = read_offset + 4;
#if 0
  size_t zerotest = next_read_offset - this->capacity_;
  // unpredictable branch, use bit arith instead
  // todo: it would be faster to use lzcnt, but we need to figure out if all
  // machines we support support it
  next_read_offset &= -static_cast<ptrdiff_t>(!!zerotest);
#else
#else
  if (XE_UNLIKELY(next_read_offset == this->capacity_)) {
    next_read_offset = 0;
    //todo: maybe prefetch next? or should that happen much earlier?
    // todo: maybe prefetch next? or should that happen much earlier?
  }
#endif
#endif
  this->read_offset_ = next_read_offset;
  unsigned int ring_value = *(uint32_t*)&this->buffer_[read_offset];
  return xe::byte_swap(ring_value);

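Consuming a possibly wrapped range with the reordered ReadRange layout above (both pointers first, then both lengths). A sketch, not taken from the tree; the caller is assumed to have checked read_count() beforehand:

#include <cstring>
#include "xenia/base/ring_buffer.h"

void CopyOut(xe::RingBuffer& rb, uint8_t* dst, size_t n) {
  auto range = rb.BeginRead(n);
  std::memcpy(dst, range.first, range.first_length);
  if (range.second) {
    // Wrapped: the tail continues at the start of the backing buffer.
    std::memcpy(dst + range.first_length, range.second, range.second_length);
  }
  rb.EndRead(range);  // advances the read offset past both spans
}
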
@@ -148,6 +148,7 @@ bool SetTlsValue(TlsHandle handle, uintptr_t value);
// be kept short or else all timers will be impacted. This is a simplified
// wrapper around QueueTimerRecurring which automatically cancels the timer on
// destruction.
//only used by XboxkrnlModule::XboxkrnlModule
class HighResolutionTimer {
  HighResolutionTimer(std::chrono::milliseconds interval,
                      std::function<void()> callback) {

@@ -36,7 +36,7 @@ using WaitItem = TimerQueueWaitItem;
  edit: actually had to change it back, when i was testing it only worked because i fixed disruptorplus' code to compile (it gives wrong args to condition_variable::wait_until) but now builds

*/
using WaitStrat = dp::spin_wait_strategy; //dp::blocking_wait_strategy;
using WaitStrat = dp::blocking_wait_strategy;

class TimerQueue {
 public:

@@ -205,7 +205,7 @@ void TimerQueueWaitItem::Disarm() {
    spinner.spin_once();
  }
}

//unused
std::weak_ptr<WaitItem> QueueTimerOnce(std::function<void(void*)> callback,
                                       void* userdata,
                                       WaitItem::clock::time_point due) {

@@ -213,7 +213,7 @@ std::weak_ptr<WaitItem> QueueTimerOnce(std::function<void(void*)> callback,
      std::make_shared<WaitItem>(std::move(callback), userdata, &timer_queue_,
                                 due, WaitItem::clock::duration::zero()));
}

// only used by HighResolutionTimer
std::weak_ptr<WaitItem> QueueTimerRecurring(
    std::function<void(void*)> callback, void* userdata,
    WaitItem::clock::time_point due, WaitItem::clock::duration interval) {

@@ -10,7 +10,7 @@
#include "xenia/cpu/backend/x64/x64_backend.h"

#include <stddef.h>

#include <algorithm>
#include "third_party/capstone/include/capstone/capstone.h"
#include "third_party/capstone/include/capstone/x86.h"

@@ -50,6 +50,9 @@ DEFINE_bool(record_mmio_access_exceptions, true,
            "for them. This info can then be used on a subsequent run to "
            "instruct the recompiler to emit checks",
            "CPU");
#if XE_X64_PROFILER_AVAILABLE == 1
DECLARE_bool(instrument_call_times);
#endif

namespace xe {
namespace cpu {

@@ -96,6 +99,68 @@ static void ForwardMMIOAccessForRecording(void* context, void* hostaddr) {
  reinterpret_cast<X64Backend*>(context)
      ->RecordMMIOExceptionForGuestInstruction(hostaddr);
}
#if XE_X64_PROFILER_AVAILABLE == 1
// todo: better way of passing to atexit. maybe do in destructor instead?
// nope, destructor is never called
static GuestProfilerData* backend_profiler_data = nullptr;

static uint64_t nanosecond_lifetime_start = 0;
static void WriteGuestProfilerData() {
  if (cvars::instrument_call_times) {
    uint64_t end = Clock::QueryHostSystemTime();

    uint64_t total = end - nanosecond_lifetime_start;

    double totaltime_divisor = static_cast<double>(total);

    FILE* output_file = nullptr;
    std::vector<std::pair<uint32_t, uint64_t>> unsorted_profile{};
    for (auto&& entry : *backend_profiler_data) {
      if (entry.second) {  // skip times of 0
        unsorted_profile.emplace_back(entry.first, entry.second);
      }
    }

    std::sort(unsorted_profile.begin(), unsorted_profile.end(),
              [](auto& x, auto& y) { return x.second < y.second; });

    fopen_s(&output_file, "profile_times.txt", "w");
    FILE* idapy_file = nullptr;
    fopen_s(&idapy_file, "profile_print_times.py", "w");

    for (auto&& sorted_entry : unsorted_profile) {
      // double time_in_seconds =
      //     static_cast<double>(sorted_entry.second) / 10000000.0;
      double time_in_milliseconds =
          static_cast<double>(sorted_entry.second) / (10000000.0 / 1000.0);

      double slice = static_cast<double>(sorted_entry.second) /
                     static_cast<double>(totaltime_divisor);

      fprintf(output_file,
              "%X took %.20f milliseconds, totaltime slice percentage %.20f \n",
              sorted_entry.first, time_in_milliseconds, slice);

      fprintf(idapy_file,
              "print(get_name(0x%X) + ' took %.20f ms, %.20f percent')\n",
              sorted_entry.first, time_in_milliseconds, slice);
    }

    fclose(output_file);
    fclose(idapy_file);
  }
}

static void GuestProfilerUpdateThreadProc() {
  nanosecond_lifetime_start = Clock::QueryHostSystemTime();

  do {
    xe::threading::Sleep(std::chrono::seconds(30));
    WriteGuestProfilerData();
  } while (true);
}
static std::unique_ptr<xe::threading::Thread> g_profiler_update_thread{};
#endif

bool X64Backend::Initialize(Processor* processor) {
  if (!Backend::Initialize(processor)) {

@@ -159,6 +224,21 @@ bool X64Backend::Initialize(Processor* processor) {

  processor->memory()->SetMMIOExceptionRecordingCallback(
      ForwardMMIOAccessForRecording, (void*)this);

#if XE_X64_PROFILER_AVAILABLE == 1
  if (cvars::instrument_call_times) {
    backend_profiler_data = &profiler_data_;
    xe::threading::Thread::CreationParameters slimparams;

    slimparams.create_suspended = false;
    slimparams.initial_priority = xe::threading::ThreadPriority::kLowest;
    slimparams.stack_size = 65536 * 4;

    g_profiler_update_thread = std::move(xe::threading::Thread::Create(
        slimparams, GuestProfilerUpdateThreadProc));
  }
#endif

  return true;
}

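A unit note on the divisor in WriteGuestProfilerData above: the profiler accumulates KUSER_SHARED/FILETIME ticks, which are 100 ns units, so 10,000 ticks equal one millisecond and (10000000.0 / 1000.0) is just that constant written out.

#include <cstdint>

double FiletimeTicksToMilliseconds(uint64_t ticks_100ns) {
  return static_cast<double>(ticks_100ns) / 10000.0;  // == 1e7 / 1e3
}
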
@@ -734,6 +814,7 @@ void X64Backend::InitializeBackendContext(void* ctx) {
  bctx->flags = 0;
  // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
  bctx->Ox1000 = 0x1000;
  bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
}
const uint32_t mxcsr_table[8] = {
    0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80,

@@ -747,6 +828,23 @@ void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) {
  bctx->mxcsr_fpu = mxcsr_table[control];
  ((ppc::PPCContext*)ctx)->fpscr.bits.rn = control;
}

#if XE_X64_PROFILER_AVAILABLE == 1
uint64_t* X64Backend::GetProfilerRecordForFunction(uint32_t guest_address) {
  // who knows, we might want to compile different versions of a function one
  // day
  auto entry = profiler_data_.find(guest_address);

  if (entry != profiler_data_.end()) {
    return &entry->second;
  } else {
    profiler_data_[guest_address] = 0;

    return &profiler_data_[guest_address];
  }
}

#endif
} // namespace x64
} // namespace backend
} // namespace cpu

@@ -15,6 +15,14 @@
#include "xenia/base/cvar.h"
#include "xenia/cpu/backend/backend.h"

#if XE_PLATFORM_WIN32 == 1
// we use KUSER_SHARED's systemtime field, which is at a fixed address and
// obviously windows specific, to get the start/end time for a function using
// rdtsc would be too slow and skew the results by consuming extra cpu time, so
// we have lower time precision but better overall accuracy
#define XE_X64_PROFILER_AVAILABLE 1
#endif

DECLARE_int32(x64_extension_mask);

namespace xe {

@@ -24,6 +32,8 @@ namespace xe {
namespace cpu {
namespace backend {
namespace x64 {
// mapping of guest function addresses to total nanoseconds taken in the func
using GuestProfilerData = std::map<uint32_t, uint64_t>;

class X64CodeCache;

@@ -37,8 +47,10 @@ typedef void (*ResolveFunctionThunk)();
// negatively index the membase reg)
struct X64BackendContext {
  void* ResolveFunction_Ptr;  // cached pointer to resolvefunction
  unsigned int mxcsr_fpu;  // currently, the way we implement rounding mode
                           // affects both vmx and the fpu
  uint64_t* guest_tick_count;

  unsigned int mxcsr_fpu;  // currently, the way we implement rounding mode
                           // affects both vmx and the fpu
  unsigned int mxcsr_vmx;
  unsigned int flags;   // bit 0 = 0 if mxcsr is fpu, else it is vmx
  unsigned int Ox1000;  // constant 0x1000 so we can shrink each tail emitted

@@ -93,7 +105,9 @@ class X64Backend : public Backend {
  virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;

  void RecordMMIOExceptionForGuestInstruction(void* host_address);

#if XE_X64_PROFILER_AVAILABLE == 1
  uint64_t* GetProfilerRecordForFunction(uint32_t guest_address);
#endif
 private:
  static bool ExceptionCallbackThunk(Exception* ex, void* data);
  bool ExceptionCallback(Exception* ex);

@@ -106,6 +120,10 @@ class X64Backend : public Backend {
  HostToGuestThunk host_to_guest_thunk_;
  GuestToHostThunk guest_to_host_thunk_;
  ResolveFunctionThunk resolve_function_thunk_;

#if XE_X64_PROFILER_AVAILABLE == 1
  GuestProfilerData profiler_data_;
#endif
};

} // namespace x64

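What GuestProfilerData holds after a run, in isolation: a guest function address mapped to its accumulated 100 ns ticks. A hypothetical dump helper along the lines of the text/Python emitters in x64_backend.cc above:

#include <cstdint>
#include <cstdio>
#include <map>

using GuestProfilerData = std::map<uint32_t, uint64_t>;

void DumpProfile(const GuestProfilerData& data) {
  for (const auto& [guest_address, ticks] : data) {
    std::printf("%08X: %.3f ms\n", guest_address, ticks / 10000.0);
  }
}
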
@@ -57,6 +57,12 @@ DEFINE_bool(enable_incorrect_roundingmode_behavior, false,
            "code. The workaround may cause reduced CPU performance but is a "
            "more accurate emulation",
            "x64");

#if XE_X64_PROFILER_AVAILABLE == 1
DEFINE_bool(instrument_call_times, false,
            "Compute time taken for functions, for profiling guest code",
            "x64");
#endif
namespace xe {
namespace cpu {
namespace backend {

@@ -120,28 +126,37 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
  */
  unsigned int data[4];
  Xbyak::util::Cpu::getCpuid(0x80000001, data);
  if (data[2] & (1U << 5)) {
  unsigned amd_flags = data[2];
  if (amd_flags & (1U << 5)) {
    if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
      feature_flags_ |= kX64EmitLZCNT;
    }
  }
  // todo: although not reported by cpuid, zen 1 and zen+ also have fma4
  if (amd_flags & (1U << 16)) {
    if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
      feature_flags_ |= kX64EmitFMA4;
    }
  }
  if (amd_flags & (1U << 21)) {
    if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
      feature_flags_ |= kX64EmitTBM;
    }
  }
  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
    bool is_zennish = cpu_.displayFamily >= 0x17;

    /*
      chrispy: according to agner's tables, all amd architectures that
      we support (ones with avx) have the same timings for
      jrcxz/loop/loope/loopne as for other jmps
    */
    feature_flags_ |= kX64FastJrcx;
    feature_flags_ |= kX64FastLoop;
    if (is_zennish) {
      // ik that i heard somewhere that this is the case for zen, but i need to
      // verify. cant find my original source for that.
      // todo: ask agner?
      feature_flags_ |= kX64FlagsIndependentVars;
      feature_flags_ |= kX64FastJrcx;

      if (cpu_.displayFamily > 0x17) {
        feature_flags_ |= kX64FastLoop;

      } else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) {
        feature_flags_ |= kX64FastLoop;
      }  // todo:figure out at model zen+ became zen2, this is just the model
         // for my cpu, which is ripper90
    }
  }
  may_use_membase32_as_zero_reg_ =

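The bit positions tested in the hunk above, spelled out: CPUID leaf 0x80000001 ECX bit 5 is ABM (LZCNT), bit 16 is FMA4 and bit 21 is TBM on AMD parts. A standalone sketch using MSVC's __cpuid instead of Xbyak's wrapper:

#include <intrin.h>

struct AmdExtFeatures {
  bool lzcnt;
  bool fma4;
  bool tbm;
};

AmdExtFeatures QueryAmdExtFeatures() {
  int regs[4];  // eax, ebx, ecx, edx
  __cpuid(regs, 0x80000001);
  unsigned ecx = static_cast<unsigned>(regs[2]);
  return {(ecx & (1u << 5)) != 0, (ecx & (1u << 16)) != 0,
          (ecx & (1u << 21)) != 0};
}
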
@@ -157,6 +172,7 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder,
                      std::vector<SourceMapEntry>* out_source_map) {
  SCOPE_profile_cpu_f("cpu");
  guest_module_ = dynamic_cast<XexModule*>(function->module());
  current_guest_function_ = function->address();
  // Reset.
  debug_info_ = debug_info;
  debug_info_flags_ = debug_info_flags;

@ -286,10 +302,19 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
|
|||
* chrispy: removed this, it serves no purpose
|
||||
mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg());
|
||||
*/
|
||||
|
||||
mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx);
|
||||
|
||||
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax); // 0
|
||||
|
||||
#if XE_X64_PROFILER_AVAILABLE == 1
|
||||
if (cvars::instrument_call_times) {
|
||||
mov(rdx, 0x7ffe0014); // load pointer to kusershared systemtime
|
||||
mov(rdx, qword[rdx]);
|
||||
mov(qword[rsp + StackLayout::GUEST_PROFILER_START],
|
||||
rdx); // save time for end of function
|
||||
}
|
||||
#endif
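As a side note, the two mov instructions emitted above read the qword at 0x7FFE0014. Assuming the usual Windows KUSER_SHARED_DATA layout (fixed read-only mapping at 0x7FFE0000 with SystemTime at offset 0x14, in 100ns units), the host-side equivalent is roughly this Windows-only, illustrative sketch.

#include <cstdint>
#include <cstdio>

int main() {
  // Reading the qword at 0x7FFE0014 mirrors the two movs the emitter
  // generates for the profiler prologue.
  const auto* system_time =
      reinterpret_cast<const volatile uint64_t*>(uintptr_t(0x7FFE0014));
  uint64_t start = *system_time;
  // ... the guest function body would run here ...
  uint64_t elapsed = *system_time - start;
  std::printf("elapsed: %llu x 100ns\n",
              static_cast<unsigned long long>(elapsed));
  return 0;
}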
|
||||
// Safe now to do some tracing.
|
||||
if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctions) {
|
||||
// We require 32-bit addresses.
|
||||
|
@ -363,6 +388,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
|
|||
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);
|
||||
*/
|
||||
code_offsets.epilog = getSize();
|
||||
EmitProfilerEpilogue();
|
||||
|
||||
add(rsp, (uint32_t)stack_size);
|
||||
ret();
|
||||
|
@ -391,6 +417,27 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
|
|||
|
||||
return true;
|
||||
}
|
||||
// don't use rax; we do this in tail call handling
|
||||
void X64Emitter::EmitProfilerEpilogue() {
|
||||
#if XE_X64_PROFILER_AVAILABLE == 1
|
||||
if (cvars::instrument_call_times) {
|
||||
uint64_t* profiler_entry =
|
||||
backend()->GetProfilerRecordForFunction(current_guest_function_);
|
||||
mov(ecx, 0x7ffe0014);
|
||||
mov(rdx, qword[rcx]);
|
||||
mov(rbx, (uintptr_t)profiler_entry);
|
||||
sub(rdx, qword[rsp + StackLayout::GUEST_PROFILER_START]);
|
||||
|
||||
// atomic add our time to the profiler entry
|
||||
// This could be atomic-free if we had per-thread profile counts and, on a
// thread's exit, locked and summed them into the global counts, which would
// make this a few cycles less intrusive; it's good enough for now.
// For now, just try it without atomics.
|
||||
// lock();
|
||||
add(qword[rbx], rdx);
|
||||
}
|
||||
#endif
|
||||
}
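A sketch of the atomic-free alternative described in the comment above: per-thread counters that are folded into a global table under a lock only when the thread exits. The names (kMaxFunctions, global_profile, ThreadProfile) are hypothetical and only illustrate the idea.

#include <cstddef>
#include <cstdint>
#include <mutex>
#include <vector>

constexpr size_t kMaxFunctions = 1 << 20;  // arbitrary bound for the sketch
static std::vector<uint64_t> global_profile(kMaxFunctions, 0);
static std::mutex global_profile_mutex;

struct ThreadProfile {
  std::vector<uint64_t> local = std::vector<uint64_t>(kMaxFunctions, 0);
  // Hot path: a plain add, no lock and no atomic.
  void Record(uint32_t function_index, uint64_t elapsed) {
    local[function_index] += elapsed;
  }
  // Runs once when the guest thread exits; the only synchronized step.
  ~ThreadProfile() {
    std::lock_guard<std::mutex> guard(global_profile_mutex);
    for (size_t i = 0; i < kMaxFunctions; ++i) {
      global_profile[i] += local[i];
    }
  }
};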
|
||||
|
||||
void X64Emitter::MarkSourceOffset(const Instr* i) {
|
||||
auto entry = source_map_arena_.Alloc<SourceMapEntry>();
|
||||
|
@ -558,7 +605,7 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
|
|||
if (instr->flags & hir::CALL_TAIL) {
|
||||
// Since we skip the prolog we need to mark the return here.
|
||||
EmitTraceUserCallReturn();
|
||||
|
||||
EmitProfilerEpilogue();
|
||||
// Pass the callers return address over.
|
||||
mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
|
||||
|
||||
|
@ -602,7 +649,7 @@ void X64Emitter::CallIndirect(const hir::Instr* instr,
|
|||
if (instr->flags & hir::CALL_TAIL) {
|
||||
// Since we skip the prolog we need to mark the return here.
|
||||
EmitTraceUserCallReturn();
|
||||
|
||||
EmitProfilerEpilogue();
|
||||
// Pass the callers return address over.
|
||||
mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
|
||||
|
||||
|
@ -952,7 +999,34 @@ static const vec128_t xmm_consts[] = {
|
|||
/*XMMVSRShlByteshuf*/
|
||||
v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
|
||||
// XMMVSRMask
|
||||
vec128b(1)};
|
||||
vec128b(1),
|
||||
/*
|
||||
XMMF16UnpackLCPI2
|
||||
*/
|
||||
|
||||
vec128i(0x38000000),
|
||||
/*
|
||||
XMMF16UnpackLCPI3
|
||||
*/
|
||||
vec128q(0x7fe000007fe000ULL),
|
||||
|
||||
/* XMMF16PackLCPI0*/
|
||||
vec128i(0x8000000),
|
||||
/*XMMF16PackLCPI2*/
|
||||
vec128i(0x47ffe000),
|
||||
/*XMMF16PackLCPI3*/
|
||||
vec128i(0xc7800000),
|
||||
/*XMMF16PackLCPI4
|
||||
*/
|
||||
vec128i(0xf7fdfff),
|
||||
/*XMMF16PackLCPI5*/
|
||||
vec128i(0x7fff),
|
||||
/*
|
||||
XMMF16PackLCPI6
|
||||
*/
|
||||
vec128i(0x8000)
|
||||
|
||||
};
|
||||
|
||||
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
|
||||
for (auto& vec : xmm_consts) {
@ -159,7 +159,15 @@ enum XmmConst {
|
|||
XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3
|
||||
XMMXenosF16ExtRangeStart,
|
||||
XMMVSRShlByteshuf,
|
||||
XMMVSRMask
|
||||
XMMVSRMask,
|
||||
XMMF16UnpackLCPI2,  // 0x38000000, 1/32768
XMMF16UnpackLCPI3,  // 0x007fe000007fe000
|
||||
XMMF16PackLCPI0,
|
||||
XMMF16PackLCPI2,
|
||||
XMMF16PackLCPI3,
|
||||
XMMF16PackLCPI4,
|
||||
XMMF16PackLCPI5,
|
||||
XMMF16PackLCPI6
|
||||
};
|
||||
// X64Backend specific Instr->runtime_flags
|
||||
enum : uint32_t {
|
||||
|
@ -177,7 +185,7 @@ class XbyakAllocator : public Xbyak::Allocator {
|
|||
enum X64EmitterFeatureFlags {
|
||||
kX64EmitAVX2 = 1 << 0,
|
||||
kX64EmitFMA = 1 << 1,
|
||||
kX64EmitLZCNT = 1 << 2,
|
||||
kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount
|
||||
kX64EmitBMI1 = 1 << 3,
|
||||
kX64EmitBMI2 = 1 << 4,
|
||||
kX64EmitF16C = 1 << 5,
|
||||
|
@ -201,7 +209,11 @@ enum X64EmitterFeatureFlags {
|
|||
// inc/dec) do not introduce false dependencies on EFLAGS
|
||||
// because the individual flags are treated as different vars by
|
||||
// the processor. (this applies to zen)
|
||||
kX64EmitPrefetchW = 1 << 16
|
||||
kX64EmitPrefetchW = 1 << 16,
|
||||
kX64EmitXOP = 1 << 17, // chrispy: xop maps really well to many vmx
|
||||
// instructions, and FX users need the boost
|
||||
kX64EmitFMA4 = 1 << 18, // todo: also use on zen1?
|
||||
kX64EmitTBM = 1 << 19
|
||||
};
|
||||
class ResolvableGuestCall {
|
||||
public:
|
||||
|
@ -337,6 +349,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
|
||||
XexModule* GuestModule() { return guest_module_; }
|
||||
|
||||
void EmitProfilerEpilogue();
|
||||
|
||||
protected:
|
||||
void* Emplace(const EmitFunctionInfo& func_info,
|
||||
GuestFunction* function = nullptr);
|
||||
|
@ -352,7 +366,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
XexModule* guest_module_ = nullptr;
|
||||
Xbyak::util::Cpu cpu_;
|
||||
uint32_t feature_flags_ = 0;
|
||||
|
||||
uint32_t current_guest_function_ = 0;
|
||||
Xbyak::Label* epilog_label_ = nullptr;
|
||||
|
||||
hir::Instr* current_instr_ = nullptr;
|
||||
|
|
|
@ -19,10 +19,6 @@
|
|||
#include "xenia/base/cvar.h"
|
||||
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
|
||||
|
||||
DEFINE_bool(use_extended_range_half, true,
|
||||
"Emulate extended range half-precision, may be slower on games "
|
||||
"that use it heavily",
|
||||
"CPU");
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
namespace backend {
|
||||
|
@ -1982,6 +1978,137 @@ struct PERMUTE_V128
|
|||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_PERMUTE, PERMUTE_I32, PERMUTE_V128);
|
||||
|
||||
#define LCPI(name, quad1) const __m128i name = _mm_set1_epi32(quad1)
|
||||
// xmm0 is precasted to int, but contains float
|
||||
// chrispy: todo: make available to gpu code
|
||||
static __m128i xenos_float4_to_float16_x4(__m128i xmm0) {
|
||||
LCPI(LCPI0_0, 2147483647);
|
||||
LCPI(LCPI0_1, 1207951360);
|
||||
LCPI(LCPI0_2, 134217728);
|
||||
LCPI(LCPI0_3, 3347054592);
|
||||
LCPI(LCPI0_4, 260038655);
|
||||
LCPI(LCPI0_5, 32767);
|
||||
LCPI(LCPI0_6, 4294934528);
|
||||
|
||||
__m128i xmm1 = _mm_and_si128(xmm0, LCPI0_0);
|
||||
|
||||
__m128i xmm2 = LCPI0_1;
|
||||
|
||||
__m128i xmm3 = _mm_add_epi32(xmm0, LCPI0_2);
|
||||
xmm2 = _mm_cmpgt_epi32(xmm2, xmm1);
|
||||
xmm3 = _mm_srli_epi32(xmm3, 13);
|
||||
xmm1 = _mm_add_epi32(xmm1, LCPI0_3);
|
||||
__m128i xmm4 = _mm_min_epu32(xmm1, LCPI0_4);
|
||||
xmm1 = _mm_cmpeq_epi32(xmm1, xmm4);
|
||||
xmm4 = LCPI0_5;
|
||||
xmm3 = _mm_and_si128(xmm3, xmm4);
|
||||
xmm1 = _mm_and_si128(xmm1, xmm3);
|
||||
|
||||
xmm1 = _mm_castps_si128(_mm_blendv_ps(
|
||||
_mm_castsi128_ps(xmm4), _mm_castsi128_ps(xmm1), _mm_castsi128_ps(xmm2)));
|
||||
xmm0 = _mm_srli_epi32(xmm0, 16);
|
||||
xmm0 = _mm_and_si128(xmm0, LCPI0_6);
|
||||
xmm0 = _mm_or_si128(xmm1, xmm0);
|
||||
xmm0 = _mm_packus_epi32(xmm0, _mm_setzero_si128());
|
||||
return xmm0;
|
||||
}
|
||||
// returns floats, uncasted
|
||||
// chrispy: todo, make this available to gpu code?
|
||||
static __m128i xenos_halves_to_floats(__m128i xmm0) {
|
||||
LCPI(LCPI3_0, 0x1f);
|
||||
LCPI(LCPI3_1, 0x80000000);
|
||||
LCPI(LCPI3_2, 0x38000000);
|
||||
LCPI(LCPI3_3, 0x7fe000);
|
||||
|
||||
__m128i xmm1, xmm2, xmm3, xmm4;
|
||||
|
||||
xmm1 = _mm_cvtepu16_epi32(xmm0);
|
||||
|
||||
xmm2 = _mm_srli_epi32(xmm1, 10);
|
||||
|
||||
xmm2 = _mm_and_si128(xmm2, LCPI3_0);
|
||||
|
||||
xmm0 = _mm_cvtepi16_epi32(xmm0);
|
||||
|
||||
xmm0 = _mm_and_si128(xmm0, LCPI3_1);
|
||||
|
||||
xmm3 = _mm_setzero_si128();
|
||||
|
||||
xmm4 = _mm_slli_epi32(xmm2, 23);
|
||||
|
||||
xmm4 = _mm_add_epi32(xmm4, LCPI3_2);
|
||||
|
||||
xmm2 = _mm_cmpeq_epi32(xmm2, xmm3);
|
||||
|
||||
xmm1 = _mm_slli_epi32(xmm1, 13);
|
||||
|
||||
xmm1 = _mm_and_si128(xmm1, LCPI3_3);
|
||||
|
||||
xmm3 = _mm_andnot_si128(xmm2, xmm4);
|
||||
|
||||
xmm1 = _mm_andnot_si128(xmm2, xmm1);
|
||||
|
||||
xmm0 = _mm_or_si128(xmm1, xmm0);
|
||||
xmm0 = _mm_or_si128(xmm0, xmm3);
|
||||
return xmm0;
|
||||
}
|
||||
|
||||
#undef LCPI
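For comparison, a scalar sketch of what the vectorized helper above computes for a single lane, covering only the normal and zero cases: shift the 5-bit exponent into float position, rebias it by adding 0x38000000, move the 10 mantissa bits up by 13, and keep the sign; a zero exponent produces zero, matching the cmpeq mask. The extended-range and infinity handling of the full helper is omitted.

#include <cstdint>
#include <cstdio>
#include <cstring>

float half_bits_to_float(uint16_t h) {
  uint32_t sign = (static_cast<uint32_t>(h) & 0x8000u) << 16;
  uint32_t exponent = (h >> 10) & 0x1Fu;
  uint32_t bits = 0;
  if (exponent != 0) {
    uint32_t mantissa = (static_cast<uint32_t>(h) << 13) & 0x007FE000u;
    bits = (exponent << 23) + 0x38000000u + mantissa;
  }
  bits |= sign;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  std::printf("%f\n", half_bits_to_float(0x3C00));  // 1.0
  std::printf("%f\n", half_bits_to_float(0xC000));  // -2.0
  return 0;
}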
|
||||
template <typename Inst>
|
||||
static void emit_fast_f16_unpack(X64Emitter& e, const Inst& i,
|
||||
XmmConst initial_shuffle) {
|
||||
auto src1 = i.src1;
|
||||
|
||||
e.vpshufb(i.dest, src1, e.GetXmmConstPtr(initial_shuffle));
|
||||
e.vpmovsxwd(e.xmm1, i.dest);
|
||||
|
||||
e.vpsrld(e.xmm2, e.xmm1, 10);
|
||||
e.vpmovsxwd(e.xmm0, i.dest);
|
||||
e.vpand(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMSignMaskPS));
|
||||
e.vpand(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMPermuteByteMask));
|
||||
|
||||
e.vpslld(e.xmm3, e.xmm2, 23);
|
||||
|
||||
e.vpaddd(e.xmm3, e.xmm3, e.GetXmmConstPtr(XMMF16UnpackLCPI2));
|
||||
|
||||
e.vpcmpeqd(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMZero));
|
||||
|
||||
e.vpslld(e.xmm1, e.xmm1, 13);
|
||||
|
||||
e.vpandn(e.xmm1, e.xmm2, e.xmm1);
|
||||
e.vpandn(e.xmm2, e.xmm2, e.xmm3);
|
||||
|
||||
e.vpand(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMF16UnpackLCPI3));
|
||||
e.vpor(e.xmm0, e.xmm1, e.xmm0);
|
||||
e.vpor(i.dest, e.xmm0, e.xmm2);
|
||||
}
|
||||
template <typename Inst>
|
||||
static void emit_fast_f16_pack(X64Emitter& e, const Inst& i,
|
||||
XmmConst final_shuffle) {
|
||||
e.vpaddd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMF16PackLCPI0));
|
||||
e.vpand(e.xmm2, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
|
||||
e.vmovdqa(e.xmm3, e.GetXmmConstPtr(XMMF16PackLCPI2));
|
||||
|
||||
e.vpcmpgtd(e.xmm3, e.xmm3, e.xmm2);
|
||||
e.vpsrld(e.xmm1, e.xmm1, 13);
|
||||
|
||||
e.vpaddd(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMF16PackLCPI3));
|
||||
e.vpminud(e.xmm0, e.xmm2, e.GetXmmConstPtr(XMMF16PackLCPI4));
|
||||
|
||||
e.vpcmpeqd(e.xmm2, e.xmm2, e.xmm0);
|
||||
e.vmovdqa(e.xmm0, e.GetXmmConstPtr(XMMF16PackLCPI5));
|
||||
e.vpand(e.xmm1, e.xmm1, e.xmm0);
|
||||
e.vpand(e.xmm1, e.xmm2, e.xmm1);
|
||||
e.vpxor(e.xmm2, e.xmm2, e.xmm2);
|
||||
|
||||
e.vblendvps(e.xmm1, e.xmm0, e.xmm1, e.xmm3);
|
||||
|
||||
e.vpsrld(e.xmm0, i.src1, 16);
|
||||
e.vpand(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMF16PackLCPI6));
|
||||
e.vorps(e.xmm0, e.xmm1, e.xmm0);
|
||||
e.vpackusdw(i.dest, e.xmm0, e.xmm2);
|
||||
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(final_shuffle));
|
||||
}
|
||||
// ============================================================================
|
||||
// OPCODE_SWIZZLE
|
||||
// ============================================================================
|
||||
|
@ -2081,14 +2208,9 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
|||
alignas(16) uint16_t b[8];
|
||||
_mm_store_ps(a, src1);
|
||||
std::memset(b, 0, sizeof(b));
|
||||
if (!cvars::use_extended_range_half) {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[7 - i] = half_float::detail::float2half<std::round_toward_zero>(a[i]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[7 - i] = float_to_xenos_half(a[i]);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[7 - i] = float_to_xenos_half(a[i]);
|
||||
}
|
||||
|
||||
return _mm_load_si128(reinterpret_cast<__m128i*>(b));
|
||||
|
@ -2098,70 +2220,26 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
|||
// http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx
|
||||
// dest = [(src1.x | src1.y), 0, 0, 0]
|
||||
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
|
||||
Xmm src;
|
||||
if (i.src1.is_constant) {
|
||||
src = i.dest;
|
||||
e.LoadConstantXmm(src, i.src1.constant());
|
||||
} else {
|
||||
src = i.src1;
|
||||
}
|
||||
// 0|0|0|0|W|Z|Y|X
|
||||
e.vcvtps2ph(i.dest, src, 0b00000011);
|
||||
// Shuffle to X|Y|0|0|0|0|0|0
|
||||
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_2));
|
||||
if (i.src1.is_constant) {
|
||||
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
|
||||
} else {
|
||||
if (i.src1.is_constant) {
|
||||
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
}
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
}
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
}
|
||||
static __m128i EmulateFLOAT16_4(void*, __m128 src1) {
|
||||
alignas(16) float a[4];
|
||||
alignas(16) uint16_t b[8];
|
||||
_mm_store_ps(a, src1);
|
||||
std::memset(b, 0, sizeof(b));
|
||||
if (!cvars::use_extended_range_half) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
b[7 - (i ^ 2)] =
|
||||
half_float::detail::float2half<std::round_toward_zero>(a[i]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
b[7 - (i ^ 2)] = float_to_xenos_half(a[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return _mm_load_si128(reinterpret_cast<__m128i*>(b));
|
||||
}
|
||||
static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) {
|
||||
assert_true(i.src2.value->IsConstantZero());
|
||||
// dest = [(src1.z | src1.w), (src1.x | src1.y), 0, 0]
|
||||
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
|
||||
Xmm src;
|
||||
if (i.src1.is_constant) {
|
||||
src = i.dest;
|
||||
e.LoadConstantXmm(src, i.src1.constant());
|
||||
} else {
|
||||
src = i.src1;
|
||||
}
|
||||
// 0|0|0|0|W|Z|Y|X
|
||||
e.vcvtps2ph(i.dest, src, 0b00000011);
|
||||
// Shuffle to Z|W|X|Y|0|0|0|0
|
||||
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_4));
|
||||
if (!i.src1.is_constant) {
|
||||
emit_fast_f16_pack(e, i, XMMPackFLOAT16_4);
|
||||
} else {
|
||||
if (i.src1.is_constant) {
|
||||
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
vec128_t result = vec128b(0);
|
||||
for (unsigned idx = 0; idx < 4; ++idx) {
|
||||
result.u16[(7 - (idx ^ 2))] =
|
||||
float_to_xenos_half(i.src1.constant().f32[idx]);
|
||||
}
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_4));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
|
||||
e.LoadConstantXmm(i.dest, result);
|
||||
}
|
||||
}
|
||||
static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) {
|
||||
|
@ -2508,15 +2586,10 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
|||
alignas(16) float b[4];
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
|
||||
|
||||
if (!cvars::use_extended_range_half) {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[i] = xenos_half_to_float(a[VEC128_W(6 + i)]);
|
||||
}
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[i] = xenos_half_to_float(a[VEC128_W(6 + i)]);
|
||||
}
|
||||
|
||||
// Constants, or something
|
||||
b[2] = 0.f;
|
||||
b[3] = 1.f;
|
||||
|
@ -2536,74 +2609,28 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
|||
// Also zero out the high end.
|
||||
// TODO(benvanik): special case constant unpacks that just get 0/1/etc.
|
||||
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C) &&
|
||||
!cvars::use_extended_range_half) { // todo: can use cvtph and bit logic
|
||||
// to implement
|
||||
Xmm src;
|
||||
if (i.src1.is_constant) {
|
||||
src = i.dest;
|
||||
e.LoadConstantXmm(src, i.src1.constant());
|
||||
} else {
|
||||
src = i.src1;
|
||||
}
|
||||
// sx = src.iw >> 16;
|
||||
// sy = src.iw & 0xFFFF;
|
||||
// dest = { XMConvertHalfToFloat(sx),
|
||||
// XMConvertHalfToFloat(sy),
|
||||
// 0.0,
|
||||
// 1.0 };
|
||||
// Shuffle to 0|0|0|0|0|0|Y|X
|
||||
e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_2));
|
||||
e.vcvtph2ps(i.dest, i.dest);
|
||||
e.vpshufd(i.dest, i.dest, 0b10100100);
|
||||
e.vpor(i.dest, e.GetXmmConstPtr(XMM0001));
|
||||
if (i.src1.is_constant) {
|
||||
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
|
||||
} else {
|
||||
if (i.src1.is_constant) {
|
||||
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
}
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
}
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
}
|
||||
static __m128 EmulateFLOAT16_4(void*, __m128i src1) {
|
||||
alignas(16) uint16_t a[8];
|
||||
alignas(16) float b[4];
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
|
||||
if (!cvars::use_extended_range_half) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
b[i] = xenos_half_to_float(a[VEC128_W(4 + i)]);
|
||||
}
|
||||
}
|
||||
|
||||
return _mm_load_ps(b);
|
||||
}
|
||||
static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) {
|
||||
// src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0]
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
|
||||
Xmm src;
|
||||
if (i.src1.is_constant) {
|
||||
src = i.dest;
|
||||
e.LoadConstantXmm(src, i.src1.constant());
|
||||
} else {
|
||||
src = i.src1;
|
||||
if (i.src1.is_constant) {
|
||||
vec128_t result{};
|
||||
|
||||
for (int idx = 0; idx < 4; ++idx) {
|
||||
result.f32[idx] =
|
||||
xenos_half_to_float(i.src1.constant().u16[VEC128_W(4 + idx)]);
|
||||
}
|
||||
// Shuffle to 0|0|0|0|W|Z|Y|X
|
||||
e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_4));
|
||||
e.vcvtph2ps(i.dest, i.dest);
|
||||
|
||||
e.LoadConstantXmm(i.dest, result);
|
||||
|
||||
} else {
|
||||
if (i.src1.is_constant) {
|
||||
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
}
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_4));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
emit_fast_f16_unpack(e, i, XMMUnpackFLOAT16_4);
|
||||
}
|
||||
}
|
||||
static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) {
@ -50,6 +50,10 @@ DEFINE_bool(no_round_to_single, false,
|
|||
"Not for users, breaks games. Skip rounding double values to "
|
||||
"single precision and back",
|
||||
"CPU");
|
||||
DEFINE_bool(
|
||||
inline_loadclock, false,
|
||||
"Directly read cached guest clock without calling the LoadClock method (it gets repeatedly updated by calls from other threads)",
|
||||
"CPU");
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
namespace backend {
|
||||
|
@ -475,33 +479,39 @@ EMITTER_OPCODE_TABLE(OPCODE_ROUND, ROUND_F32, ROUND_F64, ROUND_V128);
|
|||
// ============================================================================
|
||||
struct LOAD_CLOCK : Sequence<LOAD_CLOCK, I<OPCODE_LOAD_CLOCK, I64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
// When scaling is disabled and the raw clock source is selected, the code
|
||||
// in the Clock class is actually just forwarding tick counts after one
|
||||
// simple multiply and division. In that case we rather bake the scaling in
|
||||
// here to cut extra function calls with CPU cache misses and stack frame
|
||||
// overhead.
|
||||
if (cvars::clock_no_scaling && cvars::clock_source_raw) {
|
||||
auto ratio = Clock::guest_tick_ratio();
|
||||
// The 360 CPU is an in-order CPU, AMD64 usually isn't. Without
|
||||
// mfence/lfence magic the rdtsc instruction can be executed sooner or
|
||||
// later in the cache window. Since it's resolution however is much higher
|
||||
// than the 360's mftb instruction this can safely be ignored.
|
||||
|
||||
// Read time stamp in edx (high part) and eax (low part).
|
||||
e.rdtsc();
|
||||
// Make it a 64 bit number in rax.
|
||||
e.shl(e.rdx, 32);
|
||||
e.or_(e.rax, e.rdx);
|
||||
// Apply tick frequency scaling.
|
||||
e.mov(e.rcx, ratio.first);
|
||||
e.mul(e.rcx);
|
||||
// We actually now have a 128 bit number in rdx:rax.
|
||||
e.mov(e.rcx, ratio.second);
|
||||
e.div(e.rcx);
|
||||
e.mov(i.dest, e.rax);
|
||||
if (cvars::inline_loadclock) {
|
||||
e.mov(e.rcx,
|
||||
e.GetBackendCtxPtr(offsetof(X64BackendContext, guest_tick_count)));
|
||||
e.mov(i.dest, e.qword[e.rcx]);
|
||||
} else {
|
||||
e.CallNative(LoadClock);
|
||||
e.mov(i.dest, e.rax);
|
||||
// When scaling is disabled and the raw clock source is selected, the code
|
||||
// in the Clock class is actually just forwarding tick counts after one
|
||||
// simple multiply and division. In that case we rather bake the scaling
|
||||
// in here to cut extra function calls with CPU cache misses and stack
|
||||
// frame overhead.
|
||||
if (cvars::clock_no_scaling && cvars::clock_source_raw) {
|
||||
auto ratio = Clock::guest_tick_ratio();
|
||||
// The 360 CPU is an in-order CPU, AMD64 usually isn't. Without
|
||||
// mfence/lfence magic the rdtsc instruction can be executed sooner or
|
||||
// later in the cache window. Since its resolution, however, is much
// higher than the 360's mftb instruction, this can safely be ignored.
|
||||
|
||||
// Read time stamp in edx (high part) and eax (low part).
|
||||
e.rdtsc();
|
||||
// Make it a 64 bit number in rax.
|
||||
e.shl(e.rdx, 32);
|
||||
e.or_(e.rax, e.rdx);
|
||||
// Apply tick frequency scaling.
|
||||
e.mov(e.rcx, ratio.first);
|
||||
e.mul(e.rcx);
|
||||
// We actually now have a 128 bit number in rdx:rax.
|
||||
e.mov(e.rcx, ratio.second);
|
||||
e.div(e.rcx);
|
||||
e.mov(i.dest, e.rax);
|
||||
} else {
|
||||
e.CallNative(LoadClock);
|
||||
e.mov(i.dest, e.rax);
|
||||
}
|
||||
}
|
||||
}
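A host-side sketch of the same scaling the raw-clock rdtsc path performs: multiply the TSC value by the ratio numerator and divide by the denominator while keeping a 128-bit intermediate so large tick counts do not overflow. The frequencies are made-up example values and the 128-bit type is the GCC/Clang extension, so this is illustrative only.

#include <cstdint>
#include <cstdio>
#include <x86intrin.h>

int main() {
  const uint64_t guest_freq = 50000000ull;   // assumed guest tick frequency
  const uint64_t host_freq = 3400000000ull;  // assumed host TSC frequency
  uint64_t tsc = __rdtsc();
  // 128-bit intermediate, mirroring the mul rcx / div rcx pair emitted above.
  unsigned __int128 scaled = static_cast<unsigned __int128>(tsc) * guest_freq;
  uint64_t guest_ticks = static_cast<uint64_t>(scaled / host_freq);
  std::printf("guest ticks: %llu\n",
              static_cast<unsigned long long>(guest_ticks));
  return 0;
}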
|
||||
static uint64_t LoadClock(void* raw_context) {
|
||||
|
@ -539,10 +549,12 @@ struct MAX_F64 : Sequence<MAX_F64, I<OPCODE_MAX, F64Op, F64Op, F64Op>> {
|
|||
struct MAX_V128 : Sequence<MAX_V128, I<OPCODE_MAX, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
EmitCommutativeBinaryXmmOp(e, i,
|
||||
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
||||
e.vmaxps(dest, src1, src2);
|
||||
});
|
||||
// If the inputs are +0 and -0, return +0 (the opposite of the min path);
// see the standalone sketch after this opcode table.
|
||||
auto src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
auto src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
e.vmaxps(e.xmm2, src1, src2);
|
||||
e.vmaxps(e.xmm3, src2, src1);
|
||||
e.vandps(i.dest, e.xmm2, e.xmm3);
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_MAX, MAX_F32, MAX_F64, MAX_V128);
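A standalone illustration of the +0/-0 handling above: _mm_max_ps returns its second operand when the inputs compare equal, so the two operand orders differ only in the sign of zero, and ANDing the two results clears the sign bit (the min path ORs them to keep -0 instead).

#include <immintrin.h>
#include <cstdio>

int main() {
  __m128 pos_zero = _mm_set1_ps(0.0f);
  __m128 neg_zero = _mm_set1_ps(-0.0f);
  __m128 ab = _mm_max_ps(pos_zero, neg_zero);  // -0: second operand wins
  __m128 ba = _mm_max_ps(neg_zero, pos_zero);  // +0
  __m128 fixed = _mm_and_ps(ab, ba);           // sign bit cleared: +0
  float out[4];
  _mm_storeu_ps(out, fixed);
  std::printf("%+f\n", out[0]);  // prints +0.000000
  return 0;
}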
|
||||
|
@ -597,10 +609,11 @@ struct MIN_F64 : Sequence<MIN_F64, I<OPCODE_MIN, F64Op, F64Op, F64Op>> {
|
|||
struct MIN_V128 : Sequence<MIN_V128, I<OPCODE_MIN, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
EmitCommutativeBinaryXmmOp(e, i,
|
||||
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
||||
e.vminps(dest, src1, src2);
|
||||
});
|
||||
auto src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
auto src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
e.vminps(e.xmm2, src1, src2);
|
||||
e.vminps(e.xmm3, src2, src1);
|
||||
e.vorps(i.dest, e.xmm2, e.xmm3);
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_MIN, MIN_I8, MIN_I16, MIN_I32, MIN_I64, MIN_F32,
|
||||
|
@ -768,6 +781,7 @@ struct SELECT_V128_V128
|
|||
} else if (mayblend == PermittedBlend::Ps) {
|
||||
e.vblendvps(i.dest, src2, src3, src1);
|
||||
} else {
|
||||
//ideally we would have an xop path here...
|
||||
// src1 ? src2 : src3;
|
||||
e.vpandn(e.xmm3, src1, src2);
|
||||
e.vpand(i.dest, src1, src3);
|
||||
|
@ -1932,6 +1946,53 @@ struct MUL_ADD_V128
|
|||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);
|
||||
|
||||
struct NEGATED_MUL_ADD_F64
|
||||
: Sequence<NEGATED_MUL_ADD_F64,
|
||||
I<OPCODE_NEGATED_MUL_ADD, F64Op, F64Op, F64Op, F64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Fpu);
|
||||
|
||||
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
|
||||
if (e.IsFeatureEnabled(kX64EmitFMA)) {
|
||||
// todo: this is garbage
|
||||
e.vmovapd(e.xmm3, src1);
|
||||
e.vfmadd213sd(e.xmm3, src2, src3);
|
||||
e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
|
||||
} else {
|
||||
// todo: might need to use x87 in this case...
|
||||
e.vmulsd(e.xmm3, src1, src2);
|
||||
e.vaddsd(i.dest, e.xmm3, src3);
|
||||
e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
|
||||
}
|
||||
}
|
||||
};
|
||||
struct NEGATED_MUL_ADD_V128
|
||||
: Sequence<NEGATED_MUL_ADD_V128,
|
||||
I<OPCODE_NEGATED_MUL_ADD, V128Op, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
|
||||
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
|
||||
if (e.IsFeatureEnabled(kX64EmitFMA)) {
|
||||
// todo: this is garbage
|
||||
e.vmovaps(e.xmm3, src1);
|
||||
e.vfmadd213ps(e.xmm3, src2, src3);
|
||||
e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
|
||||
} else {
|
||||
// todo: might need to use x87 in this case...
|
||||
e.vmulps(e.xmm3, src1, src2);
|
||||
e.vaddps(i.dest, e.xmm3, src3);
|
||||
e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
|
||||
}
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_ADD, NEGATED_MUL_ADD_F64,
|
||||
NEGATED_MUL_ADD_V128);
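The FMA path above is equivalent to computing the fused multiply-add and then flipping the sign bit of the rounded result; a scalar sketch:

#include <cmath>
#include <cstdio>

double negated_mul_add(double a, double b, double c) {
  // One rounding step from fma, then a sign flip, i.e. -((a * b) + c).
  return -std::fma(a, b, c);
}

int main() {
  std::printf("%f\n", negated_mul_add(2.0, 3.0, 4.0));  // prints -10.000000
  return 0;
}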
|
||||
|
||||
// ============================================================================
|
||||
// OPCODE_MUL_SUB
|
||||
// ============================================================================
|
||||
|
@ -1944,12 +2005,7 @@ EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);
|
|||
// - 132 -> $1 = $1 * $3 - $2
|
||||
// - 213 -> $1 = $2 * $1 - $3
|
||||
// - 231 -> $1 = $2 * $3 - $1
|
||||
struct MUL_SUB_F32
|
||||
: Sequence<MUL_SUB_F32, I<OPCODE_MUL_SUB, F32Op, F32Op, F32Op, F32Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
assert_impossible_sequence(MUL_SUB_F32);
|
||||
}
|
||||
};
|
||||
|
||||
struct MUL_SUB_F64
|
||||
: Sequence<MUL_SUB_F64, I<OPCODE_MUL_SUB, F64Op, F64Op, F64Op, F64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
|
@ -1991,7 +2047,54 @@ struct MUL_SUB_V128
|
|||
}
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F32, MUL_SUB_F64, MUL_SUB_V128);
|
||||
EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F64, MUL_SUB_V128);
|
||||
|
||||
struct NEGATED_MUL_SUB_F64
|
||||
: Sequence<NEGATED_MUL_SUB_F64,
|
||||
I<OPCODE_NEGATED_MUL_SUB, F64Op, F64Op, F64Op, F64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Fpu);
|
||||
|
||||
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
|
||||
if (e.IsFeatureEnabled(kX64EmitFMA)) {
|
||||
// todo: this is garbage
|
||||
e.vmovapd(e.xmm3, src1);
|
||||
e.vfmsub213sd(e.xmm3, src2, src3);
|
||||
e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
|
||||
} else {
|
||||
// todo: might need to use x87 in this case...
|
||||
e.vmulsd(e.xmm3, src1, src2);
|
||||
e.vsubsd(i.dest, e.xmm3, src3);
|
||||
e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
|
||||
}
|
||||
}
|
||||
};
|
||||
struct NEGATED_MUL_SUB_V128
|
||||
: Sequence<NEGATED_MUL_SUB_V128,
|
||||
I<OPCODE_NEGATED_MUL_SUB, V128Op, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
|
||||
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
|
||||
if (e.IsFeatureEnabled(kX64EmitFMA)) {
|
||||
// todo: this is garbage
|
||||
e.vmovaps(e.xmm3, src1);
|
||||
e.vfmsub213ps(e.xmm3, src2, src3);
|
||||
e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
|
||||
} else {
|
||||
// todo: might need to use x87 in this case...
|
||||
e.vmulps(e.xmm3, src1, src2);
|
||||
e.vsubps(i.dest, e.xmm3, src3);
|
||||
e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
|
||||
}
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_SUB, NEGATED_MUL_SUB_F64,
|
||||
NEGATED_MUL_SUB_V128);
|
||||
|
||||
// ============================================================================
|
||||
// OPCODE_NEG
|
||||
|
@ -2264,7 +2367,7 @@ struct DOT_PRODUCT_3_V128
|
|||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
// todo: add fast_dot_product path that just checks for infinity instead of
|
||||
// using mxcsr
|
||||
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64];
|
||||
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH];
|
||||
|
||||
// this is going to hurt a bit...
|
||||
/*
|
||||
|
@ -2380,7 +2483,7 @@ struct DOT_PRODUCT_4_V128
|
|||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
// todo: add fast_dot_product path that just checks for infinity instead of
|
||||
// using mxcsr
|
||||
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64];
|
||||
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH];
|
||||
|
||||
bool is_lensqr = i.instr->src1.value == i.instr->src2.value;
|
||||
|
||||
|
@ -3162,9 +3265,9 @@ struct SET_ROUNDING_MODE_I32
|
|||
// backends dont have to worry about it
|
||||
if (i.src1.is_constant) {
|
||||
e.mov(e.eax, mxcsr_table[i.src1.constant()]);
|
||||
e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH64], e.eax);
|
||||
e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH], e.eax);
|
||||
e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.eax);
|
||||
e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH64]);
|
||||
e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH]);
|
||||
|
||||
} else {
|
||||
e.mov(e.ecx, i.src1);
@ -123,7 +123,10 @@ class StackLayout {
|
|||
*/
|
||||
static const size_t GUEST_STACK_SIZE = 104;
|
||||
// Was GUEST_CTX_HOME; can't be removed because that would throw stack
// alignment off. Instead it can be used as a temporary in sequences.
|
||||
static const size_t GUEST_SCRATCH64 = 80;
|
||||
static const size_t GUEST_SCRATCH = 0;
|
||||
|
||||
// When profiling is on, this stores the timestamp sampled at the start of the function.
|
||||
static const size_t GUEST_PROFILER_START = 80;
|
||||
static const size_t GUEST_RET_ADDR = 88;
|
||||
static const size_t GUEST_CALL_RET_ADDR = 96;
|
||||
};
|
||||
|
|
|
@ -600,6 +600,9 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
break;
|
||||
case OPCODE_MAX:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
if (should_skip_because_of_float) {
|
||||
break;
|
||||
}
|
||||
v->set_from(i->src1.value);
|
||||
v->Max(i->src2.value);
|
||||
i->Remove();
|
||||
|
|
|
@ -1636,15 +1636,7 @@ Value* HIRBuilder::Div(Value* value1, Value* value2,
|
|||
Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
ASSERT_TYPES_EQUAL(value1, value3);
|
||||
#if 0
|
||||
bool c1 = value1->IsConstant();
|
||||
bool c2 = value2->IsConstant();
|
||||
if (c1 && c2) {
|
||||
Value* dest = CloneValue(value1);
|
||||
dest->Mul(value2);
|
||||
return Add(dest, value3);
|
||||
}
|
||||
#endif
|
||||
|
||||
Instr* i = AppendInstr(OPCODE_MUL_ADD_info, 0, AllocValue(value1->type));
|
||||
i->set_src1(value1);
|
||||
i->set_src2(value2);
|
||||
|
@ -1655,15 +1647,7 @@ Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
|
|||
Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
ASSERT_TYPES_EQUAL(value1, value3);
|
||||
#if 0
|
||||
bool c1 = value1->IsConstant();
|
||||
bool c2 = value2->IsConstant();
|
||||
if (c1 && c2) {
|
||||
Value* dest = CloneValue(value1);
|
||||
dest->Mul(value2);
|
||||
return Sub(dest, value3);
|
||||
}
|
||||
#endif
|
||||
|
||||
Instr* i = AppendInstr(OPCODE_MUL_SUB_info, 0, AllocValue(value1->type));
|
||||
i->set_src1(value1);
|
||||
i->set_src2(value2);
|
||||
|
@ -1671,6 +1655,30 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
|
|||
return i->dest;
|
||||
}
|
||||
|
||||
Value* HIRBuilder::NegatedMulAdd(Value* value1, Value* value2, Value* value3) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
ASSERT_TYPES_EQUAL(value1, value3);
|
||||
|
||||
Instr* i =
|
||||
AppendInstr(OPCODE_NEGATED_MUL_ADD_info, 0, AllocValue(value1->type));
|
||||
i->set_src1(value1);
|
||||
i->set_src2(value2);
|
||||
i->set_src3(value3);
|
||||
return i->dest;
|
||||
}
|
||||
|
||||
Value* HIRBuilder::NegatedMulSub(Value* value1, Value* value2, Value* value3) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
ASSERT_TYPES_EQUAL(value1, value3);
|
||||
|
||||
Instr* i =
|
||||
AppendInstr(OPCODE_NEGATED_MUL_SUB_info, 0, AllocValue(value1->type));
|
||||
i->set_src1(value1);
|
||||
i->set_src2(value2);
|
||||
i->set_src3(value3);
|
||||
return i->dest;
|
||||
}
|
||||
|
||||
Value* HIRBuilder::Neg(Value* value) {
|
||||
Instr* i = AppendInstr(OPCODE_NEG_info, 0, AllocValue(value->type));
|
||||
i->set_src1(value);
|
||||
|
|
|
@ -214,6 +214,10 @@ class HIRBuilder {
|
|||
Value* Div(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
|
||||
Value* MulAdd(Value* value1, Value* value2, Value* value3); // (1 * 2) + 3
|
||||
Value* MulSub(Value* value1, Value* value2, Value* value3); // (1 * 2) - 3
|
||||
Value* NegatedMulAdd(Value* value1, Value* value2,
|
||||
Value* value3); // -((1 * 2) + 3)
|
||||
Value* NegatedMulSub(Value* value1, Value* value2,
|
||||
Value* value3); // -((1 * 2) - 3)
|
||||
Value* Neg(Value* value);
|
||||
Value* Abs(Value* value);
|
||||
Value* Sqrt(Value* value);
|
||||
|
@ -265,6 +269,7 @@ class HIRBuilder {
|
|||
Value* AtomicAdd(Value* address, Value* value);
|
||||
Value* AtomicSub(Value* address, Value* value);
|
||||
void SetNJM(Value* value);
|
||||
|
||||
protected:
|
||||
void DumpValue(StringBuffer* str, Value* value);
|
||||
void DumpOp(StringBuffer* str, OpcodeSignatureType sig_type, Instr::Op* op);
|
||||
|
|
|
@ -208,6 +208,12 @@ enum Opcode {
|
|||
OPCODE_STORE_OFFSET,
|
||||
OPCODE_LOAD,
|
||||
OPCODE_STORE,
|
||||
// chrispy: todo: implement, our current codegen for the unaligned loads is
|
||||
// very bad
|
||||
OPCODE_LVLX,
|
||||
OPCODE_LVRX,
|
||||
OPCODE_STVLX,
|
||||
OPCODE_STVRX,
|
||||
OPCODE_MEMSET,
|
||||
OPCODE_CACHE_CONTROL,
|
||||
OPCODE_MEMORY_BARRIER,
|
||||
|
@ -244,7 +250,9 @@ enum Opcode {
|
|||
OPCODE_MUL_HI, // TODO(benvanik): remove this and add INT128 type.
|
||||
OPCODE_DIV,
|
||||
OPCODE_MUL_ADD,
|
||||
OPCODE_NEGATED_MUL_ADD,
|
||||
OPCODE_MUL_SUB,
|
||||
OPCODE_NEGATED_MUL_SUB,
|
||||
OPCODE_NEG,
|
||||
OPCODE_ABS,
|
||||
OPCODE_SQRT,
|
||||
|
@ -284,7 +292,8 @@ enum Opcode {
|
|||
OPCODE_TO_SINGLE, // i could not find a decent name to assign to this opcode,
|
||||
// as we already have OPCODE_ROUND. round double to float (
|
||||
// ppc "single" fpu instruction result rounding behavior )
|
||||
OPCODE_SET_NJM,
|
||||
OPCODE_SET_NJM,
|
||||
|
||||
__OPCODE_MAX_VALUE, // Keep at end.
|
||||
};
|
||||
|
||||
|
|
|
@ -464,6 +464,19 @@ DEFINE_OPCODE(
|
|||
OPCODE_SIG_V_V_V_V,
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_NEGATED_MUL_ADD,
|
||||
"negated_mul_add",
|
||||
OPCODE_SIG_V_V_V_V,
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_NEGATED_MUL_SUB,
|
||||
"negated_mul_sub",
|
||||
OPCODE_SIG_V_V_V_V,
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_NEG,
|
||||
"neg",
|
||||
|
@ -692,4 +705,5 @@ DEFINE_OPCODE(
|
|||
"set_njm",
|
||||
OPCODE_SIG_X_V,
|
||||
0
|
||||
)
|
||||
)
|
||||
|
||||
|
|
|
@ -613,10 +613,12 @@ class Value {
|
|||
// returns true if every single use is as an operand to a single instruction
|
||||
// (add var2, var1, var1)
|
||||
bool AllUsesByOneInsn() const;
|
||||
//the maybe is here because this includes vec128, which is untyped data that can be treated as float or int depending on the context
|
||||
// the maybe is here because this includes vec128, which is untyped data that
|
||||
// can be treated as float or int depending on the context
|
||||
bool MaybeFloaty() const {
|
||||
return type == FLOAT32_TYPE || type == FLOAT64_TYPE || type == VEC128_TYPE;
|
||||
}
|
||||
|
||||
private:
|
||||
static bool CompareInt8(Opcode opcode, Value* a, Value* b);
|
||||
static bool CompareInt16(Opcode opcode, Value* a, Value* b);
|
||||
|
|
|
@ -48,7 +48,7 @@ class MMIOHandler {
|
|||
typedef uint32_t (*HostToGuestVirtual)(const void* context,
|
||||
const void* host_address);
|
||||
typedef bool (*AccessViolationCallback)(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
global_unique_lock_type global_lock_locked_once,  // not passed by const reference like the others?
|
||||
void* context, void* host_address, bool is_write);
|
||||
|
||||
// access_violation_callback is called with global_critical_region locked once
|
||||
|
|
|
@ -14,8 +14,8 @@
|
|||
#include <mutex>
|
||||
#include <string>
|
||||
|
||||
#include "xenia/base/mutex.h"
|
||||
#include "xenia/base/vec128.h"
|
||||
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
class Processor;
|
||||
|
@ -390,9 +390,11 @@ typedef struct alignas(64) PPCContext_s {
|
|||
// These are split to make it easier to do DCE on unused stores.
|
||||
uint64_t cr() const;
|
||||
void set_cr(uint64_t value);
|
||||
|
||||
// todo: remove, saturation should be represented by a vector
|
||||
uint8_t vscr_sat;
|
||||
|
||||
uint32_t vrsave;
|
||||
|
||||
// uint32_t get_fprf() {
|
||||
// return fpscr.value & 0x000F8000;
|
||||
// }
|
||||
|
@ -405,7 +407,7 @@ typedef struct alignas(64) PPCContext_s {
|
|||
|
||||
// Global interrupt lock, held while interrupts are disabled or interrupts are
|
||||
// executing. This is shared among all threads and comes from the processor.
|
||||
std::recursive_mutex* global_mutex;
|
||||
global_mutex_type* global_mutex;
|
||||
|
||||
// Used to shuttle data into externs. Contents volatile.
|
||||
uint64_t scratch;
|
||||
|
|
|
@ -1197,7 +1197,7 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb,
|
|||
Value* b = f.VectorDenormFlush(f.LoadVR(vb));
|
||||
Value* c = f.VectorDenormFlush(f.LoadVR(vc));
|
||||
|
||||
Value* v = f.Neg(f.MulSub(a, c, b));
|
||||
Value* v = f.NegatedMulSub(a, c, b);
|
||||
f.StoreVR(vd, v);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -16,6 +16,12 @@
|
|||
#include "xenia/cpu/ppc/ppc_hir_builder.h"
|
||||
|
||||
#include <stddef.h>
|
||||
// chrispy: added this, we can have simpler control flow and do dce on the
|
||||
// inputs
|
||||
DEFINE_bool(ignore_trap_instructions, true,
|
||||
"Generate no code for powerpc trap instructions, can result in "
|
||||
"better performance in games that aggressively check with trap.",
|
||||
"CPU");
|
||||
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
|
@ -449,6 +455,9 @@ constexpr uint32_t TRAP_SLT = 1 << 4, TRAP_SGT = 1 << 3, TRAP_EQ = 1 << 2,
|
|||
|
||||
int InstrEmit_trap(PPCHIRBuilder& f, const InstrData& i, Value* va, Value* vb,
|
||||
uint32_t TO) {
|
||||
if (cvars::ignore_trap_instructions) {
|
||||
return 0;
|
||||
}
|
||||
// if (a < b) & TO[0] then TRAP
|
||||
// if (a > b) & TO[1] then TRAP
|
||||
// if (a = b) & TO[2] then TRAP
|
||||
|
@ -521,6 +530,9 @@ int InstrEmit_trap(PPCHIRBuilder& f, const InstrData& i, Value* va, Value* vb,
|
|||
}
|
||||
|
||||
int InstrEmit_td(PPCHIRBuilder& f, const InstrData& i) {
|
||||
if (cvars::ignore_trap_instructions) {
|
||||
return 0;
|
||||
}
|
||||
// a <- (RA)
|
||||
// b <- (RB)
|
||||
// if (a < b) & TO[0] then TRAP
|
||||
|
@ -534,6 +546,9 @@ int InstrEmit_td(PPCHIRBuilder& f, const InstrData& i) {
|
|||
}
|
||||
|
||||
int InstrEmit_tdi(PPCHIRBuilder& f, const InstrData& i) {
|
||||
if (cvars::ignore_trap_instructions) {
|
||||
return 0;
|
||||
}
|
||||
// a <- (RA)
|
||||
// if (a < EXTS(SI)) & TO[0] then TRAP
|
||||
// if (a > EXTS(SI)) & TO[1] then TRAP
|
||||
|
@ -546,6 +561,9 @@ int InstrEmit_tdi(PPCHIRBuilder& f, const InstrData& i) {
|
|||
}
|
||||
|
||||
int InstrEmit_tw(PPCHIRBuilder& f, const InstrData& i) {
|
||||
if (cvars::ignore_trap_instructions) {
|
||||
return 0;
|
||||
}
|
||||
// a <- EXTS((RA)[32:63])
|
||||
// b <- EXTS((RB)[32:63])
|
||||
// if (a < b) & TO[0] then TRAP
|
||||
|
@ -561,6 +579,9 @@ int InstrEmit_tw(PPCHIRBuilder& f, const InstrData& i) {
|
|||
}
|
||||
|
||||
int InstrEmit_twi(PPCHIRBuilder& f, const InstrData& i) {
|
||||
if (cvars::ignore_trap_instructions) {
|
||||
return 0;
|
||||
}
|
||||
// a <- EXTS((RA)[32:63])
|
||||
// if (a < EXTS(SI)) & TO[0] then TRAP
|
||||
// if (a > EXTS(SI)) & TO[1] then TRAP
|
||||
|
@ -645,7 +666,9 @@ int InstrEmit_mfspr(PPCHIRBuilder& f, const InstrData& i) {
|
|||
break;
|
||||
case 256:
|
||||
// VRSAVE
|
||||
v = f.LoadZeroInt64();
|
||||
|
||||
v = f.ZeroExtend(f.LoadContext(offsetof(PPCContext, vrsave), INT32_TYPE),
|
||||
INT64_TYPE);
|
||||
break;
|
||||
case 268:
|
||||
// TB
|
||||
|
@ -749,6 +772,8 @@ int InstrEmit_mtspr(PPCHIRBuilder& f, const InstrData& i) {
|
|||
f.StoreCTR(rt);
|
||||
break;
|
||||
case 256:
|
||||
|
||||
f.StoreContext(offsetof(PPCContext, vrsave), f.Truncate(rt, INT32_TYPE));
|
||||
// VRSAVE
|
||||
break;
|
||||
default:
|
||||
|
@ -768,6 +793,7 @@ int InstrEmit_mfmsr(PPCHIRBuilder& f, const InstrData& i) {
|
|||
// bit 48 = EE; interrupt enabled
|
||||
// bit 62 = RI; recoverable interrupt
|
||||
// return 8000h if unlocked (interrupts enabled), else 0
|
||||
#if 0
|
||||
f.MemoryBarrier();
|
||||
if (cvars::disable_global_lock || true) {
|
||||
f.StoreGPR(i.X.RT, f.LoadConstantUint64(0));
|
||||
|
@ -777,63 +803,23 @@ int InstrEmit_mfmsr(PPCHIRBuilder& f, const InstrData& i) {
|
|||
f.StoreGPR(i.X.RT,
|
||||
f.LoadContext(offsetof(PPCContext, scratch), INT64_TYPE));
|
||||
}
|
||||
#else
|
||||
f.StoreGPR(i.X.RT, f.LoadConstantUint64(0));
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
int InstrEmit_mtmsr(PPCHIRBuilder& f, const InstrData& i) {
|
||||
if (i.X.RA & 0x01) {
|
||||
// L = 1
|
||||
// iff storing from r13
|
||||
f.MemoryBarrier();
|
||||
f.StoreContext(
|
||||
offsetof(PPCContext, scratch),
|
||||
f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE));
|
||||
#if 0
|
||||
if (i.X.RT == 13) {
|
||||
// iff storing from r13 we are taking a lock (disable interrupts).
|
||||
if (!cvars::disable_global_lock) {
|
||||
f.CallExtern(f.builtins()->enter_global_lock);
|
||||
}
|
||||
} else {
|
||||
// Otherwise we are restoring interrupts (probably).
|
||||
if (!cvars::disable_global_lock) {
|
||||
f.CallExtern(f.builtins()->leave_global_lock);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
} else {
|
||||
// L = 0
|
||||
XEINSTRNOTIMPLEMENTED();
|
||||
return 1;
|
||||
}
|
||||
f.StoreContext(
|
||||
offsetof(PPCContext, scratch),
|
||||
f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE));
|
||||
return 0;
|
||||
}
|
||||
|
||||
int InstrEmit_mtmsrd(PPCHIRBuilder& f, const InstrData& i) {
|
||||
if (i.X.RA & 0x01) {
|
||||
// L = 1
|
||||
f.MemoryBarrier();
|
||||
f.StoreContext(offsetof(PPCContext, scratch),
|
||||
f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE));
|
||||
#if 0
|
||||
if (i.X.RT == 13) {
|
||||
// iff storing from r13 we are taking a lock (disable interrupts).
|
||||
if (!cvars::disable_global_lock) {
|
||||
f.CallExtern(f.builtins()->enter_global_lock);
|
||||
}
|
||||
} else {
|
||||
// Otherwise we are restoring interrupts (probably).
|
||||
if (!cvars::disable_global_lock) {
|
||||
f.CallExtern(f.builtins()->leave_global_lock);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
} else {
|
||||
// L = 0
|
||||
XEINSTRNOTIMPLEMENTED();
|
||||
return 1;
|
||||
}
|
||||
f.StoreContext(offsetof(PPCContext, scratch),
|
||||
f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE));
|
||||
return 0;
|
||||
}
|
||||
|
||||
void RegisterEmitCategoryControl() {
@ -195,8 +195,8 @@ int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
|
||||
// frD <- -([frA x frC] + frB)
|
||||
Value* v = f.Neg(
|
||||
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
|
||||
Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
|
||||
f.LoadFPR(i.A.FRB));
|
||||
f.StoreFPR(i.A.FRT, v);
|
||||
f.UpdateFPSCR(v, i.A.Rc);
|
||||
return 0;
|
||||
|
@ -204,8 +204,8 @@ int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
|
||||
// frD <- -([frA x frC] + frB)
|
||||
Value* v = f.Neg(
|
||||
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
|
||||
Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
|
||||
f.LoadFPR(i.A.FRB));
|
||||
v = f.ToSingle(v);
|
||||
f.StoreFPR(i.A.FRT, v);
|
||||
f.UpdateFPSCR(v, i.A.Rc);
|
||||
|
@ -214,8 +214,8 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
|
||||
// frD <- -([frA x frC] - frB)
|
||||
Value* v = f.Neg(
|
||||
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
|
||||
Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
|
||||
f.LoadFPR(i.A.FRB));
|
||||
f.StoreFPR(i.A.FRT, v);
|
||||
f.UpdateFPSCR(v, i.A.Rc);
|
||||
return 0;
|
||||
|
@ -223,8 +223,8 @@ int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) {
|
||||
// frD <- -([frA x frC] - frB)
|
||||
Value* v = f.Neg(
|
||||
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
|
||||
Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
|
||||
f.LoadFPR(i.A.FRB));
|
||||
v = f.ToSingle(v);
|
||||
f.StoreFPR(i.A.FRT, v);
|
||||
f.UpdateFPSCR(v, i.A.Rc);
|
||||
|
|
|
@ -834,6 +834,7 @@ int InstrEmit_stdcx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
// Issue memory barrier for when we go out of lock and want others to see our
|
||||
// updates.
|
||||
|
||||
f.MemoryBarrier();
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -77,7 +77,8 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
|
|||
|
||||
// Allocate with 64b alignment.
|
||||
|
||||
context_ = reinterpret_cast<ppc::PPCContext*>(AllocateContext()); // memory::AlignedAlloc<ppc::PPCContext>(64);
|
||||
context_ = reinterpret_cast<ppc::PPCContext*>(
|
||||
AllocateContext());
|
||||
processor->backend()->InitializeBackendContext(context_);
|
||||
assert_true(((uint64_t)context_ & 0x3F) == 0);
|
||||
std::memset(context_, 0, sizeof(ppc::PPCContext));
|
||||
|
@ -93,6 +94,7 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
|
|||
// Set initial registers.
|
||||
context_->r[1] = stack_base;
|
||||
context_->r[13] = pcr_address;
|
||||
// fixme: VSCR must be set here!
|
||||
}
|
||||
|
||||
ThreadState::~ThreadState() {
|
||||
|
@ -105,7 +107,7 @@ ThreadState::~ThreadState() {
|
|||
if (context_) {
|
||||
FreeContext(reinterpret_cast<void*>(context_));
|
||||
}
|
||||
// memory::AlignedFree(context_);
|
||||
// memory::AlignedFree(context_);
|
||||
}
|
||||
|
||||
void ThreadState::Bind(ThreadState* thread_state) {
|
||||
|
|
|
@ -29,10 +29,20 @@
|
|||
#include "xenia/kernel/kernel_state.h"
|
||||
#include "xenia/kernel/user_module.h"
|
||||
|
||||
#if defined(NDEBUG)
|
||||
static constexpr bool should_log_unknown_reg_writes() { return false; }
|
||||
|
||||
#else
|
||||
|
||||
DEFINE_bool(log_unknown_register_writes, false,
|
||||
"Log writes to unknown registers from "
|
||||
"CommandProcessor::WriteRegister. Has significant performance hit.",
|
||||
"GPU");
|
||||
static bool should_log_unknown_reg_writes() {
|
||||
return cvars::log_unknown_register_writes;
|
||||
}
|
||||
#endif
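A minimal reproduction of the pattern above: in NDEBUG builds the predicate is a constexpr false, so the compiler removes the logging branch entirely, while debug builds consult the runtime flag. The flag name here is a stand-in for the cvar.

#include <cstdio>

#if defined(NDEBUG)
static constexpr bool verbose_logging_enabled() { return false; }
#else
static bool g_verbose_logging = false;  // stand-in for the runtime cvar
static bool verbose_logging_enabled() { return g_verbose_logging; }
#endif

void MaybeLog(unsigned index, unsigned value) {
  // In NDEBUG builds this condition folds to false and the call disappears.
  if (verbose_logging_enabled()) {
    std::printf("write %04X = %08X\n", index, value);
  }
}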
|
||||
|
||||
namespace xe {
|
||||
namespace gpu {
|
||||
|
||||
|
@ -465,7 +475,7 @@ void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
|
|||
}
|
||||
}
|
||||
void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
|
||||
if (XE_UNLIKELY(cvars::log_unknown_register_writes)) {
|
||||
if (should_log_unknown_reg_writes()) {
|
||||
// chrispy: rearrange check order, place set after checks
|
||||
if (XE_UNLIKELY(!register_file_->IsValidRegister(index))) {
|
||||
XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value);
|
||||
|
@ -493,15 +503,45 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
|
|||
// very unlikely. these ORS here are meant to be bitwise ors, so that we do
|
||||
// not do branching evaluation of the conditions (we will almost always take
|
||||
// all of the branches)
|
||||
if (XE_UNLIKELY(
|
||||
(index - XE_GPU_REG_SCRATCH_REG0 < 8) |
|
||||
(index == XE_GPU_REG_COHER_STATUS_HOST) |
|
||||
((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
|
||||
(XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)))) {
|
||||
|
||||
unsigned expr = (index - XE_GPU_REG_SCRATCH_REG0 < 8) |
|
||||
(index == XE_GPU_REG_COHER_STATUS_HOST) |
|
||||
((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
|
||||
(XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
|
||||
// chrispy: reordered for msvc branch probability (assumes if is taken and
|
||||
// else is not)
|
||||
if (XE_LIKELY(expr == 0)) {
|
||||
} else {
|
||||
HandleSpecialRegisterWrite(index, value);
|
||||
}
|
||||
}
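A small sketch of the bitwise-OR trick used above: with the bitwise | operator every range check is evaluated and combined without intermediate branches, leaving one easily predicted branch, instead of one branch per || clause. The register offsets below are placeholder values, not the real XE_GPU_REG constants.

#include <cstdint>

// Placeholder offsets for illustration only.
constexpr uint32_t kScratchBase = 0x0578;
constexpr uint32_t kCoherStatusHost = 0x07E8;
constexpr uint32_t kLutRwIndex = 0x6488;
constexpr uint32_t kLut30Color = 0x6490;

bool NeedsSpecialHandling(uint32_t index) {
  // Unsigned wraparound makes (index - base < n) a single range check, and
  // the bitwise ORs evaluate all three checks branch-free.
  unsigned expr = (index - kScratchBase < 8u) |
                  (index == kCoherStatusHost) |
                  ((index - kLutRwIndex) <= (kLut30Color - kLutRwIndex));
  return expr != 0;
}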
|
||||
void CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
|
||||
uint32_t* base,
|
||||
uint32_t num_registers) {
|
||||
for (uint32_t i = 0; i < num_registers; ++i) {
|
||||
uint32_t data = xe::load_and_swap<uint32_t>(base + i);
|
||||
this->WriteRegister(start_index + i, data);
|
||||
}
|
||||
}
|
||||
|
||||
void CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
|
||||
uint32_t base,
|
||||
uint32_t num_registers) {
|
||||
for (uint32_t i = 0; i < num_registers; ++i) {
|
||||
uint32_t data = ring->ReadAndSwap<uint32_t>();
|
||||
WriteRegister(base + i, data);
|
||||
}
|
||||
}
|
||||
|
||||
void CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring,
|
||||
uint32_t base,
|
||||
uint32_t num_times) {
|
||||
for (uint32_t m = 0; m < num_times; m++) {
|
||||
uint32_t reg_data = ring->ReadAndSwap<uint32_t>();
|
||||
uint32_t target_index = base;
|
||||
WriteRegister(target_index, reg_data);
|
||||
}
|
||||
}
|
||||
void CommandProcessor::MakeCoherent() {
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
|
||||
|
@ -623,15 +663,20 @@ void CommandProcessor::ExecutePacket(uint32_t ptr, uint32_t count) {
|
|||
}
|
||||
|
||||
bool CommandProcessor::ExecutePacket(RingBuffer* reader) {
|
||||
// Prefetch the wraparound range.
// It is likely already in L3 cache, but on a Zen system it may be in another
// chiplet's L3.
|
||||
reader->BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
|
||||
reader->read_count());
|
||||
const uint32_t packet = reader->ReadAndSwap<uint32_t>();
|
||||
const uint32_t packet_type = packet >> 30;
|
||||
if (packet == 0 || packet == 0x0BADF00D) {
|
||||
if (XE_UNLIKELY(packet == 0 || packet == 0x0BADF00D)) {
|
||||
trace_writer_.WritePacketStart(uint32_t(reader->read_ptr() - 4), 1);
|
||||
trace_writer_.WritePacketEnd();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (packet == 0xCDCDCDCD) {
|
||||
if (XE_UNLIKELY(packet == 0xCDCDCDCD)) {
|
||||
XELOGW("GPU packet is CDCDCDCD - probably read uninitialized memory!");
|
||||
}
|
||||
|
||||
|
@ -667,10 +712,10 @@ bool CommandProcessor::ExecutePacketType0(RingBuffer* reader, uint32_t packet) {
|
|||
|
||||
uint32_t base_index = (packet & 0x7FFF);
|
||||
uint32_t write_one_reg = (packet >> 15) & 0x1;
|
||||
for (uint32_t m = 0; m < count; m++) {
|
||||
uint32_t reg_data = reader->ReadAndSwap<uint32_t>();
|
||||
uint32_t target_index = write_one_reg ? base_index : base_index + m;
|
||||
WriteRegister(target_index, reg_data);
|
||||
if (write_one_reg) {
|
||||
WriteOneRegisterFromRing(reader, base_index, count);
|
||||
} else {
|
||||
WriteRegisterRangeFromRing(reader, base_index, count);
|
||||
}
|
||||
|
||||
trace_writer_.WritePacketEnd();
|
||||
|
@ -934,7 +979,7 @@ bool CommandProcessor::ExecutePacketType3_XE_SWAP(RingBuffer* reader,
|
|||
uint32_t count) {
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
|
||||
XELOGI("XE_SWAP");
|
||||
XELOGD("XE_SWAP");
|
||||
|
||||
Profiler::Flip();
|
||||
|
||||
|
@ -1467,10 +1512,9 @@ bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingBuffer* reader,
|
|||
reader->AdvanceRead((count - 1) * sizeof(uint32_t));
|
||||
return true;
|
||||
}
|
||||
for (uint32_t n = 0; n < count - 1; n++, index++) {
|
||||
uint32_t data = reader->ReadAndSwap<uint32_t>();
|
||||
WriteRegister(index, data);
|
||||
}
|
||||
|
||||
WriteRegisterRangeFromRing(reader, index, count - 1);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -1479,10 +1523,9 @@ bool CommandProcessor::ExecutePacketType3_SET_CONSTANT2(RingBuffer* reader,
|
|||
uint32_t count) {
|
||||
uint32_t offset_type = reader->ReadAndSwap<uint32_t>();
|
||||
uint32_t index = offset_type & 0xFFFF;
|
||||
for (uint32_t n = 0; n < count - 1; n++, index++) {
|
||||
uint32_t data = reader->ReadAndSwap<uint32_t>();
|
||||
WriteRegister(index, data);
|
||||
}
|
||||
|
||||
WriteRegisterRangeFromRing(reader, index, count - 1);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -1517,12 +1560,12 @@ bool CommandProcessor::ExecutePacketType3_LOAD_ALU_CONSTANT(RingBuffer* reader,
|
|||
assert_always();
|
||||
return true;
|
||||
}
|
||||
|
||||
trace_writer_.WriteMemoryRead(CpuToGpu(address), size_dwords * 4);
|
||||
for (uint32_t n = 0; n < size_dwords; n++, index++) {
|
||||
uint32_t data = xe::load_and_swap<uint32_t>(
|
||||
memory_->TranslatePhysical(address + n * 4));
|
||||
WriteRegister(index, data);
|
||||
}
|
||||
|
||||
WriteRegistersFromMem(index, (uint32_t*)memory_->TranslatePhysical(address),
|
||||
size_dwords);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -1530,10 +1573,9 @@ bool CommandProcessor::ExecutePacketType3_SET_SHADER_CONSTANTS(
|
|||
RingBuffer* reader, uint32_t packet, uint32_t count) {
|
||||
uint32_t offset_type = reader->ReadAndSwap<uint32_t>();
|
||||
uint32_t index = offset_type & 0xFFFF;
|
||||
for (uint32_t n = 0; n < count - 1; n++, index++) {
|
||||
uint32_t data = reader->ReadAndSwap<uint32_t>();
|
||||
WriteRegister(index, data);
|
||||
}
|
||||
|
||||
WriteRegisterRangeFromRing(reader, index, count - 1);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -153,8 +153,24 @@ class CommandProcessor {
|
|||
// rarely needed, most register writes have no special logic here
|
||||
XE_NOINLINE
|
||||
void HandleSpecialRegisterWrite(uint32_t index, uint32_t value);
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteRegister(uint32_t index, uint32_t value);
|
||||
|
||||
// mem has big-endian register values
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_registers);
|
||||
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteOneRegisterFromRing(
|
||||
xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t
|
||||
num_times); // repeatedly write a value to one register, presumably a
|
||||
// register with special handling for writes
|
||||
const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const {
|
||||
return gamma_ramp_256_entry_table_;
|
||||
}
|
||||
|
|
|
@ -1710,7 +1710,60 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
|
|||
}
|
||||
}
|
||||
}
|
||||
void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
|
||||
uint32_t* base,
|
||||
uint32_t num_registers) {
|
||||
for (uint32_t i = 0; i < num_registers; ++i) {
|
||||
uint32_t data = xe::load_and_swap<uint32_t>(base + i);
|
||||
D3D12CommandProcessor::WriteRegister(start_index + i, data);
|
||||
}
|
||||
}
|
||||
void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
|
||||
uint32_t base,
|
||||
uint32_t num_registers) {
|
||||
// we already brought it into L2 earlier
|
||||
RingBuffer::ReadRange range =
|
||||
ring->BeginPrefetchedRead<swcache::PrefetchTag::Level1>(num_registers *
|
||||
sizeof(uint32_t));
|
||||
|
||||
uint32_t num_regs_firstrange =
|
||||
static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
|
||||
|
||||
D3D12CommandProcessor::WriteRegistersFromMem(
|
||||
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
|
||||
num_regs_firstrange);
|
||||
if (range.second) {
|
||||
D3D12CommandProcessor::WriteRegistersFromMem(
|
||||
base + num_regs_firstrange,
|
||||
reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)),
|
||||
num_registers - num_regs_firstrange);
|
||||
}
|
||||
ring->EndRead(range);
|
||||
}
|
||||
void D3D12CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring,
|
||||
uint32_t base,
|
||||
uint32_t num_times) {
|
||||
auto read = ring->BeginPrefetchedRead<swcache::PrefetchTag::Level1>(
|
||||
num_times * sizeof(uint32_t));
|
||||
|
||||
uint32_t first_length = read.first_length / sizeof(uint32_t);
|
||||
|
||||
for (uint32_t i = 0; i < first_length; ++i) {
|
||||
D3D12CommandProcessor::WriteRegister(
|
||||
base, xe::load_and_swap<uint32_t>(read.first + (sizeof(uint32_t) * i)));
|
||||
}
|
||||
|
||||
if (read.second) {
|
||||
uint32_t second_length = read.second_length / sizeof(uint32_t);
|
||||
|
||||
for (uint32_t i = 0; i < second_length; ++i) {
|
||||
D3D12CommandProcessor::WriteRegister(
|
||||
base,
|
||||
xe::load_and_swap<uint32_t>(read.second + (sizeof(uint32_t) * i)));
|
||||
}
|
||||
}
|
||||
ring->EndRead(read);
|
||||
}
|
||||
void D3D12CommandProcessor::OnGammaRamp256EntryTableValueWritten() {
|
||||
gamma_ramp_256_entry_table_up_to_date_ = false;
|
||||
}
|
||||
|
|
|
@ -42,7 +42,7 @@ namespace xe {
|
|||
namespace gpu {
|
||||
namespace d3d12 {
|
||||
|
||||
class D3D12CommandProcessor : public CommandProcessor {
|
||||
class D3D12CommandProcessor final : public CommandProcessor {
|
||||
public:
|
||||
explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system,
|
||||
kernel::KernelState* kernel_state);
|
||||
|
@ -203,9 +203,17 @@ class D3D12CommandProcessor : public CommandProcessor {
|
|||
protected:
|
||||
bool SetupContext() override;
|
||||
void ShutdownContext() override;
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteRegister(uint32_t index, uint32_t value) override;
|
||||
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers) override;
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_registers) override;
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteOneRegisterFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_times) override;
|
||||
void OnGammaRamp256EntryTableValueWritten() override;
|
||||
void OnGammaRampPWLValueWritten() override;
|
||||
|
||||
|
|
|
@ -406,14 +406,16 @@ bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange(
|
|||
}
|
||||
|
||||
bool D3D12SharedMemory::UploadRanges(
|
||||
const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) {
|
||||
if (upload_page_ranges.empty()) {
|
||||
const std::pair<uint32_t, uint32_t>* upload_page_ranges, unsigned num_upload_page_ranges) {
|
||||
if (!num_upload_page_ranges) {
|
||||
return true;
|
||||
}
|
||||
CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST);
|
||||
command_processor_.SubmitBarriers();
|
||||
auto& command_list = command_processor_.GetDeferredCommandList();
|
||||
for (auto upload_range : upload_page_ranges) {
|
||||
//for (auto upload_range : upload_page_ranges) {
|
||||
for (unsigned int i = 0; i < num_upload_page_ranges; ++i) {
|
||||
auto& upload_range = upload_page_ranges[i];
|
||||
uint32_t upload_range_start = upload_range.first;
|
||||
uint32_t upload_range_length = upload_range.second;
|
||||
trace_writer_.WriteMemoryRead(upload_range_start << page_size_log2(),
|
||||
|
|
|
@ -91,8 +91,8 @@ class D3D12SharedMemory : public SharedMemory {
|
|||
bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
|
||||
uint32_t length_allocations) override;
|
||||
|
||||
bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>&
|
||||
upload_page_ranges) override;
|
||||
bool UploadRanges(const std::pair<uint32_t, uint32_t>*
|
||||
upload_page_ranges, unsigned num_ranges) override;
|
||||
|
||||
private:
|
||||
D3D12CommandProcessor& command_processor_;
|
||||
|
|
|
@ -31,8 +31,8 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
|
|||
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
#endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
|
||||
const uintmax_t* stream = command_stream_.data();
|
||||
size_t stream_remaining = command_stream_.size();
|
||||
const uintmax_t* stream = (const uintmax_t*)command_stream_.data();
|
||||
size_t stream_remaining = command_stream_.size() / sizeof(uintmax_t);
|
||||
ID3D12PipelineState* current_pipeline_state = nullptr;
|
||||
while (stream_remaining != 0) {
|
||||
const CommandHeader& header =
|
||||
|
@ -266,8 +266,12 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
|
|||
|
||||
void* DeferredCommandList::WriteCommand(Command command,
|
||||
size_t arguments_size_bytes) {
|
||||
|
||||
size_t arguments_size_elements =
|
||||
(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t);
|
||||
round_up(arguments_size_bytes, sizeof(uintmax_t), false);
|
||||
|
||||
//(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t);
|
||||
#if 0
|
||||
size_t offset = command_stream_.size();
|
||||
command_stream_.resize(offset + kCommandHeaderSizeElements +
|
||||
arguments_size_elements);
|
||||
|
@ -276,6 +280,19 @@ void* DeferredCommandList::WriteCommand(Command command,
|
|||
header.command = command;
|
||||
header.arguments_size_elements = uint32_t(arguments_size_elements);
|
||||
return command_stream_.data() + (offset + kCommandHeaderSizeElements);
|
||||
#else
|
||||
|
||||
size_t offset = command_stream_.size();
|
||||
constexpr size_t kCommandHeaderSizeBytes =
|
||||
kCommandHeaderSizeElements * sizeof(uintmax_t);
|
||||
command_stream_.resize(offset + kCommandHeaderSizeBytes +
|
||||
arguments_size_elements);
|
||||
CommandHeader& header =
|
||||
*reinterpret_cast<CommandHeader*>(command_stream_.data() + offset);
|
||||
header.command = command;
|
||||
header.arguments_size_elements = uint32_t(arguments_size_elements) / sizeof(uintmax_t);
|
||||
return command_stream_.data() + (offset + kCommandHeaderSizeBytes);
|
||||
#endif
|
||||
}
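For example, with sizeof(uintmax_t) == 8, a 20-byte argument payload is padded up to 24 bytes: the stream grows by kCommandHeaderSizeBytes + 24, and the header records 24 / 8 = 3 argument elements (despite its name, the arguments_size_elements local holds the padded byte count at that point).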
|
||||
|
||||
} // namespace d3d12
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
#include "xenia/base/literals.h"
|
||||
#include "xenia/base/math.h"
|
||||
#include "xenia/ui/d3d12/d3d12_api.h"
|
||||
|
||||
#include "xenia/base/memory.h"
|
||||
namespace xe {
|
||||
namespace gpu {
|
||||
namespace d3d12 {
|
||||
|
@ -30,8 +30,12 @@ class D3D12CommandProcessor;
|
|||
|
||||
class DeferredCommandList {
|
||||
public:
|
||||
static constexpr size_t MAX_SIZEOF_COMMANDLIST = 65536 * 128;  // 65536 * 128 bytes = 8 MiB
|
||||
/*
|
||||
chrispy: upped from 1_MiB to 4_MiB, m:durandal hits frequent resizes in large open maps
|
||||
*/
|
||||
DeferredCommandList(const D3D12CommandProcessor& command_processor,
|
||||
size_t initial_size_bytes = 1_MiB);
|
||||
size_t initial_size_bytes = MAX_SIZEOF_COMMANDLIST);
|
||||
|
||||
void Reset();
|
||||
void Execute(ID3D12GraphicsCommandList* command_list,
|
||||
|
@ -562,7 +566,8 @@ class DeferredCommandList {
|
|||
const D3D12CommandProcessor& command_processor_;
|
||||
|
||||
// uintmax_t to ensure uint64_t and pointer alignment of all structures.
|
||||
std::vector<uintmax_t> command_stream_;
|
||||
//std::vector<uintmax_t> command_stream_;
|
||||
FixedVMemVector<MAX_SIZEOF_COMMANDLIST> command_stream_;
|
||||
};
|
||||
|
||||
} // namespace d3d12
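Rough sketch of the fixed-reservation idea behind FixedVMemVector: the buffer's full capacity is backed once, so resize() never reallocates and never invalidates pointers into the stream, which is what removes the "frequent resizes" cost noted above. The real class is not shown in this diff (only data()/size()/resize() are used here), so the Win32-based implementation and the name FixedReservationVector below are illustrative assumptions; the actual class may commit pages lazily rather than up front.

#include <cstddef>
#include <cstdint>
#include <Windows.h>

template <size_t kMaxCapacityBytes>
class FixedReservationVector {
 public:
  FixedReservationVector()
      : data_(static_cast<uint8_t*>(
            VirtualAlloc(nullptr, kMaxCapacityBytes, MEM_RESERVE | MEM_COMMIT,
                         PAGE_READWRITE))),
        size_(0) {}
  ~FixedReservationVector() { VirtualFree(data_, 0, MEM_RELEASE); }

  uint8_t* data() { return data_; }
  size_t size() const { return size_; }
  // Growth is just a bounds check plus a size update; no reallocation ever
  // happens, so existing pointers into the buffer stay valid.
  void resize(size_t new_size) {
    if (new_size <= kMaxCapacityBytes) {
      size_ = new_size;
    }
  }
  void clear() { size_ = 0; }

 private:
  uint8_t* data_;
  size_t size_;
};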
|
||||
|
|
|
@ -868,7 +868,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
|||
xenos::kMaxResolveSize);
|
||||
y1 = y0 + int32_t(xenos::kMaxResolveSize);
|
||||
}
|
||||
|
||||
// fails in Forza Horizon 1
|
||||
assert_true(x0 < x1 && y0 < y1);
|
||||
if (x0 >= x1 || y0 >= y1) {
|
||||
XELOGE("Resolve region is empty");
|
||||
|
|
|
@ -883,7 +883,7 @@ class PrimitiveProcessor {
|
|||
// Must be called in a global critical region.
|
||||
void UpdateCacheBucketsNonEmptyL2(
|
||||
uint32_t bucket_index_div_64,
|
||||
[[maybe_unused]] const std::unique_lock<std::recursive_mutex>&
|
||||
[[maybe_unused]] const global_unique_lock_type&
|
||||
global_lock) {
|
||||
uint64_t& cache_buckets_non_empty_l2_ref =
|
||||
cache_buckets_non_empty_l2_[bucket_index_div_64 >> 6];
|
||||
|
|
|
@ -320,8 +320,7 @@ void Shader::GatherVertexFetchInformation(
|
|||
for (auto& vertex_binding : vertex_bindings_) {
|
||||
if (vertex_binding.fetch_constant == op.fetch_constant_index()) {
|
||||
// It may not hold that all strides are equal, but I hope it does.
|
||||
assert_true(!fetch_instr.attributes.stride ||
|
||||
vertex_binding.stride_words == fetch_instr.attributes.stride);
|
||||
|
||||
vertex_binding.attributes.push_back({});
|
||||
attrib = &vertex_binding.attributes.back();
|
||||
break;
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
#include "xenia/base/assert.h"
|
||||
#include "xenia/base/bit_range.h"
|
||||
#include "xenia/base/logging.h"
|
||||
#include "xenia/base/math.h"
|
||||
#include "xenia/base/memory.h"
|
||||
#include "xenia/base/profiling.h"
|
||||
|
@ -344,7 +345,7 @@ void SharedMemory::UnlinkWatchRange(WatchRange* range) {
|
|||
range->next_free = watch_range_first_free_;
|
||||
watch_range_first_free_ = range;
|
||||
}
|
||||
|
||||
// TODO: optimize; an enormous amount of CPU time (1.34%) is spent here.
|
||||
bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
|
||||
bool* any_data_resolved_out) {
|
||||
if (!length) {
|
||||
|
@ -364,14 +365,20 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
|
|||
return false;
|
||||
}
|
||||
|
||||
unsigned int current_upload_range = 0;
|
||||
uint32_t page_first = start >> page_size_log2_;
|
||||
uint32_t page_last = (start + length - 1) >> page_size_log2_;
|
||||
|
||||
upload_ranges_.clear();
|
||||
|
||||
std::pair<uint32_t, uint32_t>* uploads =
|
||||
reinterpret_cast<std::pair<uint32_t, uint32_t>*>(upload_ranges_.data());
|
||||
|
||||
bool any_data_resolved = false;
|
||||
uint32_t block_first = page_first >> 6;
|
||||
uint32_t block_last = page_last >> 6;
|
||||
uint32_t range_start = UINT32_MAX;
|
||||
|
||||
{
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
for (uint32_t i = block_first; i <= block_last; ++i) {
|
||||
|
@ -412,8 +419,13 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
|
|||
if (!xe::bit_scan_forward(block_valid_from_start, &block_page)) {
|
||||
break;
|
||||
}
|
||||
upload_ranges_.push_back(
|
||||
std::make_pair(range_start, (i << 6) + block_page - range_start));
|
||||
if (current_upload_range + 1 >= MAX_UPLOAD_RANGES) {
|
||||
xe::FatalError(
|
||||
"Hit max upload ranges in shared_memory.cc, tell a dev to "
|
||||
"raise the limit!");
|
||||
}
|
||||
uploads[current_upload_range++] =
|
||||
std::make_pair(range_start, (i << 6) + block_page - range_start);
|
||||
// In the next iteration within this block, consider this range valid
|
||||
// since it has been queued for upload.
|
||||
block_valid |= (uint64_t(1) << block_page) - 1;
|
||||
|
@ -423,17 +435,17 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
|
|||
}
|
||||
}
|
||||
if (range_start != UINT32_MAX) {
|
||||
upload_ranges_.push_back(
|
||||
std::make_pair(range_start, page_last + 1 - range_start));
|
||||
uploads[current_upload_range++] =
|
||||
(std::make_pair(range_start, page_last + 1 - range_start));
|
||||
}
|
||||
if (any_data_resolved_out) {
|
||||
*any_data_resolved_out = any_data_resolved;
|
||||
}
|
||||
if (upload_ranges_.empty()) {
|
||||
if (!current_upload_range) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return UploadRanges(upload_ranges_);
|
||||
return UploadRanges(uploads, current_upload_range);
|
||||
}
|
||||
|
||||
std::pair<uint32_t, uint32_t> SharedMemory::MemoryInvalidationCallbackThunk(
|
||||
|
|
|
@ -35,7 +35,7 @@ class SharedMemory {
|
|||
virtual void SetSystemPageBlocksValidWithGpuDataWritten();
|
||||
|
||||
typedef void (*GlobalWatchCallback)(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
|
||||
const global_unique_lock_type& global_lock, void* context,
|
||||
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
|
||||
typedef void* GlobalWatchHandle;
|
||||
// Registers a callback invoked when something is invalidated in the GPU
|
||||
|
@ -49,9 +49,9 @@ class SharedMemory {
|
|||
GlobalWatchHandle RegisterGlobalWatch(GlobalWatchCallback callback,
|
||||
void* callback_context);
|
||||
void UnregisterGlobalWatch(GlobalWatchHandle handle);
|
||||
typedef void (*WatchCallback)(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
|
||||
void* data, uint64_t argument, bool invalidated_by_gpu);
|
||||
typedef void (*WatchCallback)(const global_unique_lock_type& global_lock,
|
||||
void* context, void* data, uint64_t argument,
|
||||
bool invalidated_by_gpu);
|
||||
typedef void* WatchHandle;
|
||||
// Registers a callback invoked when the specified memory range is invalidated
|
||||
// in the GPU memory copy by the CPU or (if triggered explicitly - such as by
|
||||
|
@ -140,7 +140,8 @@ class SharedMemory {
|
|||
// ascending address order, so front and back can be used to determine the
|
||||
// overall bounds of pages to be uploaded.
|
||||
virtual bool UploadRanges(
|
||||
const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) = 0;
|
||||
const std::pair<uint32_t, uint32_t>* upload_page_ranges,
|
||||
unsigned num_upload_ranges) = 0;
|
||||
|
||||
const std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
|
||||
return trace_download_ranges_;
|
||||
|
@ -174,10 +175,13 @@ class SharedMemory {
|
|||
|
||||
void* memory_invalidation_callback_handle_ = nullptr;
|
||||
void* memory_data_provider_handle_ = nullptr;
|
||||
static constexpr unsigned int MAX_UPLOAD_RANGES = 65536;
|
||||
|
||||
// Ranges that need to be uploaded, generated by GetRangesToUpload (a
|
||||
// persistently allocated vector).
|
||||
std::vector<std::pair<uint32_t, uint32_t>> upload_ranges_;
|
||||
// std::vector<std::pair<uint32_t, uint32_t>> upload_ranges_;
|
||||
FixedVMemVector<MAX_UPLOAD_RANGES * sizeof(std::pair<uint32_t, uint32_t>)>
|
||||
upload_ranges_;
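For scale: with MAX_UPLOAD_RANGES = 65536 and sizeof(std::pair<uint32_t, uint32_t>) == 8, the upload-range scratch buffer is sized at 65536 * 8 = 512 KiB.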
|
||||
|
||||
// GPU-written memory downloading for traces. <Start address, length>.
|
||||
std::vector<std::pair<uint32_t, uint32_t>> trace_download_ranges_;
|
||||
|
|
|
@ -507,7 +507,7 @@ TextureCache::Texture::~Texture() {
|
|||
}
|
||||
|
||||
void TextureCache::Texture::MakeUpToDateAndWatch(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock) {
|
||||
const global_unique_lock_type& global_lock) {
|
||||
SharedMemory& shared_memory = texture_cache().shared_memory();
|
||||
if (base_outdated_) {
|
||||
assert_not_zero(GetGuestBaseSize());
|
||||
|
@ -552,7 +552,7 @@ void TextureCache::Texture::MarkAsUsed() {
|
|||
}
|
||||
|
||||
void TextureCache::Texture::WatchCallback(
|
||||
[[maybe_unused]] const std::unique_lock<std::recursive_mutex>& global_lock,
|
||||
[[maybe_unused]] const global_unique_lock_type& global_lock,
|
||||
bool is_mip) {
|
||||
if (is_mip) {
|
||||
assert_not_zero(GetGuestMipsSize());
|
||||
|
@ -565,8 +565,8 @@ void TextureCache::Texture::WatchCallback(
|
|||
}
|
||||
}
|
||||
|
||||
void TextureCache::WatchCallback(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
|
||||
void TextureCache::WatchCallback(const global_unique_lock_type& global_lock,
|
||||
void* context,
|
||||
void* data, uint64_t argument, bool invalidated_by_gpu) {
|
||||
Texture& texture = *static_cast<Texture*>(context);
|
||||
texture.WatchCallback(global_lock, argument != 0);
|
||||
|
@ -902,7 +902,7 @@ bool TextureCache::IsRangeScaledResolved(uint32_t start_unscaled,
|
|||
}
|
||||
|
||||
void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
|
||||
const global_unique_lock_type& global_lock, void* context,
|
||||
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
|
||||
TextureCache* texture_cache = reinterpret_cast<TextureCache*>(context);
|
||||
texture_cache->ScaledResolveGlobalWatchCallback(
|
||||
|
@ -910,7 +910,7 @@ void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
|
|||
}
|
||||
|
||||
void TextureCache::ScaledResolveGlobalWatchCallback(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock,
|
||||
const global_unique_lock_type& global_lock,
|
||||
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
|
||||
assert_true(IsDrawResolutionScaled());
|
||||
if (invalidated_by_gpu) {
|
||||
|
|
|
@ -230,19 +230,15 @@ class TextureCache {
|
|||
}
|
||||
bool IsResolved() const { return base_resolved_ || mips_resolved_; }
|
||||
|
||||
bool base_outdated(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock) const {
|
||||
bool base_outdated(const global_unique_lock_type& global_lock) const {
|
||||
return base_outdated_;
|
||||
}
|
||||
bool mips_outdated(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock) const {
|
||||
bool mips_outdated(const global_unique_lock_type& global_lock) const {
|
||||
return mips_outdated_;
|
||||
}
|
||||
void MakeUpToDateAndWatch(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock);
|
||||
void MakeUpToDateAndWatch(const global_unique_lock_type& global_lock);
|
||||
|
||||
void WatchCallback(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock, bool is_mip);
|
||||
void WatchCallback(const global_unique_lock_type& global_lock, bool is_mip);
|
||||
|
||||
// For LRU caching - updates the last usage frame and moves the texture to
|
||||
// the end of the usage queue. Must be called any time the texture is
|
||||
|
@ -579,8 +575,8 @@ class TextureCache {
|
|||
void UpdateTexturesTotalHostMemoryUsage(uint64_t add, uint64_t subtract);
|
||||
|
||||
// Shared memory callback for texture data invalidation.
|
||||
static void WatchCallback(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
|
||||
static void WatchCallback(const global_unique_lock_type& global_lock,
|
||||
void* context,
|
||||
void* data, uint64_t argument, bool invalidated_by_gpu);
|
||||
|
||||
// Checks if there are any pages that contain scaled resolve data within the
|
||||
|
@ -589,10 +585,10 @@ class TextureCache {
|
|||
// Global shared memory invalidation callback for invalidating scaled resolved
|
||||
// texture data.
|
||||
static void ScaledResolveGlobalWatchCallbackThunk(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
|
||||
const global_unique_lock_type& global_lock, void* context,
|
||||
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
|
||||
void ScaledResolveGlobalWatchCallback(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock,
|
||||
const global_unique_lock_type& global_lock,
|
||||
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
|
||||
|
||||
const RegisterFile& register_file_;
|
||||
|
|
|
@ -1157,7 +1157,14 @@ void VulkanCommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
void VulkanCommandProcessor::WriteRegistersFromMem(uint32_t start_index,
|
||||
uint32_t* base,
|
||||
uint32_t num_registers) {
|
||||
for (uint32_t i = 0; i < num_registers; ++i) {
|
||||
uint32_t data = xe::load_and_swap<uint32_t>(base + i);
|
||||
VulkanCommandProcessor::WriteRegister(start_index + i, data);
|
||||
}
|
||||
}
|
||||
void VulkanCommandProcessor::SparseBindBuffer(
|
||||
VkBuffer buffer, uint32_t bind_count, const VkSparseMemoryBind* binds,
|
||||
VkPipelineStageFlags wait_stage_mask) {
|
||||
|
|
|
@ -45,7 +45,7 @@ namespace xe {
|
|||
namespace gpu {
|
||||
namespace vulkan {
|
||||
|
||||
class VulkanCommandProcessor : public CommandProcessor {
|
||||
class VulkanCommandProcessor final : public CommandProcessor {
|
||||
public:
|
||||
// Single-descriptor layouts for use within a single frame.
|
||||
enum class SingleTransientDescriptorLayout {
|
||||
|
@ -259,8 +259,11 @@ class VulkanCommandProcessor : public CommandProcessor {
|
|||
protected:
|
||||
bool SetupContext() override;
|
||||
void ShutdownContext() override;
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteRegister(uint32_t index, uint32_t value) override;
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers) override;
|
||||
|
||||
void OnGammaRamp256EntryTableValueWritten() override;
|
||||
void OnGammaRampPWLValueWritten() override;
|
||||
|
|
|
@ -376,18 +376,21 @@ bool VulkanSharedMemory::AllocateSparseHostGpuMemoryRange(
|
|||
}
|
||||
|
||||
bool VulkanSharedMemory::UploadRanges(
|
||||
const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) {
|
||||
if (upload_page_ranges.empty()) {
|
||||
const std::pair<uint32_t, uint32_t>* upload_page_ranges,
|
||||
unsigned num_upload_ranges) {
|
||||
if (!num_upload_ranges) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto& range_front = upload_page_ranges[0];
|
||||
auto& range_back = upload_page_ranges[num_upload_ranges - 1];
|
||||
|
||||
// upload_page_ranges are sorted, use them to determine the range for the
|
||||
// ordering barrier.
|
||||
Use(Usage::kTransferDestination,
|
||||
std::make_pair(
|
||||
upload_page_ranges.front().first << page_size_log2(),
|
||||
(upload_page_ranges.back().first + upload_page_ranges.back().second -
|
||||
upload_page_ranges.front().first)
|
||||
<< page_size_log2()));
|
||||
std::make_pair(range_front.first << page_size_log2(),
|
||||
(range_back.first + range_back.second - range_front.first)
|
||||
<< page_size_log2()));
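For example, if the sorted ranges run from {16, 4} (pages 16-19) to {96, 2} (pages 96-97), the barrier starts at 16 << page_size_log2() with length (96 + 2 - 16) = 82 pages << page_size_log2(), i.e. it spans pages 16 through 97 inclusive.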
|
||||
command_processor_.SubmitBarriers(true);
|
||||
DeferredCommandBuffer& command_buffer =
|
||||
command_processor_.deferred_command_buffer();
|
||||
|
@ -395,9 +398,11 @@ bool VulkanSharedMemory::UploadRanges(
|
|||
bool successful = true;
|
||||
upload_regions_.clear();
|
||||
VkBuffer upload_buffer_previous = VK_NULL_HANDLE;
|
||||
for (auto upload_range : upload_page_ranges) {
|
||||
uint32_t upload_range_start = upload_range.first;
|
||||
uint32_t upload_range_length = upload_range.second;
|
||||
|
||||
// for (auto upload_range : upload_page_ranges) {
|
||||
for (unsigned int i = 0; i < num_upload_ranges; ++i) {
|
||||
uint32_t upload_range_start = upload_page_ranges[i].first;
|
||||
uint32_t upload_range_length = upload_page_ranges[i].second;
|
||||
trace_writer_.WriteMemoryRead(upload_range_start << page_size_log2(),
|
||||
upload_range_length << page_size_log2());
|
||||
while (upload_range_length) {
|
||||
|
|
|
@ -62,8 +62,8 @@ class VulkanSharedMemory : public SharedMemory {
|
|||
bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
|
||||
uint32_t length_allocations) override;
|
||||
|
||||
bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>&
|
||||
upload_page_ranges) override;
|
||||
bool UploadRanges(const std::pair<uint32_t, uint32_t>*
|
||||
upload_page_ranges, unsigned num_ranges) override;
|
||||
|
||||
private:
|
||||
void GetUsageMasks(Usage usage, VkPipelineStageFlags& stage_mask,
|
||||
|
|
|
@ -137,6 +137,8 @@ X_INPUT_VIBRATION InputSystem::ModifyVibrationLevel(
|
|||
modified_vibration.right_motor_speed = 0;
|
||||
return modified_vibration;
|
||||
}
|
||||
|
||||
std::unique_lock<xe_unlikely_mutex> InputSystem::lock() {
|
||||
return std::unique_lock<xe_unlikely_mutex>{lock_};
|
||||
}
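Callers are expected to hold the returned guard across any sequence of InputSystem calls; the XAM input exports later in this diff follow exactly this pattern:

  auto input_system = kernel_state()->emulator()->input_system();
  auto lock = input_system->lock();
  return input_system->GetState(user_index, input_state);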
|
||||
} // namespace hid
|
||||
} // namespace xe
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "xenia/base/mutex.h"
|
||||
#include "xenia/hid/input.h"
|
||||
#include "xenia/hid/input_driver.h"
|
||||
#include "xenia/xbox.h"
|
||||
|
@ -48,6 +48,8 @@ class InputSystem {
|
|||
void UpdateUsedSlot(uint8_t slot, bool connected);
|
||||
uint8_t GetConnectedSlots() const { return connected_slot; }
|
||||
|
||||
std::unique_lock<xe_unlikely_mutex> lock();
|
||||
|
||||
private:
|
||||
xe::ui::Window* window_ = nullptr;
|
||||
|
||||
|
@ -55,6 +57,7 @@ class InputSystem {
|
|||
|
||||
X_INPUT_VIBRATION ModifyVibrationLevel(X_INPUT_VIBRATION* vibration);
|
||||
uint8_t connected_slot = 0b0001;
|
||||
xe_unlikely_mutex lock_;
|
||||
};
|
||||
|
||||
} // namespace hid
|
||||
|
|
|
@ -14,9 +14,9 @@
|
|||
|
||||
#include <xinput.h> // NOLINT(build/include_order)
|
||||
|
||||
#include "xenia/base/clock.h"
|
||||
#include "xenia/base/logging.h"
|
||||
#include "xenia/hid/hid_flags.h"
|
||||
|
||||
namespace xe {
|
||||
namespace hid {
|
||||
namespace xinput {
|
||||
|
@ -81,13 +81,39 @@ X_STATUS XInputInputDriver::Setup() {
|
|||
}
|
||||
return X_STATUS_SUCCESS;
|
||||
}
|
||||
constexpr uint64_t SKIP_INVALID_CONTROLLER_TIME = 1100;
|
||||
static uint64_t last_invalid_time[4];
|
||||
|
||||
static DWORD should_skip(uint32_t user_index) {
|
||||
uint64_t time = last_invalid_time[user_index];
|
||||
if (time) {
|
||||
uint64_t deltatime = xe::Clock::QueryHostUptimeMillis() - time;
|
||||
|
||||
if (deltatime < SKIP_INVALID_CONTROLLER_TIME) {
|
||||
return ERROR_DEVICE_NOT_CONNECTED;
|
||||
}
|
||||
last_invalid_time[user_index] = 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void set_skip(uint32_t user_index) {
|
||||
last_invalid_time[user_index] = xe::Clock::QueryHostUptimeMillis();
|
||||
}
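These two helpers cache a failed controller query: once XInput reports ERROR_DEVICE_NOT_CONNECTED for a slot, further queries for that slot are short-circuited for SKIP_INVALID_CONTROLLER_TIME (1100 ms), presumably because polling a disconnected XInput device is expensive. Roughly:

  // t = 0 ms    : XInputGetState fails -> set_skip(user_index) records the time
  // t = 500 ms  : should_skip(user_index) != 0 -> return immediately, no XInput call
  // t = 1200 ms : should_skip(user_index) clears the entry and returns 0, so the
  //               real XInput query runs again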
|
||||
|
||||
X_RESULT XInputInputDriver::GetCapabilities(uint32_t user_index, uint32_t flags,
|
||||
X_INPUT_CAPABILITIES* out_caps) {
|
||||
DWORD skipper = should_skip(user_index);
|
||||
if (skipper) {
|
||||
return skipper;
|
||||
}
|
||||
XINPUT_CAPABILITIES native_caps;
|
||||
auto xigc = (decltype(&XInputGetCapabilities))XInputGetCapabilities_;
|
||||
DWORD result = xigc(user_index, flags, &native_caps);
|
||||
if (result) {
|
||||
if (result == ERROR_DEVICE_NOT_CONNECTED) {
|
||||
set_skip(user_index);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -110,10 +136,18 @@ X_RESULT XInputInputDriver::GetCapabilities(uint32_t user_index, uint32_t flags,
|
|||
|
||||
X_RESULT XInputInputDriver::GetState(uint32_t user_index,
|
||||
X_INPUT_STATE* out_state) {
|
||||
DWORD skipper = should_skip(user_index);
|
||||
if (skipper) {
|
||||
return skipper;
|
||||
}
|
||||
XINPUT_STATE native_state;
|
||||
auto xigs = (decltype(&XInputGetState))XInputGetState_;
|
||||
|
||||
DWORD result = xigs(user_index, &native_state);
|
||||
if (result) {
|
||||
if (result == ERROR_DEVICE_NOT_CONNECTED) {
|
||||
set_skip(user_index);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -131,11 +165,18 @@ X_RESULT XInputInputDriver::GetState(uint32_t user_index,
|
|||
|
||||
X_RESULT XInputInputDriver::SetState(uint32_t user_index,
|
||||
X_INPUT_VIBRATION* vibration) {
|
||||
DWORD skipper = should_skip(user_index);
|
||||
if (skipper) {
|
||||
return skipper;
|
||||
}
|
||||
XINPUT_VIBRATION native_vibration;
|
||||
native_vibration.wLeftMotorSpeed = vibration->left_motor_speed;
|
||||
native_vibration.wRightMotorSpeed = vibration->right_motor_speed;
|
||||
auto xiss = (decltype(&XInputSetState))XInputSetState_;
|
||||
DWORD result = xiss(user_index, &native_vibration);
|
||||
if (result == ERROR_DEVICE_NOT_CONNECTED) {
|
||||
set_skip(user_index);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
|
@ -948,7 +948,11 @@ bool KernelState::Restore(ByteStream* stream) {
|
|||
}
|
||||
|
||||
uint8_t KernelState::GetConnectedUsers() const {
|
||||
return emulator_->input_system()->GetConnectedSlots();
|
||||
auto input_sys = emulator_->input_system();
|
||||
|
||||
auto lock = input_sys->lock();
|
||||
|
||||
return input_sys->GetConnectedSlots();
|
||||
}
|
||||
|
||||
void KernelState::UpdateUsedUserProfiles() {
|
||||
|
|
|
@ -58,6 +58,7 @@ dword_result_t XamInputGetCapabilities_entry(
|
|||
}
|
||||
|
||||
auto input_system = kernel_state()->emulator()->input_system();
|
||||
auto lock = input_system->lock();
|
||||
return input_system->GetCapabilities(actual_user_index, flags, caps);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(XamInputGetCapabilities, kInput, kSketchy);
|
||||
|
@ -81,6 +82,7 @@ dword_result_t XamInputGetCapabilitiesEx_entry(
|
|||
}
|
||||
|
||||
auto input_system = kernel_state()->emulator()->input_system();
|
||||
auto lock = input_system->lock();
|
||||
return input_system->GetCapabilities(actual_user_index, flags, caps);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(XamInputGetCapabilitiesEx, kInput, kSketchy);
|
||||
|
@ -88,6 +90,13 @@ DECLARE_XAM_EXPORT1(XamInputGetCapabilitiesEx, kInput, kSketchy);
|
|||
// https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputgetstate(v=vs.85).aspx
|
||||
dword_result_t XamInputGetState_entry(dword_t user_index, dword_t flags,
|
||||
pointer_t<X_INPUT_STATE> input_state) {
|
||||
if (input_state) {
|
||||
memset((void*)input_state.host_address(), 0, sizeof(X_INPUT_STATE));
|
||||
}
|
||||
if (user_index >= 4) {
|
||||
return X_ERROR_DEVICE_NOT_CONNECTED;
|
||||
}
|
||||
|
||||
// Games call this with a NULL state ptr, probably as a query.
|
||||
|
||||
if ((flags & 0xFF) && (flags & XINPUT_FLAG_GAMEPAD) == 0) {
|
||||
|
@ -96,12 +105,14 @@ dword_result_t XamInputGetState_entry(dword_t user_index, dword_t flags,
|
|||
}
|
||||
|
||||
uint32_t actual_user_index = user_index;
|
||||
// chrispy: change this, logic is not right
|
||||
if ((actual_user_index & 0xFF) == 0xFF || (flags & XINPUT_FLAG_ANY_USER)) {
|
||||
// Always pin user to 0.
|
||||
actual_user_index = 0;
|
||||
}
|
||||
|
||||
auto input_system = kernel_state()->emulator()->input_system();
|
||||
auto lock = input_system->lock();
|
||||
return input_system->GetState(user_index, input_state);
|
||||
}
|
||||
DECLARE_XAM_EXPORT2(XamInputGetState, kInput, kImplemented, kHighFrequency);
|
||||
|
@ -109,6 +120,9 @@ DECLARE_XAM_EXPORT2(XamInputGetState, kInput, kImplemented, kHighFrequency);
|
|||
// https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputsetstate(v=vs.85).aspx
|
||||
dword_result_t XamInputSetState_entry(dword_t user_index, dword_t unk,
|
||||
pointer_t<X_INPUT_VIBRATION> vibration) {
|
||||
if (user_index >= 4) {
|
||||
return X_E_DEVICE_NOT_CONNECTED;
|
||||
}
|
||||
if (!vibration) {
|
||||
return X_ERROR_BAD_ARGUMENTS;
|
||||
}
|
||||
|
@ -120,6 +134,7 @@ dword_result_t XamInputSetState_entry(dword_t user_index, dword_t unk,
|
|||
}
|
||||
|
||||
auto input_system = kernel_state()->emulator()->input_system();
|
||||
auto lock = input_system->lock();
|
||||
return input_system->SetState(user_index, vibration);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(XamInputSetState, kInput, kImplemented);
|
||||
|
@ -147,6 +162,7 @@ dword_result_t XamInputGetKeystroke_entry(
|
|||
}
|
||||
|
||||
auto input_system = kernel_state()->emulator()->input_system();
|
||||
auto lock = input_system->lock();
|
||||
return input_system->GetKeystroke(user_index, flags, keystroke);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(XamInputGetKeystroke, kInput, kImplemented);
|
||||
|
@ -166,14 +182,15 @@ dword_result_t XamInputGetKeystrokeEx_entry(
|
|||
|
||||
uint32_t user_index = *user_index_ptr;
|
||||
auto input_system = kernel_state()->emulator()->input_system();
|
||||
|
||||
auto lock = input_system->lock();
|
||||
if ((user_index & 0xFF) == 0xFF) {
|
||||
// Always pin user to 0.
|
||||
user_index = 0;
|
||||
}
|
||||
|
||||
if (flags & XINPUT_FLAG_ANY_USER) {
|
||||
// That flag means we should iterate over every connected controller and check which one have pending request.
|
||||
// That flag means we should iterate over every connected controller and
|
||||
// check which one has a pending request.
|
||||
auto result = X_ERROR_DEVICE_NOT_CONNECTED;
|
||||
for (uint32_t i = 0; i < 4; i++) {
|
||||
auto result = input_system->GetKeystroke(i, flags, keystroke);
|
||||
|
@ -188,6 +205,7 @@ dword_result_t XamInputGetKeystrokeEx_entry(
|
|||
}
|
||||
|
||||
auto result = input_system->GetKeystroke(user_index, flags, keystroke);
|
||||
|
||||
if (XSUCCEEDED(result)) {
|
||||
*user_index_ptr = keystroke->user_index;
|
||||
}
|
||||
|
@ -202,7 +220,8 @@ X_HRESULT_result_t XamUserGetDeviceContext_entry(dword_t user_index,
|
|||
// If this function fails they assume zero, so let's fail AND
|
||||
// set zero just to be safe.
|
||||
*out_ptr = 0;
|
||||
if (kernel_state()->IsUserSignedIn(user_index) || (user_index & 0xFF) == 0xFF) {
|
||||
if (kernel_state()->IsUserSignedIn(user_index) ||
|
||||
(user_index & 0xFF) == 0xFF) {
|
||||
*out_ptr = (uint32_t)user_index;
|
||||
return X_E_SUCCESS;
|
||||
} else {
|
||||
|
|
|
@ -121,7 +121,8 @@ dword_result_t NtAllocateVirtualMemory_entry(lpdword_t base_addr_ptr,
|
|||
? -int32_t(region_size_ptr.value())
|
||||
: region_size_ptr.value();
|
||||
|
||||
adjusted_size = xe::round_up(adjusted_size, adjusted_base ? page_size : 64 * 1024);
|
||||
adjusted_size =
|
||||
xe::round_up(adjusted_size, adjusted_base ? page_size : 64 * 1024);
|
||||
|
||||
// Allocate.
|
||||
uint32_t allocation_type = 0;
|
||||
|
@ -295,10 +296,19 @@ struct X_MEMORY_BASIC_INFORMATION {
|
|||
be<uint32_t> protect;
|
||||
be<uint32_t> type;
|
||||
};
|
||||
|
||||
// chrispy: added region_type ? guessed name, haven't seen any value except 0 used
|
||||
dword_result_t NtQueryVirtualMemory_entry(
|
||||
dword_t base_address,
|
||||
pointer_t<X_MEMORY_BASIC_INFORMATION> memory_basic_information_ptr) {
|
||||
pointer_t<X_MEMORY_BASIC_INFORMATION> memory_basic_information_ptr,
|
||||
dword_t region_type) {
|
||||
switch (region_type) {
|
||||
case 0:
|
||||
case 1:
|
||||
case 2:
|
||||
break;
|
||||
default:
|
||||
return X_STATUS_INVALID_PARAMETER;
|
||||
}
|
||||
auto heap = kernel_state()->memory()->LookupHeap(base_address);
|
||||
HeapAllocationInfo alloc_info;
|
||||
if (heap == nullptr || !heap->QueryRegionInfo(base_address, &alloc_info)) {
|
||||
|
@ -373,8 +383,9 @@ dword_result_t MmAllocatePhysicalMemoryEx_entry(
|
|||
// min_addr_range/max_addr_range are bounds in physical memory, not virtual.
|
||||
uint32_t heap_base = heap->heap_base();
|
||||
uint32_t heap_physical_address_offset = heap->GetPhysicalAddress(heap_base);
|
||||
// TODO(Gliniak): Games like 545108B4 compares min_addr_range with value returned.
|
||||
// 0x1000 offset causes it to go below that minimal range and goes haywire
|
||||
// TODO(Gliniak): Games like 545108B4 compare min_addr_range with the value
|
||||
// returned. The 0x1000 offset causes it to go below that minimal range and go
|
||||
// haywire.
|
||||
if (min_addr_range && max_addr_range) {
|
||||
heap_physical_address_offset = 0;
|
||||
}
|
||||
|
|
|
@ -53,21 +53,20 @@ DECLARE_XBOXKRNL_EXPORT1(RtlCompareMemory, kMemory, kImplemented);
|
|||
// https://msdn.microsoft.com/en-us/library/ff552123
|
||||
dword_result_t RtlCompareMemoryUlong_entry(lpvoid_t source, dword_t length,
|
||||
dword_t pattern) {
|
||||
// Return 0 if source/length not aligned
|
||||
if (source.guest_address() % 4 || length % 4) {
|
||||
return 0;
|
||||
}
|
||||
uint32_t num_compared_bytes = 0;
|
||||
|
||||
uint32_t n = 0;
|
||||
for (uint32_t i = 0; i < (length / 4); i++) {
|
||||
// FIXME: This assumes as_array returns xe::be
|
||||
uint32_t val = source.as_array<uint32_t>()[i];
|
||||
if (val == pattern) {
|
||||
n++;
|
||||
uint32_t swapped_pattern = xe::byte_swap(pattern.value());
|
||||
|
||||
char* host_source = (char*)source.host_address();
|
||||
|
||||
for (uint32_t aligned_length = length & 0xFFFFFFFCU; aligned_length;
|
||||
num_compared_bytes += 4) {
|
||||
if (*(uint32_t*)(host_source + num_compared_bytes) != swapped_pattern) {
|
||||
break;
|
||||
}
|
||||
aligned_length = aligned_length - 4;
|
||||
}
|
||||
|
||||
return n;
|
||||
return num_compared_bytes;
|
||||
}
|
||||
DECLARE_XBOXKRNL_EXPORT1(RtlCompareMemoryUlong, kMemory, kImplemented);
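Worked example of the new behavior: the guest buffer is big-endian, so the comparison uses the byte-swapped pattern against host memory and stops at the first mismatching dword. For a guest buffer holding the dwords { 0x11223344, 0x11223344, 0xAABBCCDD } with length == 12 and pattern == 0x11223344, the first two dwords match and the third does not, so the function returns 8.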
|
||||
|
||||
|
@ -85,23 +84,61 @@ void RtlFillMemoryUlong_entry(lpvoid_t destination, dword_t length,
|
|||
}
|
||||
DECLARE_XBOXKRNL_EXPORT1(RtlFillMemoryUlong, kMemory, kImplemented);
|
||||
|
||||
dword_result_t RtlUpperChar_entry(dword_t in) {
|
||||
char c = in & 0xFF;
|
||||
if (c >= 'a' && c <= 'z') {
|
||||
return c ^ 0x20;
|
||||
}
|
||||
static constexpr const unsigned char rtl_lower_table[256] = {
|
||||
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB,
|
||||
0xC, 0xD, 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
|
||||
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23,
|
||||
0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
|
||||
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B,
|
||||
0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73,
|
||||
0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
|
||||
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B,
|
||||
0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83,
|
||||
0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
|
||||
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
|
||||
0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
|
||||
0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
|
||||
0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
|
||||
0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB,
|
||||
0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,
|
||||
0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,
|
||||
0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
||||
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,
|
||||
0xFC, 0xFD, 0xFE, 0xFF};
|
||||
|
||||
return c;
|
||||
static constexpr const unsigned char rtl_upper_table[256] = {
|
||||
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB,
|
||||
0xC, 0xD, 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
|
||||
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23,
|
||||
0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
|
||||
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B,
|
||||
0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
|
||||
0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53,
|
||||
0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
|
||||
0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B,
|
||||
0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
|
||||
0x58, 0x59, 0x5A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83,
|
||||
0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
|
||||
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
|
||||
0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
|
||||
0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
|
||||
0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
|
||||
0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB,
|
||||
0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
|
||||
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xC0, 0xC1, 0xC2, 0xC3,
|
||||
0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
|
||||
0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xF7, 0xD8, 0xD9, 0xDA, 0xDB,
|
||||
0xDC, 0xDD, 0xDE, 0x3F};
|
||||
|
||||
dword_result_t RtlUpperChar_entry(dword_t in) {
|
||||
return rtl_upper_table[in & 0xff];
|
||||
}
|
||||
DECLARE_XBOXKRNL_EXPORT1(RtlUpperChar, kNone, kImplemented);
|
||||
|
||||
dword_result_t RtlLowerChar_entry(dword_t in) {
|
||||
char c = in & 0xFF;
|
||||
if (c >= 'A' && c <= 'Z') {
|
||||
return c ^ 0x20;
|
||||
}
|
||||
|
||||
return c;
|
||||
return rtl_lower_table[in & 0xff];
|
||||
}
|
||||
DECLARE_XBOXKRNL_EXPORT1(RtlLowerChar, kNone, kImplemented);
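A few sample lookups from the tables above (which appear to mirror the Win32 RTL Latin-1 behavior): RtlUpperChar(0x61 'a') == 0x41 'A', RtlUpperChar(0xE0 'à') == 0xC0 'À', RtlUpperChar(0xF7 '÷') stays 0xF7, RtlUpperChar(0xFF 'ÿ') == 0x3F '?', and RtlLowerChar(0xD7 '×') stays 0xD7.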
|
||||
|
||||
|
|
|
@ -473,7 +473,8 @@ void VdSwap_entry(
|
|||
dwords[i] = xenos::MakePacketType2();
|
||||
}
|
||||
}
|
||||
DECLARE_XBOXKRNL_EXPORT2(VdSwap, kVideo, kImplemented, kImportant);
|
||||
DECLARE_XBOXKRNL_EXPORT3(VdSwap, kVideo, kImplemented, kHighFrequency,
|
||||
kImportant);
|
||||
|
||||
void RegisterVideoExports(xe::cpu::ExportResolver* export_resolver,
|
||||
KernelState* kernel_state) {
|
||||
|
|
|
@ -465,7 +465,7 @@ cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) {
|
|||
}
|
||||
|
||||
bool Memory::AccessViolationCallback(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
global_unique_lock_type global_lock_locked_once,
|
||||
void* host_address, bool is_write) {
|
||||
// Access via physical_membase_ is special, when need to bypass everything
|
||||
// (for instance, for a data provider to actually write the data) so only
|
||||
|
@ -493,14 +493,14 @@ bool Memory::AccessViolationCallback(
|
|||
}
|
||||
|
||||
bool Memory::AccessViolationCallbackThunk(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
global_unique_lock_type global_lock_locked_once,
|
||||
void* context, void* host_address, bool is_write) {
|
||||
return reinterpret_cast<Memory*>(context)->AccessViolationCallback(
|
||||
std::move(global_lock_locked_once), host_address, is_write);
|
||||
}
|
||||
|
||||
bool Memory::TriggerPhysicalMemoryCallbacks(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
global_unique_lock_type global_lock_locked_once,
|
||||
uint32_t virtual_address, uint32_t length, bool is_write,
|
||||
bool unwatch_exact_range, bool unprotect) {
|
||||
BaseHeap* heap = LookupHeap(virtual_address);
|
||||
|
@ -1711,7 +1711,7 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
|
|||
}
|
||||
|
||||
bool PhysicalHeap::TriggerCallbacks(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
global_unique_lock_type global_lock_locked_once,
|
||||
uint32_t virtual_address, uint32_t length, bool is_write,
|
||||
bool unwatch_exact_range, bool unprotect) {
|
||||
// TODO(Triang3l): Support read watches.
|
||||
|
|
|
@ -271,8 +271,7 @@ class PhysicalHeap : public BaseHeap {
|
|||
bool enable_invalidation_notifications,
|
||||
bool enable_data_providers);
|
||||
// Returns true if any page in the range was watched.
|
||||
bool TriggerCallbacks(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
bool TriggerCallbacks(global_unique_lock_type global_lock_locked_once,
|
||||
uint32_t virtual_address, uint32_t length, bool is_write,
|
||||
bool unwatch_exact_range, bool unprotect = true);
|
||||
|
||||
|
@ -459,7 +458,7 @@ class Memory {
|
|||
// TODO(Triang3l): Implement data providers - this is why locking depth of 1
|
||||
// will be required in the future.
|
||||
bool TriggerPhysicalMemoryCallbacks(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
global_unique_lock_type global_lock_locked_once,
|
||||
uint32_t virtual_address, uint32_t length, bool is_write,
|
||||
bool unwatch_exact_range, bool unprotect = true);
|
||||
|
||||
|
@ -508,11 +507,10 @@ class Memory {
|
|||
static uint32_t HostToGuestVirtualThunk(const void* context,
|
||||
const void* host_address);
|
||||
|
||||
bool AccessViolationCallback(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
bool AccessViolationCallback(global_unique_lock_type global_lock_locked_once,
|
||||
void* host_address, bool is_write);
|
||||
static bool AccessViolationCallbackThunk(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
global_unique_lock_type global_lock_locked_once,
|
||||
void* context, void* host_address, bool is_write);
|
||||
|
||||
std::filesystem::path file_name_;
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 15ece0882e8d5875051ff5b73c5a8326f7cee9f5
|
||||
Subproject commit a437fe6d8efef17c8ad33d39f5815032e7adf5d7
|
|
@ -1 +1 @@
|
|||
Subproject commit 5787e9cb7551c8c79cf9ce14f7be860dc907e9a4
|
||||
Subproject commit 302b6e03e829c6d6a70415f10d818a5088cb6ccf
|