Squash reallocs in command buffers by using a large preallocated buffer, backed directly by virtual memory so the OS allocates pages on demand.

Mark the raw clock functions as noinline: the way MSVC was inlining them and ordering the branches meant that rdtsc would often be speculatively executed.
Add an alternative clock implementation for Windows: instead of using QueryPerformanceCounter we grab SystemTime from KUSER_SHARED_DATA. It does not have the same precision as QueryPerformanceCounter (we only have 100-nanosecond precision), but we round to milliseconds, so it never made sense to use the performance counter in the first place.
stubbed out the "guest clock mutex"... (the entirety of clock.cc needs a rewrite)
added some helpers for minf/maxf without the nan handling behavior
This commit is contained in:
chss95cs@gmail.com 2022-08-14 13:42:08 -07:00
parent c9b2d10e17
commit 7cc364dcb8
11 changed files with 263 additions and 38 deletions

View File

@ -15,6 +15,13 @@
#include "xenia/base/assert.h"
#include "xenia/base/math.h"
#include "xenia/base/mutex.h"
#if defined(_WIN32)
#include "xenia/base/platform_win.h"
#endif
DEFINE_bool(clock_no_scaling, false,
"Disable scaling code. Time management and locking is bypassed. "
@ -42,8 +49,19 @@ std::pair<uint64_t, uint64_t> guest_tick_ratio_ = std::make_pair(1, 1);
uint64_t last_guest_tick_count_ = 0;
// Last sampled host tick count.
uint64_t last_host_tick_count_ = Clock::QueryHostTickCount();
// No-op stand-in for a mutex. It offers the lock()/unlock()/try_lock() member
// set that std::lock_guard and std::unique_lock call, but performs no
// synchronization at all; try_lock() always reports success.
struct null_lock {
  static bool try_lock() { return true; }
  static void lock() {}
  static void unlock() {}
};
using tick_mutex_type = null_lock; // xe::xe_mutex;
// Mutex to ensure last_host_tick_count_ and last_guest_tick_count_ are in sync
std::mutex tick_mutex_;
// std::mutex tick_mutex_;
static tick_mutex_type tick_mutex_;
void RecomputeGuestTickScalar() {
// Create a rational number with numerator (first) and denominator (second)
@ -61,7 +79,7 @@ void RecomputeGuestTickScalar() {
// Keep this a rational calculation and reduce the fraction
reduce_fraction(frac);
std::lock_guard<std::mutex> lock(tick_mutex_);
std::lock_guard<tick_mutex_type> lock(tick_mutex_);
guest_tick_ratio_ = frac;
}
@ -75,7 +93,7 @@ uint64_t UpdateGuestClock() {
return host_tick_count * guest_tick_ratio_.first / guest_tick_ratio_.second;
}
std::unique_lock<std::mutex> lock(tick_mutex_, std::defer_lock);
std::unique_lock<tick_mutex_type> lock(tick_mutex_, std::defer_lock);
if (lock.try_lock()) {
// Translate host tick count to guest tick count.
uint64_t host_tick_delta = host_tick_count > last_host_tick_count_
@ -107,7 +125,6 @@ inline uint64_t QueryGuestSystemTimeOffset() {
return guest_tick_count * numerator / denominator;
}
uint64_t Clock::QueryHostTickFrequency() {
#if XE_CLOCK_RAW_AVAILABLE
if (cvars::clock_source_raw) {
@ -137,7 +154,7 @@ void Clock::set_guest_time_scalar(double scalar) {
}
// Returns a copy of the current guest tick ratio as a
// (numerator, denominator) pair.
std::pair<uint64_t, uint64_t> Clock::guest_tick_ratio() {
  // Take tick_mutex_ through its declared type (tick_mutex_type) so the guard
  // always matches whatever lock implementation is currently selected.
  std::lock_guard<tick_mutex_type> lock(tick_mutex_);
  return guest_tick_ratio_;
}

View File

@ -33,11 +33,15 @@ class Clock {
// Either from platform supplied time source or from hardware directly.
static uint64_t host_tick_frequency_platform();
#if XE_CLOCK_RAW_AVAILABLE
XE_NOINLINE
static uint64_t host_tick_frequency_raw();
#endif
// Host tick count. Generally QueryHostTickCount() should be used.
static uint64_t host_tick_count_platform();
#if XE_CLOCK_RAW_AVAILABLE
// chrispy: the way MSVC was ordering the branches caused rdtsc to be
// speculatively executed each time the branch history was lost.
XE_NOINLINE
static uint64_t host_tick_count_raw();
#endif

View File

@ -12,7 +12,18 @@
#include "xenia/base/platform_win.h"
namespace xe {
#if XE_USE_KUSER_SHARED==1
// Tick frequency of the KUSER_SHARED-based platform clock: a fixed
// 10,000,000 ticks per second (one tick per 100 ns).
uint64_t Clock::host_tick_frequency_platform() {
  constexpr uint64_t kTicksPerSecond = 10000000ULL;
  return kTicksPerSecond;
}
// Returns the platform tick count by reading the SystemTime field of the
// shared user data page as one 64-bit value (no system call involved).
// NOTE(review): this is the same field QueryHostSystemTime() returns, i.e.
// presumably wall-clock time rather than a monotonic counter — confirm that
// callers only rely on tick deltas and tolerate clock adjustments.
uint64_t Clock::host_tick_count_platform() {
  volatile uint64_t* system_time =
      reinterpret_cast<volatile uint64_t*>(&KUserShared()->SystemTime);
  return *system_time;
}
// Returns the host system time by reading the SystemTime field of the shared
// user data page as one 64-bit value, instead of calling into the OS.
uint64_t Clock::QueryHostSystemTime() {
  volatile uint64_t* system_time =
      reinterpret_cast<volatile uint64_t*>(&KUserShared()->SystemTime);
  return *system_time;
}
#else
uint64_t Clock::host_tick_frequency_platform() {
LARGE_INTEGER frequency;
QueryPerformanceFrequency(&frequency);
@ -27,7 +38,6 @@ uint64_t Clock::host_tick_count_platform() {
}
return time;
}
uint64_t Clock::QueryHostSystemTime() {
FILETIME t;
GetSystemTimeAsFileTime(&t);
@ -37,5 +47,10 @@ uint64_t Clock::QueryHostSystemTime() {
#endif
// Converts the platform tick count to milliseconds.
//
// Defined exactly once, outside the XE_USE_KUSER_SHARED conditional, because
// it depends only on host_tick_count_platform() and
// host_tick_frequency_platform(), which both branches provide.
//
// NOTE(review): with the KUSER_SHARED implementation the tick count is the
// system time, so the result is presumably not "time since boot" — confirm
// what callers expect.
uint64_t Clock::QueryHostUptimeMillis() {
  const uint64_t ticks = host_tick_count_platform();
  const uint64_t frequency = host_tick_frequency_platform();
  // Same value as ticks * 1000 / frequency, but computed as whole seconds
  // plus remainder so the intermediate product stays within 64 bits even for
  // very large tick counts (e.g. a 100 ns count measured from 1601).
  const uint64_t whole_seconds = ticks / frequency;
  const uint64_t leftover_ticks = ticks % frequency;
  return whole_seconds * 1000 + leftover_ticks * 1000 / frequency;
}
} // namespace xe

View File

@ -41,10 +41,14 @@
"\n" \
"Set the cvar 'clock_source_raw' to 'false'.");
namespace xe {
// Getting the TSC frequency can be a bit tricky. This method here only works on
// Intel as it seems. There is no easy way to get the frequency outside of ring0
// on AMD, so we fail gracefully if not possible.
XE_NOINLINE
uint64_t Clock::host_tick_frequency_raw() {
uint32_t eax, ebx, ecx, edx;
@ -71,6 +75,8 @@ uint64_t Clock::host_tick_frequency_raw() {
return 0;
}
if (max_cpuid >= 0x15) {
// 15H Get TSC/Crystal ratio and Crystal Hz.
xe_cpu_cpuid(0x15, eax, ebx, ecx, edx);
@ -92,10 +98,11 @@ uint64_t Clock::host_tick_frequency_raw() {
return cpu_base_freq;
}
CLOCK_FATAL("The clock frequency could not be determined.");
return 0;
}
// Returns the raw host tick count as produced by xe_cpu_rdtsc().
// Marked XE_NOINLINE so the read stays in its own out-of-line function.
XE_NOINLINE
uint64_t Clock::host_tick_count_raw() {
  const uint64_t tsc = xe_cpu_rdtsc();
  return tsc;
}
} // namespace xe

View File

@ -376,6 +376,29 @@ template <int N>
int64_t m128_i64(const __m128& v) {
return m128_i64<N>(_mm_castps_pd(v));
}
/*
std::min/std::max on floats have defined NaN behavior: if either argument is
NaN, the first argument is returned.
minss/maxss differ: if either argument is NaN, the second operand of the
instruction is returned.
This is problematic because the compiler gives us no assurance about the
operand ordering, so only use these helpers where NaN handling is not needed.
*/
// Scalar float minimum via _mm_min_ss. Do not rely on the result when either
// input is NaN.
static float xe_minf(float x, float y) {
  const __m128 lhs = _mm_set_ss(x);
  const __m128 rhs = _mm_set_ss(y);
  return _mm_cvtss_f32(_mm_min_ss(lhs, rhs));
}
// Scalar float maximum via _mm_max_ss. Do not rely on the result when either
// input is NaN.
static float xe_maxf(float x, float y) {
  const __m128 lhs = _mm_set_ss(x);
  const __m128 rhs = _mm_set_ss(y);
  return _mm_cvtss_f32(_mm_max_ss(lhs, rhs));
}
// Reciprocal of den computed with _mm_rcp_ss. This is the hardware
// approximation, not an exact 1.0f / den division.
static float xe_rcpf(float den) {
  const __m128 denominator = _mm_set_ss(den);
  return _mm_cvtss_f32(_mm_rcp_ss(denominator));
}
#else
// Portable fallback: same selection rule as std::min (y only when y < x, so x
// is returned if either value is NaN).
static float xe_minf(float x, float y) { return (y < x) ? y : x; }
// Portable fallback: same selection rule as std::max (y only when x < y, so x
// is returned if either value is NaN).
static float xe_maxf(float x, float y) { return (x < y) ? y : x; }
// Portable fallback: exact single-precision reciprocal of den.
static float xe_rcpf(float den) {
  const float one = 1.0f;
  return one / den;
}
#endif
// Similar to the C++ implementation of XMConvertFloatToHalf and

View File

@ -478,12 +478,13 @@ class fixed_vmem_vector {
public:
fixed_vmem_vector()
: data_((uint8_t*)AllocFixed(nullptr, sz, AllocationType::kReserveCommit,
PageAccess::kReadWrite)),
: data_((uint8_t*)memory::AllocFixed(
nullptr, sz, memory::AllocationType::kReserveCommit,
memory::PageAccess::kReadWrite)),
nbytes_(0) {}
~fixed_vmem_vector() {
if (data_) {
DeallocFixed(data_, sz, DeallocationType::kRelease);
memory::DeallocFixed(data_, sz, memory::DeallocationType::kRelease);
data_ = nullptr;
}
nbytes_ = 0;

View File

@ -34,31 +34,169 @@
#undef DeleteFile
#undef GetFirstChild
#define XE_USE_NTDLL_FUNCTIONS 1
#if XE_USE_NTDLL_FUNCTIONS==1
#define XE_USE_NTDLL_FUNCTIONS 1
#define XE_USE_KUSER_SHARED 1
#if XE_USE_NTDLL_FUNCTIONS == 1
/*
ntdll versions of functions often skip through a lot of extra garbage in KernelBase
ntdll versions of functions often skip through a lot of extra garbage in
KernelBase
*/
#define XE_NTDLL_IMPORT(name, cls, clsvar) \
static class cls { \
public: \
FARPROC fn;\
cls() : fn(nullptr) {\
auto ntdll = GetModuleHandleA("ntdll.dll");\
if (ntdll) { \
fn = GetProcAddress(ntdll, #name );\
}\
} \
template <typename TRet = void, typename... TArgs> \
inline TRet invoke(TArgs... args) {\
return reinterpret_cast<NTSYSAPI TRet(NTAPI*)(TArgs...)>(fn)(args...);\
}\
inline operator bool() const {\
return fn!=nullptr;\
}\
#define XE_NTDLL_IMPORT(name, cls, clsvar) \
static class cls { \
public: \
FARPROC fn; \
cls() : fn(nullptr) { \
auto ntdll = GetModuleHandleA("ntdll.dll"); \
if (ntdll) { \
fn = GetProcAddress(ntdll, #name); \
} \
} \
template <typename TRet = void, typename... TArgs> \
inline TRet invoke(TArgs... args) { \
return reinterpret_cast<NTSYSAPI TRet(NTAPI*)(TArgs...)>(fn)(args...); \
} \
inline operator bool() const { return fn != nullptr; } \
} clsvar
#else
#define XE_NTDLL_IMPORT(name, cls, clsvar) static constexpr bool clsvar = false
#endif
// KUSER_SHARED
// Three-part, 4-byte-aligned time value used for the InterruptTime,
// SystemTime and TimeZoneBias fields of _KUSER_SHARED_DATA.
// NOTE(review): High1Time and High2Time presumably both hold the upper 32
// bits so a reader can detect a torn update — confirm against the Windows
// documentation.
struct __declspec(align(4)) _KSYSTEM_TIME {
unsigned int LowPart;
int High1Time;
int High2Time;
};
// Product type values stored in _KUSER_SHARED_DATA::NtProductType.
enum _NT_PRODUCT_TYPE {
  NtProductWinNt = 1,
  NtProductLanManNt = 2,
  NtProductServer = 3,
};
// Values stored in _KUSER_SHARED_DATA::AlternativeArchitecture.
enum _ALTERNATIVE_ARCHITECTURE_TYPE {
  StandardDesign = 0,
  NEC98x86 = 1,
  EndAlternatives = 2,
};
#pragma pack(push, 1)
// Helper structs/unions for members of _KUSER_SHARED_DATA that overlay a
// plain integer with a bitfield-style view. They are declared under
// #pragma pack(push, 1), so their sizes come only from the listed members and
// the explicit __declspec(align) attributes.
// NOTE(review): the $<hex> tag names look machine-generated (e.g. exported
// from debug symbols) — confirm the origin before renaming anything.

// One-byte storage unit backing the MitigationPolicies view.
struct $3D940D5D03EF7F98CEE6737EDE752E57 {
__int8 _bf_0;
};
// MitigationPolicies as a raw byte or as the storage unit above.
union $DA7A7E727E24E4DD62317E27558CCADA {
unsigned __int8 MitigationPolicies;
$3D940D5D03EF7F98CEE6737EDE752E57 __s1;
};
// Four-byte storage unit backing the SharedDataFlags view.
struct __declspec(align(4)) $4BF4056B39611650D41923F164DAFA52 {
__int32 _bf_0;
};
// SharedDataFlags as a raw 32-bit value or as the storage unit above.
union __declspec(align(4)) $BB68545E345A5F8046EF3BC0FE928142 {
unsigned int SharedDataFlags;
$4BF4056B39611650D41923F164DAFA52 __s1;
};
// Tick count viewed as a _KSYSTEM_TIME, a single 64-bit value, or three
// 32-bit words.
union $5031D289C483414B89DA3F368D1FE62C {
volatile _KSYSTEM_TIME TickCount;
volatile unsigned __int64 TickCountQuad;
unsigned int ReservedTickCountOverlay[3];
};
// The two byte-sized QPC settings that make up QpcData.
struct $F91ACE6F13277DFC9425B9B8BBCB30F7 {
volatile unsigned __int8 QpcBypassEnabled;
unsigned __int8 QpcShift;
};
// QpcData as a raw 16-bit value or as the two bytes above.
union __declspec(align(2)) $3C927F8BB7EAEE13CF0CFC3E60EDC8A9 {
unsigned __int16 QpcData;
$F91ACE6F13277DFC9425B9B8BBCB30F7 __s1;
};
// Local declaration of the layout of the Windows shared user data page that
// KUserShared() returns a pointer to. It is declared under
// #pragma pack(push, 1), and the static_assert following the struct pins its
// total size to 1808 bytes, so members must not be added, removed, reordered
// or resized.
struct __declspec(align(8)) _KUSER_SHARED_DATA {
unsigned int TickCountLowDeprecated;
unsigned int TickCountMultiplier;
// Byte offset 0x08 (after the two 4-byte fields above).
volatile _KSYSTEM_TIME InterruptTime;
// Byte offset 0x14. Read as a raw 64-bit value by the XE_USE_KUSER_SHARED
// clock implementation.
volatile _KSYSTEM_TIME SystemTime;
volatile _KSYSTEM_TIME TimeZoneBias;
unsigned __int16 ImageNumberLow;
unsigned __int16 ImageNumberHigh;
wchar_t NtSystemRoot[260];
unsigned int MaxStackTraceDepth;
unsigned int CryptoExponent;
unsigned int TimeZoneId;
unsigned int LargePageMinimum;
unsigned int AitSamplingValue;
unsigned int AppCompatFlag;
unsigned __int64 RNGSeedVersion;
unsigned int GlobalValidationRunlevel;
volatile int TimeZoneBiasStamp;
unsigned int NtBuildNumber;
_NT_PRODUCT_TYPE NtProductType;
unsigned __int8 ProductTypeIsValid;
unsigned __int8 Reserved0[1];
unsigned __int16 NativeProcessorArchitecture;
unsigned int NtMajorVersion;
unsigned int NtMinorVersion;
unsigned __int8 ProcessorFeatures[64];
unsigned int Reserved1;
unsigned int Reserved3;
volatile unsigned int TimeSlip;
_ALTERNATIVE_ARCHITECTURE_TYPE AlternativeArchitecture;
unsigned int BootId;
_LARGE_INTEGER SystemExpirationDate;
unsigned int SuiteMask;
unsigned __int8 KdDebuggerEnabled;
// MitigationPolicies overlay.
$DA7A7E727E24E4DD62317E27558CCADA ___u33;
unsigned __int8 Reserved6[2];
volatile unsigned int ActiveConsoleId;
volatile unsigned int DismountCount;
unsigned int ComPlusPackage;
unsigned int LastSystemRITEventTickCount;
unsigned int NumberOfPhysicalPages;
unsigned __int8 SafeBootMode;
unsigned __int8 VirtualizationFlags;
unsigned __int8 Reserved12[2];
// SharedDataFlags overlay.
$BB68545E345A5F8046EF3BC0FE928142 ___u43;
unsigned int DataFlagsPad[1];
unsigned __int64 TestRetInstruction;
__int64 QpcFrequency;
unsigned int SystemCall;
unsigned int SystemCallPad0;
unsigned __int64 SystemCallPad[2];
// TickCount / TickCountQuad overlay.
$5031D289C483414B89DA3F368D1FE62C ___u50;
unsigned int TickCountPad[1];
unsigned int Cookie;
unsigned int CookiePad[1];
__int64 ConsoleSessionForegroundProcessId;
unsigned __int64 TimeUpdateLock;
unsigned __int64 BaselineSystemTimeQpc;
unsigned __int64 BaselineInterruptTimeQpc;
unsigned __int64 QpcSystemTimeIncrement;
unsigned __int64 QpcInterruptTimeIncrement;
unsigned __int8 QpcSystemTimeIncrementShift;
unsigned __int8 QpcInterruptTimeIncrementShift;
unsigned __int16 UnparkedProcessorCount;
unsigned int EnclaveFeatureMask[4];
unsigned int TelemetryCoverageRound;
unsigned __int16 UserModeGlobalLogger[16];
unsigned int ImageFileExecutionOptions;
unsigned int LangGenerationCount;
unsigned __int64 Reserved4;
volatile unsigned __int64 InterruptTimeBias;
volatile unsigned __int64 QpcBias;
unsigned int ActiveProcessorCount;
volatile unsigned __int8 ActiveGroupCount;
unsigned __int8 Reserved9;
// QpcData overlay (QpcBypassEnabled / QpcShift).
$3C927F8BB7EAEE13CF0CFC3E60EDC8A9 ___u74;
_LARGE_INTEGER TimeZoneBiasEffectiveStart;
_LARGE_INTEGER TimeZoneBiasEffectiveEnd;
_XSTATE_CONFIGURATION XState;
};
// Size in bytes of the local _KUSER_SHARED_DATA declaration. The assert
// guards against accidental layout changes (a member added, removed or
// resized, or a packing change) that would shift the fields read through
// KUserShared().
static constexpr unsigned KUSER_SIZE = sizeof(_KUSER_SHARED_DATA);
static_assert(KUSER_SIZE == 1808,
              "_KUSER_SHARED_DATA layout changed: sizeof must be 1808 bytes");
#pragma pack(pop)
// Returns a pointer to the shared user data page, viewed through the local
// _KUSER_SHARED_DATA layout. The address is the fixed constant 0x7FFE0000.
// NOTE(review): presumably the page is read-only from user mode, so treat the
// result as read-only — confirm.
static _KUSER_SHARED_DATA* KUserShared() {
  return reinterpret_cast<_KUSER_SHARED_DATA*>(0x7FFE0000);
}
#endif // XENIA_BASE_PLATFORM_WIN_H_

View File

@ -148,6 +148,7 @@ bool SetTlsValue(TlsHandle handle, uintptr_t value);
// be kept short or else all timers will be impacted. This is a simplified
// wrapper around QueueTimerRecurring which automatically cancels the timer on
// destruction.
//only used by XboxkrnlModule::XboxkrnlModule
class HighResolutionTimer {
HighResolutionTimer(std::chrono::milliseconds interval,
std::function<void()> callback) {

View File

@ -205,7 +205,7 @@ void TimerQueueWaitItem::Disarm() {
spinner.spin_once();
}
}
//unused
std::weak_ptr<WaitItem> QueueTimerOnce(std::function<void(void*)> callback,
void* userdata,
WaitItem::clock::time_point due) {
@ -213,7 +213,7 @@ std::weak_ptr<WaitItem> QueueTimerOnce(std::function<void(void*)> callback,
std::make_shared<WaitItem>(std::move(callback), userdata, &timer_queue_,
due, WaitItem::clock::duration::zero()));
}
// only used by HighResolutionTimer
std::weak_ptr<WaitItem> QueueTimerRecurring(
std::function<void(void*)> callback, void* userdata,
WaitItem::clock::time_point due, WaitItem::clock::duration interval) {

View File

@ -31,8 +31,8 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
const uintmax_t* stream = command_stream_.data();
size_t stream_remaining = command_stream_.size();
const uintmax_t* stream = (const uintmax_t*)command_stream_.data();
size_t stream_remaining = command_stream_.size() / sizeof(uintmax_t);
ID3D12PipelineState* current_pipeline_state = nullptr;
while (stream_remaining != 0) {
const CommandHeader& header =
@ -266,8 +266,12 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
void* DeferredCommandList::WriteCommand(Command command,
size_t arguments_size_bytes) {
size_t arguments_size_elements =
(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t);
round_up(arguments_size_bytes, sizeof(uintmax_t), false);
//(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t);
#if 0
size_t offset = command_stream_.size();
command_stream_.resize(offset + kCommandHeaderSizeElements +
arguments_size_elements);
@ -276,6 +280,19 @@ void* DeferredCommandList::WriteCommand(Command command,
header.command = command;
header.arguments_size_elements = uint32_t(arguments_size_elements);
return command_stream_.data() + (offset + kCommandHeaderSizeElements);
#else
size_t offset = command_stream_.size();
constexpr size_t kCommandHeaderSizeBytes =
kCommandHeaderSizeElements * sizeof(uintmax_t);
command_stream_.resize(offset + kCommandHeaderSizeBytes +
arguments_size_elements);
CommandHeader& header =
*reinterpret_cast<CommandHeader*>(command_stream_.data() + offset);
header.command = command;
header.arguments_size_elements = uint32_t(arguments_size_elements) / sizeof(uintmax_t);
return command_stream_.data() + (offset + kCommandHeaderSizeBytes);
#endif
}
} // namespace d3d12

View File

@ -19,7 +19,7 @@
#include "xenia/base/literals.h"
#include "xenia/base/math.h"
#include "xenia/ui/d3d12/d3d12_api.h"
#include "xenia/base/memory.h"
namespace xe {
namespace gpu {
namespace d3d12 {
@ -30,11 +30,12 @@ class D3D12CommandProcessor;
class DeferredCommandList {
public:
static constexpr size_t MAX_SIZEOF_COMMANDLIST = 65536 * 128; //around 8 mb
/*
chrispy: upped from 1_MiB to 4_MiB, m:durandal hits frequent resizes in large open maps
*/
DeferredCommandList(const D3D12CommandProcessor& command_processor,
size_t initial_size_bytes = 4_MiB);
size_t initial_size_bytes = MAX_SIZEOF_COMMANDLIST);
void Reset();
void Execute(ID3D12GraphicsCommandList* command_list,
@ -565,7 +566,8 @@ class DeferredCommandList {
const D3D12CommandProcessor& command_processor_;
// uintmax_t to ensure uint64_t and pointer alignment of all structures.
std::vector<uintmax_t> command_stream_;
//std::vector<uintmax_t> command_stream_;
fixed_vmem_vector<MAX_SIZEOF_COMMANDLIST> command_stream_;
};
} // namespace d3d12