From cde092ece173db2b06a000afbc43dcbe06386b60 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sat, 21 Mar 2020 19:21:00 +0300 Subject: [PATCH 1/5] [D3D12] Persistent shader and PSO storage --- src/xenia/app/emulator_window.cc | 18 +- src/xenia/app/emulator_window.h | 2 + src/xenia/app/xenia_main.cc | 7 +- src/xenia/base/filesystem.h | 11 + src/xenia/base/filesystem_posix.cc | 25 + src/xenia/base/filesystem_win.cc | 27 + src/xenia/emulator.cc | 11 +- src/xenia/emulator.h | 8 +- src/xenia/gpu/command_processor.cc | 4 + src/xenia/gpu/command_processor.h | 6 + .../gpu/d3d12/d3d12_command_processor.cc | 8 +- src/xenia/gpu/d3d12/d3d12_command_processor.h | 12 +- src/xenia/gpu/d3d12/deferred_command_list.cc | 5 +- src/xenia/gpu/d3d12/pipeline_cache.cc | 973 +++++++++++++++--- src/xenia/gpu/d3d12/pipeline_cache.h | 150 ++- src/xenia/gpu/gpu_flags.cc | 6 +- src/xenia/gpu/graphics_system.cc | 34 + src/xenia/gpu/graphics_system.h | 4 + src/xenia/gpu/trace_dump.cc | 2 +- src/xenia/gpu/trace_viewer.cc | 2 +- 20 files changed, 1112 insertions(+), 203 deletions(-) diff --git a/src/xenia/app/emulator_window.cc b/src/xenia/app/emulator_window.cc index 19dc370df..ae00cdfaf 100644 --- a/src/xenia/app/emulator_window.cc +++ b/src/xenia/app/emulator_window.cc @@ -245,9 +245,9 @@ bool EmulatorWindow::Initialize() { } gpu_menu->AddChild(MenuItem::Create(MenuItem::Type::kSeparator)); { - gpu_menu->AddChild( - MenuItem::Create(MenuItem::Type::kString, L"&Clear Caches", L"F5", - std::bind(&EmulatorWindow::GpuClearCaches, this))); + gpu_menu->AddChild(MenuItem::Create( + MenuItem::Type::kString, L"&Clear Runtime Caches", L"F5", + std::bind(&EmulatorWindow::GpuClearCaches, this))); } main_menu->AddChild(std::move(gpu_menu)); @@ -454,8 +454,20 @@ void EmulatorWindow::UpdateTitle() { title += xe::format_string(L" (@%.2fx)", Clock::guest_time_scalar()); } + if (initializing_shader_storage_) { + title += L" (Preloading shaders\u2026)"; + } + window_->set_title(title); } +void 
EmulatorWindow::SetInitializingShaderStorage(bool initializing) { + if (initializing_shader_storage_ == initializing) { + return; + } + initializing_shader_storage_ = initializing; + UpdateTitle(); +} + } // namespace app } // namespace xe diff --git a/src/xenia/app/emulator_window.h b/src/xenia/app/emulator_window.h index 3afb1e631..6d2c999b1 100644 --- a/src/xenia/app/emulator_window.h +++ b/src/xenia/app/emulator_window.h @@ -37,6 +37,7 @@ class EmulatorWindow { void UpdateTitle(); void ToggleFullscreen(); + void SetInitializingShaderStorage(bool initializing); private: explicit EmulatorWindow(Emulator* emulator); @@ -63,6 +64,7 @@ class EmulatorWindow { std::unique_ptr window_; std::wstring base_title_; uint64_t cursor_hide_time_ = 0; + bool initializing_shader_storage_ = false; }; } // namespace app diff --git a/src/xenia/app/xenia_main.cc b/src/xenia/app/xenia_main.cc index 492345d6a..6f4d85595 100644 --- a/src/xenia/app/xenia_main.cc +++ b/src/xenia/app/xenia_main.cc @@ -243,7 +243,7 @@ int xenia_main(const std::vector& args) { } // Create the emulator but don't initialize so we can setup the window. - auto emulator = std::make_unique(L"", content_root); + auto emulator = std::make_unique(L"", storage_root, content_root); // Main emulator display window. auto emulator_window = EmulatorWindow::Create(emulator.get()); @@ -331,6 +331,11 @@ int xenia_main(const std::vector& args) { evt->Set(); }); + emulator->on_shader_storage_initialization.AddListener( + [&](bool initializing) { + emulator_window->SetInitializingShaderStorage(initializing); + }); + emulator->on_terminate.AddListener([&]() { if (cvars::discord) { discord::DiscordPresence::NotPlaying(); diff --git a/src/xenia/base/filesystem.h b/src/xenia/base/filesystem.h index d627796e7..7a1cdee32 100644 --- a/src/xenia/base/filesystem.h +++ b/src/xenia/base/filesystem.h @@ -58,6 +58,17 @@ bool CreateFile(const std::wstring& path); // This behaves like fopen and the returned handle can be used with stdio. 
FILE* OpenFile(const std::wstring& path, const char* mode); +// Wrapper for the 64-bit version of fseek, returns true on success. +bool Seek(FILE* file, int64_t offset, int origin); + +// Wrapper for the 64-bit version of ftell, returns a positive value on success. +int64_t Tell(FILE* file); + +// Reduces the size of a stdio file opened for writing. The file pointer is +// clamped. If this returns false, the size of the file and the file pointer are +// undefined. +bool TruncateStdioFile(FILE* file, uint64_t length); + // Deletes the file at the given path. // Returns true if the file was found and removed. bool DeleteFile(const std::wstring& path); diff --git a/src/xenia/base/filesystem_posix.cc b/src/xenia/base/filesystem_posix.cc index f0ca77bf4..619af97ac 100644 --- a/src/xenia/base/filesystem_posix.cc +++ b/src/xenia/base/filesystem_posix.cc @@ -76,6 +76,31 @@ FILE* OpenFile(const std::wstring& path, const char* mode) { return fopen(xe::to_string(fixed_path).c_str(), mode); } +bool Seek(FILE* file, int64_t offset, int origin) { + return fseeko64(file, off64_t(offset), origin) == 0; +} + +int64_t Tell(FILE* file) { return int64_t(ftello64(file)); } + +bool TruncateStdioFile(FILE* file, uint64_t length) { + if (fflush(file)) { + return false; + } + int64_t position = Tell(file); + if (position < 0) { + return false; + } + if (ftruncate64(fileno(file), off64_t(length))) { + return false; + } + if (uint64_t(position) > length) { + if (!Seek(file, 0, SEEK_END)) { + return false; + } + } + return true; +} + bool CreateFolder(const std::wstring& path) { return mkdir(xe::to_string(path).c_str(), 0774); } diff --git a/src/xenia/base/filesystem_win.cc b/src/xenia/base/filesystem_win.cc index 02ff006f4..e38d9c226 100644 --- a/src/xenia/base/filesystem_win.cc +++ b/src/xenia/base/filesystem_win.cc @@ -12,6 +12,7 @@ #include +#include #include #include "xenia/base/platform_win.h" @@ -87,6 +88,32 @@ FILE* OpenFile(const std::wstring& path, const char* mode) { return 
_wfopen(fixed_path.c_str(), xe::to_wstring(mode).c_str()); } +bool Seek(FILE* file, int64_t offset, int origin) { + return _fseeki64(file, offset, origin) == 0; +} + +int64_t Tell(FILE* file) { return _ftelli64(file); } + +bool TruncateStdioFile(FILE* file, uint64_t length) { + // Flush is necessary - if not flushing, stream position may be out of sync. + if (fflush(file)) { + return false; + } + int64_t position = Tell(file); + if (position < 0) { + return false; + } + if (_chsize_s(_fileno(file), int64_t(length))) { + return false; + } + if (uint64_t(position) > length) { + if (!Seek(file, 0, SEEK_END)) { + return false; + } + } + return true; +} + bool DeleteFile(const std::wstring& path) { return DeleteFileW(path.c_str()) ? true : false; } diff --git a/src/xenia/emulator.cc b/src/xenia/emulator.cc index 6e2767dd3..9f33b784c 100644 --- a/src/xenia/emulator.cc +++ b/src/xenia/emulator.cc @@ -57,11 +57,13 @@ DEFINE_string( namespace xe { Emulator::Emulator(const std::wstring& command_line, + const std::wstring& storage_root, const std::wstring& content_root) : on_launch(), on_terminate(), on_exit(), command_line_(command_line), + storage_root_(storage_root), content_root_(content_root), game_title_(), display_window_(nullptr), @@ -685,11 +687,18 @@ X_STATUS Emulator::CompleteLaunch(const std::wstring& path, } } + // Initializing the shader storage in a blocking way so the user doesn't miss + // the initial seconds - for instance, sound from an intro video may start + // playing before the video can be seen if doing this in parallel with the + // main thread. 
+ on_shader_storage_initialization(true); + graphics_system_->InitializeShaderStorage(storage_root_, title_id_, true); + on_shader_storage_initialization(false); + auto main_thread = kernel_state_->LaunchModule(module); if (!main_thread) { return X_STATUS_UNSUCCESSFUL; } - main_thread_ = main_thread; on_launch(title_id_, game_title_); diff --git a/src/xenia/emulator.h b/src/xenia/emulator.h index b9db4a31a..f973b6220 100644 --- a/src/xenia/emulator.h +++ b/src/xenia/emulator.h @@ -48,13 +48,17 @@ namespace xe { class Emulator { public: explicit Emulator(const std::wstring& command_line, + const std::wstring& storage_root, const std::wstring& content_root); ~Emulator(); // Full command line used when launching the process. const std::wstring& command_line() const { return command_line_; } - // Folder content is stored in. + // Folder persistent internal emulator data is stored in. + const std::wstring& storage_root() const { return storage_root_; } + + // Folder guest content is stored in. const std::wstring& content_root() const { return content_root_; } // Title of the game in the default language. 
@@ -146,6 +150,7 @@ class Emulator { public: xe::Delegate on_launch; + xe::Delegate on_shader_storage_initialization; xe::Delegate<> on_terminate; xe::Delegate<> on_exit; @@ -159,6 +164,7 @@ class Emulator { const std::string& module_path); std::wstring command_line_; + std::wstring storage_root_; std::wstring content_root_; std::wstring game_title_; diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index 8cdb7f331..6ceddfea3 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -87,6 +87,10 @@ void CommandProcessor::Shutdown() { worker_thread_.reset(); } +void CommandProcessor::InitializeShaderStorage(const std::wstring& storage_root, + uint32_t title_id, + bool blocking) {} + void CommandProcessor::RequestFrameTrace(const std::wstring& root_path) { if (trace_state_ == TraceState::kStreaming) { XELOGE("Streaming trace; cannot also trace frame."); diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h index 53120ae4b..1be5345f6 100644 --- a/src/xenia/gpu/command_processor.h +++ b/src/xenia/gpu/command_processor.h @@ -130,6 +130,12 @@ class CommandProcessor { swap_request_handler_ = fn; } + // May be called not only from the command processor thread when the command + // processor is paused, and the termination of this function may be explicitly + // awaited. 
+ virtual void InitializeShaderStorage(const std::wstring& storage_root, + uint32_t title_id, bool blocking); + virtual void RequestFrameTrace(const std::wstring& root_path); virtual void BeginTracing(const std::wstring& root_path); virtual void EndTracing(); diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 705480281..9dd8600ce 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -77,6 +77,12 @@ void D3D12CommandProcessor::ClearCaches() { cache_clear_requested_ = true; } +void D3D12CommandProcessor::InitializeShaderStorage( + const std::wstring& storage_root, uint32_t title_id, bool blocking) { + CommandProcessor::InitializeShaderStorage(storage_root, title_id, blocking); + pipeline_cache_->InitializeShaderStorage(storage_root, title_id, blocking); +} + void D3D12CommandProcessor::RequestFrameTrace(const std::wstring& root_path) { // Capture with PIX if attached. 
if (GetD3D12Context()->GetD3D12Provider()->GetGraphicsAnalysis() != nullptr) { @@ -2123,7 +2129,7 @@ bool D3D12CommandProcessor::EndSubmission(bool is_swap) { } bool D3D12CommandProcessor::CanEndSubmissionImmediately() const { - return !submission_open_ || !pipeline_cache_->IsCreatingPipelines(); + return !submission_open_ || !pipeline_cache_->IsCreatingPipelineStates(); } void D3D12CommandProcessor::AwaitAllSubmissionsCompletion() { diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 08fcf7510..3a203bee7 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -43,6 +43,9 @@ class D3D12CommandProcessor : public CommandProcessor { void ClearCaches() override; + void InitializeShaderStorage(const std::wstring& storage_root, + uint32_t title_id, bool blocking) override; + void RequestFrameTrace(const std::wstring& root_path) override; void TracePlaybackWroteMemory(uint32_t base_ptr, uint32_t length) override; @@ -125,10 +128,11 @@ class D3D12CommandProcessor : public CommandProcessor { // render targets or copying to depth render targets. void SetSamplePositions(MsaaSamples sample_positions); - // Returns a pipeline with deferred creation by its handle. May return nullptr - // if failed to create the pipeline. - inline ID3D12PipelineState* GetPipelineStateByHandle(void* handle) const { - return pipeline_cache_->GetPipelineStateByHandle(handle); + // Returns a pipeline state object with deferred creation by its handle. May + // return nullptr if failed to create the pipeline state object. + inline ID3D12PipelineState* GetD3D12PipelineStateByHandle( + void* handle) const { + return pipeline_cache_->GetD3D12PipelineStateByHandle(handle); } // Sets the current pipeline state to a compute pipeline. 
This is for cache diff --git a/src/xenia/gpu/d3d12/deferred_command_list.cc b/src/xenia/gpu/d3d12/deferred_command_list.cc index bdebaf3c9..20a655581 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.cc +++ b/src/xenia/gpu/d3d12/deferred_command_list.cc @@ -200,8 +200,9 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list, } } break; case Command::kSetPipelineStateHandle: { - current_pipeline_state = command_processor_->GetPipelineStateByHandle( - *reinterpret_cast(stream)); + current_pipeline_state = + command_processor_->GetD3D12PipelineStateByHandle( + *reinterpret_cast(stream)); if (current_pipeline_state) { command_list->SetPipelineState(current_pipeline_state); } diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index ba3314007..8bd5b894e 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -10,15 +10,21 @@ #include "xenia/gpu/d3d12/pipeline_cache.h" #include +#include #include #include #include +#include +#include #include #include "third_party/xxhash/xxhash.h" #include "xenia/base/assert.h" +#include "xenia/base/byte_order.h" +#include "xenia/base/clock.h" #include "xenia/base/cvar.h" +#include "xenia/base/filesystem.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/base/profiling.h" @@ -30,10 +36,10 @@ DEFINE_bool(d3d12_dxbc_disasm, false, "Disassemble DXBC shaders after generation.", "D3D12"); DEFINE_int32( d3d12_pipeline_creation_threads, -1, - "Number of threads used for graphics pipeline state creation. -1 to " - "calculate automatically (75% of logical CPU cores), 1-16 to specify the " - "number of threads explicitly, 0 to disable multithreaded pipeline state " - "creation.", + "Number of threads used for graphics pipeline state object creation. 
-1 to " + "calculate automatically (75% of logical CPU cores), a positive number to " + "specify the number of threads explicitly (up to the number of logical CPU " + "cores), 0 to disable multithreaded pipeline state object creation.", "D3D12"); DEFINE_bool( d3d12_tessellation_adaptive, false, @@ -62,6 +68,8 @@ namespace d3d12 { #include "xenia/gpu/d3d12/shaders/dxbc/tessellation_quad_vs.h" #include "xenia/gpu/d3d12/shaders/dxbc/tessellation_triangle_vs.h" +constexpr uint32_t PipelineCache::PipelineDescription::kVersion; + PipelineCache::PipelineCache(D3D12CommandProcessor* command_processor, RegisterFile* register_file, bool edram_rov_used, uint32_t resolution_scale) @@ -84,24 +92,33 @@ PipelineCache::PipelineCache(D3D12CommandProcessor* command_processor, PipelineCache::~PipelineCache() { Shutdown(); } bool PipelineCache::Initialize() { + uint32_t logical_processor_count = xe::threading::logical_processor_count(); + if (!logical_processor_count) { + // Pick some reasonable amount if couldn't determine the number of cores. + logical_processor_count = 6; + } + // Initialize creation thread synchronization data even if not using creation + // threads because they may be used anyway to create pipeline state objects + // from the storage. 
+ creation_threads_busy_ = 0; + creation_completion_event_ = + xe::threading::Event::CreateManualResetEvent(true); + creation_completion_set_event_ = false; + creation_threads_shutdown_from_ = SIZE_MAX; if (cvars::d3d12_pipeline_creation_threads != 0) { - creation_threads_busy_ = 0; - creation_completion_event_ = - xe::threading::Event::CreateManualResetEvent(true); - creation_completion_set_event_ = false; - creation_threads_shutdown_ = false; - uint32_t creation_thread_count; + size_t creation_thread_count; if (cvars::d3d12_pipeline_creation_threads < 0) { - creation_thread_count = std::max( - xe::threading::logical_processor_count() * 3 / 4, uint32_t(1)); + creation_thread_count = + std::max(logical_processor_count * 3 / 4, uint32_t(1)); } else { - creation_thread_count = uint32_t(cvars::d3d12_pipeline_creation_threads); + creation_thread_count = + std::min(uint32_t(cvars::d3d12_pipeline_creation_threads), + logical_processor_count); } - creation_thread_count = std::min(creation_thread_count, uint32_t(16)); - for (uint32_t i = 0; i < creation_thread_count; ++i) { + for (size_t i = 0; i < creation_thread_count; ++i) { std::unique_ptr creation_thread = - xe::threading::Thread::Create({}, [this]() { CreationThread(); }); - creation_thread->set_name("D3D12 Pipelines"); + xe::threading::Thread::Create({}, [this, i]() { CreationThread(i); }); + creation_thread->set_name("D3D12 Pipeline States"); creation_threads_.push_back(std::move(creation_thread)); } } @@ -109,71 +126,571 @@ bool PipelineCache::Initialize() { } void PipelineCache::Shutdown() { - ClearCache(); + ClearCache(true); // Shut down all threads. 
if (!creation_threads_.empty()) { { std::lock_guard lock(creation_request_lock_); - creation_threads_shutdown_ = true; + creation_threads_shutdown_from_ = 0; } creation_request_cond_.notify_all(); for (size_t i = 0; i < creation_threads_.size(); ++i) { xe::threading::Wait(creation_threads_[i].get(), false); } creation_threads_.clear(); - creation_completion_event_.reset(); } + creation_completion_event_.reset(); } -void PipelineCache::ClearCache() { - // Remove references to the current pipeline. - current_pipeline_ = nullptr; +void PipelineCache::ClearCache(bool shutting_down) { + bool reinitialize_shader_storage = + !shutting_down && storage_write_thread_ != nullptr; + std::wstring shader_storage_root; + uint32_t shader_storage_title_id = shader_storage_title_id_; + if (reinitialize_shader_storage) { + shader_storage_root = shader_storage_root_; + } + ShutdownShaderStorage(); + + // Remove references to the current pipeline state object. + current_pipeline_state_ = nullptr; if (!creation_threads_.empty()) { - // Empty the pipeline creation queue. + // Empty the pipeline state object creation queue and make sure there are no + // threads currently creating pipeline state objects because pipeline states + // are going to be deleted. + bool await_creation_completion_event = false; { std::lock_guard lock(creation_request_lock_); creation_queue_.clear(); - creation_completion_set_event_ = true; + await_creation_completion_event = creation_threads_busy_ != 0; + if (await_creation_completion_event) { + creation_completion_event_->Reset(); + creation_completion_set_event_ = true; + } + } + if (await_creation_completion_event) { + creation_request_cond_.notify_one(); + xe::threading::Wait(creation_completion_event_.get(), false); } - creation_request_cond_.notify_one(); } - // Destroy all pipelines. - for (auto it : pipelines_) { + // Destroy all pipeline state objects. 
+ for (auto it : pipeline_states_) { it.second->state->Release(); delete it.second; } - pipelines_.clear(); - COUNT_profile_set("gpu/pipeline_cache/pipelines", 0); + pipeline_states_.clear(); + COUNT_profile_set("gpu/pipeline_cache/pipeline_states", 0); // Destroy all shaders. for (auto it : shader_map_) { delete it.second; } shader_map_.clear(); + + if (reinitialize_shader_storage) { + InitializeShaderStorage(shader_storage_root, shader_storage_title_id, + false); + } +} + +void PipelineCache::InitializeShaderStorage(const std::wstring& storage_root, + uint32_t title_id, bool blocking) { + ShutdownShaderStorage(); + + auto shader_storage_root = xe::join_paths(storage_root, L"shaders"); + // For files that can be moved between different hosts. + // Host PSO blobs - if ever added - should be stored in shaders/local/ (they + // currently aren't used because they may not be very practical - + // would need to invalidate them every commit likely, and additional I/O + // cost - though D3D's internal validation would possibly be enough to ensure + // they are up to date). + auto shader_storage_shareable_root = + xe::join_paths(shader_storage_root, L"shareable"); + if (!xe::filesystem::CreateFolder(shader_storage_shareable_root)) { + return; + } + + size_t logical_processor_count = xe::threading::logical_processor_count(); + if (!logical_processor_count) { + // Pick some reasonable amount if couldn't determine the number of cores. + logical_processor_count = 6; + } + + // Initialize the Xenos shader storage stream. + uint64_t shader_storage_initialization_start = + xe::Clock::QueryHostTickCount(); + shader_storage_file_ = xe::filesystem::OpenFile( + xe::join_paths(shader_storage_shareable_root, + xe::format_string(L"%.8X.xsh", title_id)), + "a+b"); + if (!shader_storage_file_) { + return; + } + shader_storage_file_flush_needed_ = false; + struct { + uint32_t magic; + uint32_t version_swapped; + } shader_storage_file_header; + // 'XESH'. 
+ const uint32_t shader_storage_magic = 0x48534558; + if (fread(&shader_storage_file_header, sizeof(shader_storage_file_header), 1, + shader_storage_file_) && + shader_storage_file_header.magic == shader_storage_magic && + xe::byte_swap(shader_storage_file_header.version_swapped) == + ShaderStoredHeader::kVersion) { + uint64_t shader_storage_valid_bytes = sizeof(shader_storage_file_header); + // Load and translate shaders written by previous Xenia executions until the + // end of the file or until a corrupted one is detected. + ShaderStoredHeader shader_header; + std::vector ucode_dwords; + ucode_dwords.reserve(0xFFFF); + size_t shaders_translated = 0; + + // Threads overlapping file reading. + std::mutex shaders_translation_thread_mutex; + std::condition_variable shaders_translation_thread_cond; + std::deque> + shaders_to_translate; + size_t shader_translation_threads_busy = 0; + bool shader_translation_threads_shutdown = false; + std::mutex shaders_failed_to_translate_mutex; + std::vector shaders_failed_to_translate; + auto shader_translation_thread_function = [&]() { + auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); + DxbcShaderTranslator translator( + provider->GetAdapterVendorID(), edram_rov_used_, + provider->GetGraphicsAnalysis() != nullptr); + for (;;) { + std::pair shader_to_translate; + for (;;) { + std::unique_lock lock(shaders_translation_thread_mutex); + if (shaders_to_translate.empty()) { + if (shader_translation_threads_shutdown) { + return; + } + shaders_translation_thread_cond.wait(lock); + continue; + } + shader_to_translate = shaders_to_translate.front(); + shaders_to_translate.pop_front(); + ++shader_translation_threads_busy; + break; + } + assert_not_null(shader_to_translate.second); + if (!TranslateShader(translator, shader_to_translate.second, + shader_to_translate.first.sq_program_cntl, + shader_to_translate.first.patch_primitive_type)) { + std::unique_lock lock(shaders_failed_to_translate_mutex); + 
shaders_failed_to_translate.push_back(shader_to_translate.second); + } + { + std::unique_lock lock(shaders_translation_thread_mutex); + --shader_translation_threads_busy; + } + } + }; + std::vector> + shader_translation_threads; + + while (true) { + if (!fread(&shader_header, sizeof(shader_header), 1, + shader_storage_file_)) { + break; + } + size_t ucode_byte_count = + shader_header.ucode_dword_count * sizeof(uint32_t); + if (shader_map_.find(shader_header.ucode_data_hash) != + shader_map_.end()) { + // Already added - usually shaders aren't added without the intention of + // translating them imminently, so don't do additional checks to + // actually ensure that translation happens right now (they would cause + // a race condition with shaders currently queued for translation). + if (!xe::filesystem::Seek(shader_storage_file_, + int64_t(ucode_byte_count), SEEK_CUR)) { + break; + } + shader_storage_valid_bytes += sizeof(shader_header) + ucode_byte_count; + continue; + } + ucode_dwords.resize(shader_header.ucode_dword_count); + if (shader_header.ucode_dword_count && + !fread(ucode_dwords.data(), ucode_byte_count, 1, + shader_storage_file_)) { + break; + } + uint64_t ucode_data_hash = + XXH64(ucode_dwords.data(), ucode_byte_count, 0); + if (shader_header.ucode_data_hash != ucode_data_hash) { + // Validation failed. + break; + } + D3D12Shader* shader = + new D3D12Shader(shader_header.type, ucode_data_hash, + ucode_dwords.data(), shader_header.ucode_dword_count); + shader_map_.insert({ucode_data_hash, shader}); + // Create new threads if the currently existing threads can't keep up with + // file reading, but not more than the number of logical processors minus + // one. 
+ size_t shader_translation_threads_needed; + { + std::unique_lock lock(shaders_translation_thread_mutex); + shader_translation_threads_needed = + std::min(shader_translation_threads_busy + + shaders_to_translate.size() + size_t(1), + logical_processor_count - size_t(1)); + } + while (shader_translation_threads.size() < + shader_translation_threads_needed) { + shader_translation_threads.push_back(xe::threading::Thread::Create( + {}, shader_translation_thread_function)); + shader_translation_threads.back()->set_name("Shader Translation"); + } + { + std::unique_lock lock(shaders_translation_thread_mutex); + shaders_to_translate.emplace_back(shader_header, shader); + } + shaders_translation_thread_cond.notify_one(); + shader_storage_valid_bytes += sizeof(shader_header) + ucode_byte_count; + ++shaders_translated; + } + if (!shader_translation_threads.empty()) { + { + std::unique_lock lock(shaders_translation_thread_mutex); + shader_translation_threads_shutdown = true; + } + shaders_translation_thread_cond.notify_all(); + for (auto& shader_translation_thread : shader_translation_threads) { + xe::threading::Wait(shader_translation_thread.get(), false); + } + shader_translation_threads.clear(); + for (D3D12Shader* shader : shaders_failed_to_translate) { + shader_map_.erase(shader->ucode_data_hash()); + delete shader; + } + } + XELOGGPU("Translated %zu shaders from the storage in %" PRIu64 + " milliseconds", + shaders_translated, + (xe::Clock::QueryHostTickCount() - + shader_storage_initialization_start) * + 1000 / xe::Clock::QueryHostTickFrequency()); + xe::filesystem::TruncateStdioFile(shader_storage_file_, + shader_storage_valid_bytes); + } else { + xe::filesystem::TruncateStdioFile(shader_storage_file_, 0); + shader_storage_file_header.magic = shader_storage_magic; + shader_storage_file_header.version_swapped = + xe::byte_swap(ShaderStoredHeader::kVersion); + fwrite(&shader_storage_file_header, sizeof(shader_storage_file_header), 1, + shader_storage_file_); + } + + // 
'DXRO' or 'DXRT'. + const uint32_t pipeline_state_storage_magic_api = + edram_rov_used_ ? 0x4F525844 : 0x54525844; + + // Initialize the pipeline state storage stream. + uint64_t pipeline_state_storage_initialization_start_ = + xe::Clock::QueryHostTickCount(); + pipeline_state_storage_file_ = xe::filesystem::OpenFile( + xe::join_paths(shader_storage_shareable_root, + xe::format_string(L"%.8X.%s.d3d12.xpso", title_id, + edram_rov_used_ ? L"rov" : L"rtv")), + "a+b"); + if (!pipeline_state_storage_file_) { + fclose(shader_storage_file_); + shader_storage_file_ = nullptr; + return; + } + pipeline_state_storage_file_flush_needed_ = false; + // 'XEPS'. + const uint32_t pipeline_state_storage_magic = 0x53504558; + struct { + uint32_t magic; + uint32_t magic_api; + uint32_t version_swapped; + } pipeline_state_storage_file_header; + if (fread(&pipeline_state_storage_file_header, + sizeof(pipeline_state_storage_file_header), 1, + pipeline_state_storage_file_) && + pipeline_state_storage_file_header.magic == + pipeline_state_storage_magic && + pipeline_state_storage_file_header.magic_api == + pipeline_state_storage_magic_api && + xe::byte_swap(pipeline_state_storage_file_header.version_swapped) == + PipelineDescription::kVersion) { + uint64_t pipeline_state_storage_valid_bytes = + sizeof(pipeline_state_storage_file_header); + // Enqueue pipeline state descriptions written by previous Xenia executions + // until the end of the file or until a corrupted one is detected. + xe::filesystem::Seek(pipeline_state_storage_file_, 0, SEEK_END); + int64_t pipeline_state_storage_told_end = + xe::filesystem::Tell(pipeline_state_storage_file_); + size_t pipeline_state_storage_told_count = + size_t(pipeline_state_storage_told_end >= + int64_t(pipeline_state_storage_valid_bytes) + ? 
(uint64_t(pipeline_state_storage_told_end) - + pipeline_state_storage_valid_bytes) / + sizeof(PipelineStoredDescription) + : 0); + if (pipeline_state_storage_told_count && + xe::filesystem::Seek(pipeline_state_storage_file_, + int64_t(pipeline_state_storage_valid_bytes), + SEEK_SET)) { + std::vector pipeline_stored_descriptions; + pipeline_stored_descriptions.resize(pipeline_state_storage_told_count); + pipeline_stored_descriptions.resize(fread( + pipeline_stored_descriptions.data(), + sizeof(PipelineStoredDescription), pipeline_state_storage_told_count, + pipeline_state_storage_file_)); + if (!pipeline_stored_descriptions.empty()) { + // Launch additional creation threads to use all cores to create + // pipeline state objects faster. Will also be using the main thread, so + // minus 1. + size_t creation_thread_original_count = creation_threads_.size(); + size_t creation_thread_needed_count = + std::max(std::min(pipeline_stored_descriptions.size(), + logical_processor_count) - + size_t(1), + creation_thread_original_count); + while (creation_threads_.size() < creation_thread_needed_count) { + size_t creation_thread_index = creation_threads_.size(); + std::unique_ptr creation_thread = + xe::threading::Thread::Create( + {}, [this, creation_thread_index]() { + CreationThread(creation_thread_index); + }); + creation_thread->set_name("D3D12 Pipeline States Additional"); + creation_threads_.push_back(std::move(creation_thread)); + } + size_t pipeline_states_created = 0; + for (const PipelineStoredDescription& pipeline_stored_description : + pipeline_stored_descriptions) { + const PipelineDescription& pipeline_description = + pipeline_stored_description.description; + // Validate file integrity, stop and truncate the stream if data is + // corrupted. 
+ if (XXH64(&pipeline_stored_description.description, + sizeof(pipeline_stored_description.description), + 0) != pipeline_stored_description.description_hash) { + break; + } + pipeline_state_storage_valid_bytes += + sizeof(PipelineStoredDescription); + // Skip already known pipeline states - those have already been + // enqueued. + auto found_range = pipeline_states_.equal_range( + pipeline_stored_description.description_hash); + bool pipeline_state_found = false; + for (auto it = found_range.first; it != found_range.second; ++it) { + PipelineState* found_pipeline_state = it->second; + if (!std::memcmp(&found_pipeline_state->description.description, + &pipeline_description, + sizeof(pipeline_description))) { + pipeline_state_found = true; + break; + } + } + if (pipeline_state_found) { + continue; + } + + PipelineRuntimeDescription pipeline_runtime_description; + auto vertex_shader_it = + shader_map_.find(pipeline_description.vertex_shader_hash); + if (vertex_shader_it == shader_map_.end()) { + continue; + } + pipeline_runtime_description.vertex_shader = vertex_shader_it->second; + if (!pipeline_runtime_description.vertex_shader->is_valid()) { + continue; + } + if (pipeline_description.pixel_shader_hash) { + auto pixel_shader_it = + shader_map_.find(pipeline_description.pixel_shader_hash); + if (pixel_shader_it == shader_map_.end()) { + continue; + } + pipeline_runtime_description.pixel_shader = pixel_shader_it->second; + if (!pipeline_runtime_description.pixel_shader->is_valid()) { + continue; + } + } else { + pipeline_runtime_description.pixel_shader = nullptr; + } + pipeline_runtime_description.root_signature = + command_processor_->GetRootSignature( + pipeline_runtime_description.vertex_shader, + pipeline_runtime_description.pixel_shader, + pipeline_description.patch_type != PipelinePatchType::kNone); + if (!pipeline_runtime_description.root_signature) { + continue; + } + std::memcpy(&pipeline_runtime_description.description, + &pipeline_description, 
sizeof(pipeline_description)); + + PipelineState* new_pipeline_state = new PipelineState; + new_pipeline_state->state = nullptr; + std::memcpy(&new_pipeline_state->description, + &pipeline_runtime_description, + sizeof(pipeline_runtime_description)); + pipeline_states_.insert( + std::make_pair(pipeline_stored_description.description_hash, + new_pipeline_state)); + COUNT_profile_set("gpu/pipeline_cache/pipeline_states", + pipeline_states_.size()); + if (!creation_threads_.empty()) { + // Submit the pipeline for creation to any available thread. + { + std::lock_guard lock(creation_request_lock_); + creation_queue_.push_back(new_pipeline_state); + } + creation_request_cond_.notify_one(); + } else { + new_pipeline_state->state = + CreateD3D12PipelineState(pipeline_runtime_description); + } + ++pipeline_states_created; + } + CreateQueuedPipelineStatesOnProcessorThread(); + if (creation_threads_.size() > creation_thread_original_count) { + { + std::lock_guard lock(creation_request_lock_); + creation_threads_shutdown_from_ = creation_thread_original_count; + // Assuming the queue is empty because of + // CreateQueuedPipelineStatesOnProcessorThread. + } + creation_request_cond_.notify_all(); + while (creation_threads_.size() > creation_thread_original_count) { + xe::threading::Wait(creation_threads_.back().get(), false); + creation_threads_.pop_back(); + } + bool await_creation_completion_event; + { + // Cleanup so additional threads can be created later again. + std::lock_guard lock(creation_request_lock_); + creation_threads_shutdown_from_ = SIZE_MAX; + // If the invocation is blocking, all the shader storage + // initialization is expected to be done before proceeding, to avoid + // latency in the command processor after the invocation. 
+ await_creation_completion_event = + blocking && creation_threads_busy_ != 0; + if (await_creation_completion_event) { + creation_completion_event_->Reset(); + creation_completion_set_event_ = true; + } + } + if (await_creation_completion_event) { + creation_request_cond_.notify_one(); + xe::threading::Wait(creation_completion_event_.get(), false); + } + } + XELOGGPU( + "Created %zu graphics pipeline state objects from the storage in " + "%" PRIu64 " milliseconds", + pipeline_states_created, + (xe::Clock::QueryHostTickCount() - + pipeline_state_storage_initialization_start_) * + 1000 / xe::Clock::QueryHostTickFrequency()); + } + } + xe::filesystem::TruncateStdioFile(pipeline_state_storage_file_, + pipeline_state_storage_valid_bytes); + } else { + xe::filesystem::TruncateStdioFile(pipeline_state_storage_file_, 0); + pipeline_state_storage_file_header.magic = pipeline_state_storage_magic; + pipeline_state_storage_file_header.magic_api = + pipeline_state_storage_magic_api; + pipeline_state_storage_file_header.version_swapped = + xe::byte_swap(PipelineDescription::kVersion); + fwrite(&pipeline_state_storage_file_header, + sizeof(pipeline_state_storage_file_header), 1, + pipeline_state_storage_file_); + } + + shader_storage_root_ = storage_root; + shader_storage_title_id_ = title_id; + + // Start the storage writing thread. 
+ storage_write_flush_shaders_ = false; + storage_write_flush_pipeline_states_ = false; + storage_write_thread_shutdown_ = false; + storage_write_thread_ = + xe::threading::Thread::Create({}, [this]() { StorageWriteThread(); }); +} + +void PipelineCache::ShutdownShaderStorage() { + if (storage_write_thread_) { + { + std::lock_guard lock(storage_write_request_lock_); + storage_write_thread_shutdown_ = true; + } + storage_write_request_cond_.notify_all(); + xe::threading::Wait(storage_write_thread_.get(), false); + storage_write_thread_.reset(); + } + storage_write_shader_queue_.clear(); + storage_write_pipeline_state_queue_.clear(); + + if (pipeline_state_storage_file_) { + fclose(pipeline_state_storage_file_); + pipeline_state_storage_file_ = nullptr; + pipeline_state_storage_file_flush_needed_ = false; + } + + if (shader_storage_file_) { + fclose(shader_storage_file_); + shader_storage_file_ = nullptr; + shader_storage_file_flush_needed_ = false; + } + + shader_storage_root_.clear(); + shader_storage_title_id_ = 0; } void PipelineCache::EndSubmission() { - if (!creation_threads_.empty()) { - // Await creation of all queued pipelines. 
- bool await_event = false; + if (shader_storage_file_flush_needed_ || + pipeline_state_storage_file_flush_needed_) { { - std::lock_guard lock(creation_request_lock_); - if (!creation_queue_.empty() || creation_threads_busy_ != 0) { - creation_completion_event_->Reset(); - creation_completion_set_event_ = true; - await_event = true; + std::unique_lock lock(storage_write_request_lock_); + if (shader_storage_file_flush_needed_) { + storage_write_flush_shaders_ = true; + } + if (pipeline_state_storage_file_flush_needed_) { + storage_write_flush_pipeline_states_ = true; } } - if (await_event) { + storage_write_request_cond_.notify_one(); + shader_storage_file_flush_needed_ = false; + pipeline_state_storage_file_flush_needed_ = false; + } + if (!creation_threads_.empty()) { + CreateQueuedPipelineStatesOnProcessorThread(); + // Await creation of all queued pipeline state objects. + bool await_creation_completion_event; + { + std::lock_guard lock(creation_request_lock_); + // Assuming the creation queue is already empty (because the processor + // thread also worked on creating the leftover pipeline state objects), so + // only check if there are threads with pipeline state objects currently + // being created. 
+ await_creation_completion_event = creation_threads_busy_ != 0; + if (await_creation_completion_event) { + creation_completion_event_->Reset(); + creation_completion_set_event_ = true; + } + } + if (await_creation_completion_event) { + creation_request_cond_.notify_one(); xe::threading::Wait(creation_completion_event_.get(), false); } } } -bool PipelineCache::IsCreatingPipelines() { +bool PipelineCache::IsCreatingPipelineStates() { if (creation_threads_.empty()) { return false; } @@ -225,18 +742,45 @@ bool PipelineCache::EnsureShadersTranslated(D3D12Shader* vertex_shader, xenos::VertexShaderExportMode::kPosition2VectorsEdgeKill); assert_false(sq_program_cntl.gen_index_vtx); - if (!vertex_shader->is_translated() && - !TranslateShader(vertex_shader, sq_program_cntl, tessellated, - primitive_type)) { - XELOGE("Failed to translate the vertex shader!"); - return false; + PrimitiveType patch_primitive_type = + tessellated ? primitive_type : PrimitiveType::kNone; + + if (!vertex_shader->is_translated()) { + if (!TranslateShader(*shader_translator_, vertex_shader, sq_program_cntl, + patch_primitive_type)) { + XELOGE("Failed to translate the vertex shader!"); + return false; + } + if (shader_storage_file_) { + assert_not_null(storage_write_thread_); + shader_storage_file_flush_needed_ = true; + { + std::lock_guard lock(storage_write_request_lock_); + storage_write_shader_queue_.push_back( + std::make_pair(vertex_shader, sq_program_cntl)); + } + storage_write_request_cond_.notify_all(); + } } - if (pixel_shader != nullptr && !pixel_shader->is_translated() && - !TranslateShader(pixel_shader, sq_program_cntl, tessellated, - primitive_type)) { - XELOGE("Failed to translate the pixel shader!"); - return false; + + if (pixel_shader != nullptr && !pixel_shader->is_translated()) { + if (!TranslateShader(*shader_translator_, pixel_shader, sq_program_cntl, + patch_primitive_type)) { + XELOGE("Failed to translate the pixel shader!"); + return false; + } + if (shader_storage_file_) { 
+ assert_not_null(storage_write_thread_); + shader_storage_file_flush_needed_ = true; + { + std::lock_guard lock(storage_write_request_lock_); + storage_write_shader_queue_.push_back( + std::make_pair(pixel_shader, sq_program_cntl)); + } + storage_write_request_cond_.notify_all(); + } } + return true; } @@ -244,39 +788,41 @@ bool PipelineCache::ConfigurePipeline( D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, bool tessellated, PrimitiveType primitive_type, IndexFormat index_format, bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], - void** pipeline_handle_out, ID3D12RootSignature** root_signature_out) { + void** pipeline_state_handle_out, + ID3D12RootSignature** root_signature_out) { #if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // FINE_GRAINED_DRAW_SCOPES - assert_not_null(pipeline_handle_out); + assert_not_null(pipeline_state_handle_out); assert_not_null(root_signature_out); - PipelineDescription description; + PipelineRuntimeDescription runtime_description; if (!GetCurrentStateDescription(vertex_shader, pixel_shader, tessellated, primitive_type, index_format, early_z, - render_targets, description)) { + render_targets, runtime_description)) { return false; } + PipelineDescription& description = runtime_description.description; - if (current_pipeline_ != nullptr && - !std::memcmp(¤t_pipeline_->description, &description, - sizeof(description))) { - *pipeline_handle_out = current_pipeline_; - *root_signature_out = description.root_signature; + if (current_pipeline_state_ != nullptr && + !std::memcmp(¤t_pipeline_state_->description.description, + &description, sizeof(description))) { + *pipeline_state_handle_out = current_pipeline_state_; + *root_signature_out = runtime_description.root_signature; return true; } - // Find an existing pipeline in the cache. + // Find an existing pipeline state object in the cache. 
uint64_t hash = XXH64(&description, sizeof(description), 0); - auto found_range = pipelines_.equal_range(hash); - for (auto iter = found_range.first; iter != found_range.second; ++iter) { - Pipeline* found_pipeline = iter->second; - if (!std::memcmp(&found_pipeline->description, &description, - sizeof(description))) { - current_pipeline_ = found_pipeline; - *pipeline_handle_out = found_pipeline; - *root_signature_out = found_pipeline->description.root_signature; + auto found_range = pipeline_states_.equal_range(hash); + for (auto it = found_range.first; it != found_range.second; ++it) { + PipelineState* found_pipeline_state = it->second; + if (!std::memcmp(&found_pipeline_state->description.description, + &description, sizeof(description))) { + current_pipeline_state_ = found_pipeline_state; + *pipeline_state_handle_out = found_pipeline_state; + *root_signature_out = found_pipeline_state->description.root_signature; return true; } } @@ -286,36 +832,53 @@ bool PipelineCache::ConfigurePipeline( return false; } - Pipeline* new_pipeline = new Pipeline; - new_pipeline->state = nullptr; - std::memcpy(&new_pipeline->description, &description, sizeof(description)); - pipelines_.insert(std::make_pair(hash, new_pipeline)); - COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size()); + PipelineState* new_pipeline_state = new PipelineState; + new_pipeline_state->state = nullptr; + std::memcpy(&new_pipeline_state->description, &runtime_description, + sizeof(runtime_description)); + pipeline_states_.insert(std::make_pair(hash, new_pipeline_state)); + COUNT_profile_set("gpu/pipeline_cache/pipeline_states", + pipeline_states_.size()); if (!creation_threads_.empty()) { - // Submit the pipeline for creation to any available thread. + // Submit the pipeline state object for creation to any available thread. 
{ std::lock_guard lock(creation_request_lock_); - creation_queue_.push_back(new_pipeline); + creation_queue_.push_back(new_pipeline_state); } creation_request_cond_.notify_one(); } else { - new_pipeline->state = CreatePipelineState(description); + new_pipeline_state->state = CreateD3D12PipelineState(runtime_description); } - current_pipeline_ = new_pipeline; - *pipeline_handle_out = new_pipeline; - *root_signature_out = description.root_signature; + if (pipeline_state_storage_file_) { + assert_not_null(storage_write_thread_); + pipeline_state_storage_file_flush_needed_ = true; + { + std::lock_guard lock(storage_write_request_lock_); + storage_write_pipeline_state_queue_.emplace_back(); + PipelineStoredDescription& stored_description = + storage_write_pipeline_state_queue_.back(); + stored_description.description_hash = hash; + std::memcpy(&stored_description.description, &description, + sizeof(description)); + } + storage_write_request_cond_.notify_all(); + } + + current_pipeline_state_ = new_pipeline_state; + *pipeline_state_handle_out = new_pipeline_state; + *root_signature_out = runtime_description.root_signature; return true; } -bool PipelineCache::TranslateShader(D3D12Shader* shader, - reg::SQ_PROGRAM_CNTL cntl, bool tessellated, - PrimitiveType primitive_type) { +bool PipelineCache::TranslateShader(DxbcShaderTranslator& translator, + D3D12Shader* shader, + reg::SQ_PROGRAM_CNTL cntl, + PrimitiveType patch_primitive_type) { // Perform translation. // If this fails the shader will be marked as invalid and ignored later. - if (!shader_translator_->Translate( - shader, tessellated ? 
primitive_type : PrimitiveType::kNone, cntl)) { + if (!translator.Translate(shader, patch_primitive_type, cntl)) { XELOGE("Shader %.16" PRIX64 " translation failed; marking as ignored", shader->ucode_data_hash()); return false; @@ -323,10 +886,10 @@ bool PipelineCache::TranslateShader(D3D12Shader* shader, uint32_t texture_srv_count; const DxbcShaderTranslator::TextureSRV* texture_srvs = - shader_translator_->GetTextureSRVs(texture_srv_count); + translator.GetTextureSRVs(texture_srv_count); uint32_t sampler_binding_count; const DxbcShaderTranslator::SamplerBinding* sampler_bindings = - shader_translator_->GetSamplerBindings(sampler_binding_count); + translator.GetSamplerBindings(sampler_binding_count); shader->SetTexturesAndSamplers(texture_srvs, texture_srv_count, sampler_bindings, sampler_binding_count); @@ -358,7 +921,10 @@ bool PipelineCache::TranslateShader(D3D12Shader* shader, // Dump shader files if desired. if (!cvars::dump_shaders.empty()) { - shader->Dump(cvars::dump_shaders, "d3d12"); + shader->Dump(cvars::dump_shaders, + (shader->type() == ShaderType::kPixel) + ? (edram_rov_used_ ? "d3d12_rov" : "d3d12_rtv") + : "d3d12"); } return shader->is_valid(); @@ -368,24 +934,30 @@ bool PipelineCache::GetCurrentStateDescription( D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, bool tessellated, PrimitiveType primitive_type, IndexFormat index_format, bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], - PipelineDescription& description_out) { + PipelineRuntimeDescription& runtime_description_out) { + PipelineDescription& description_out = runtime_description_out.description; + auto& regs = *register_file_; auto pa_su_sc_mode_cntl = regs.Get(); bool primitive_two_faced = IsPrimitiveTwoFaced(tessellated, primitive_type); // Initialize all unused fields to zero for comparison/hashing. 
- std::memset(&description_out, 0, sizeof(description_out)); + std::memset(&runtime_description_out, 0, sizeof(runtime_description_out)); // Root signature. - description_out.root_signature = command_processor_->GetRootSignature( + runtime_description_out.root_signature = command_processor_->GetRootSignature( vertex_shader, pixel_shader, tessellated); - if (description_out.root_signature == nullptr) { + if (runtime_description_out.root_signature == nullptr) { return false; } // Shaders. - description_out.vertex_shader = vertex_shader; - description_out.pixel_shader = pixel_shader; + runtime_description_out.vertex_shader = vertex_shader; + description_out.vertex_shader_hash = vertex_shader->ucode_data_hash(); + if (pixel_shader) { + runtime_description_out.pixel_shader = pixel_shader; + description_out.pixel_shader_hash = pixel_shader->ucode_data_hash(); + } // Index buffer strip cut value. if (pa_su_sc_mode_cntl.multi_prim_ib_ena) { @@ -672,7 +1244,8 @@ bool PipelineCache::GetCurrentStateDescription( /* 16 */ PipelineBlendFactor::kSrcAlphaSat, }; // Like kBlendFactorMap, but with color modes changed to alpha. Some - // pipelines aren't created in Prey because a color mode is used for alpha. + // pipeline state objects aren't created in Prey because a color mode is + // used for alpha. 
static const PipelineBlendFactor kBlendFactorAlphaMap[32] = { /* 0 */ PipelineBlendFactor::kZero, /* 1 */ PipelineBlendFactor::kOne, @@ -732,23 +1305,25 @@ bool PipelineCache::GetCurrentStateDescription( return true; } -ID3D12PipelineState* PipelineCache::CreatePipelineState( - const PipelineDescription& description) { - if (description.pixel_shader != nullptr) { +ID3D12PipelineState* PipelineCache::CreateD3D12PipelineState( + const PipelineRuntimeDescription& runtime_description) { + const PipelineDescription& description = runtime_description.description; + + if (runtime_description.pixel_shader != nullptr) { XELOGGPU("Creating graphics pipeline state with VS %.16" PRIX64 ", PS %.16" PRIX64, - description.vertex_shader->ucode_data_hash(), - description.pixel_shader->ucode_data_hash()); + runtime_description.vertex_shader->ucode_data_hash(), + runtime_description.pixel_shader->ucode_data_hash()); } else { XELOGGPU("Creating graphics pipeline state with VS %.16" PRIX64, - description.vertex_shader->ucode_data_hash()); + runtime_description.vertex_shader->ucode_data_hash()); } D3D12_GRAPHICS_PIPELINE_STATE_DESC state_desc; std::memset(&state_desc, 0, sizeof(state_desc)); // Root signature. - state_desc.pRootSignature = description.root_signature; + state_desc.pRootSignature = runtime_description.root_signature; // Index buffer strip cut value. switch (description.strip_cut_index) { @@ -765,22 +1340,22 @@ ID3D12PipelineState* PipelineCache::CreatePipelineState( } // Vertex or hull/domain shaders. 
- if (!description.vertex_shader->is_translated()) { + if (!runtime_description.vertex_shader->is_translated()) { XELOGE("Vertex shader %.16" PRIX64 " not translated", - description.vertex_shader->ucode_data_hash()); + runtime_description.vertex_shader->ucode_data_hash()); assert_always(); return nullptr; } if (description.tessellation_mode != PipelineTessellationMode::kNone) { switch (description.patch_type) { case PipelinePatchType::kTriangle: - if (description.vertex_shader->patch_primitive_type() != + if (runtime_description.vertex_shader->patch_primitive_type() != PrimitiveType::kTrianglePatch) { XELOGE( "Tried to use vertex shader %.16" PRIX64 " for triangle patch tessellation, but it's not a tessellation " "domain shader or has the wrong domain", - description.vertex_shader->ucode_data_hash()); + runtime_description.vertex_shader->ucode_data_hash()); assert_always(); return nullptr; } @@ -800,12 +1375,12 @@ ID3D12PipelineState* PipelineCache::CreatePipelineState( state_desc.VS.BytecodeLength = sizeof(tessellation_triangle_vs); break; case PipelinePatchType::kQuad: - if (description.vertex_shader->patch_primitive_type() != + if (runtime_description.vertex_shader->patch_primitive_type() != PrimitiveType::kQuadPatch) { XELOGE("Tried to use vertex shader %.16" PRIX64 " for quad patch tessellation, but it's not a tessellation " "domain shader or has the wrong domain", - description.vertex_shader->ucode_data_hash()); + runtime_description.vertex_shader->ucode_data_hash()); assert_always(); return nullptr; } @@ -827,22 +1402,22 @@ ID3D12PipelineState* PipelineCache::CreatePipelineState( } // The Xenos vertex shader works like a domain shader with tessellation. 
state_desc.DS.pShaderBytecode = - description.vertex_shader->translated_binary().data(); + runtime_description.vertex_shader->translated_binary().data(); state_desc.DS.BytecodeLength = - description.vertex_shader->translated_binary().size(); + runtime_description.vertex_shader->translated_binary().size(); } else { - if (description.vertex_shader->patch_primitive_type() != + if (runtime_description.vertex_shader->patch_primitive_type() != PrimitiveType::kNone) { XELOGE("Tried to use vertex shader %.16" PRIX64 " without tessellation, but it's a tessellation domain shader", - description.vertex_shader->ucode_data_hash()); + runtime_description.vertex_shader->ucode_data_hash()); assert_always(); return nullptr; } state_desc.VS.pShaderBytecode = - description.vertex_shader->translated_binary().data(); + runtime_description.vertex_shader->translated_binary().data(); state_desc.VS.BytecodeLength = - description.vertex_shader->translated_binary().size(); + runtime_description.vertex_shader->translated_binary().size(); } // Pre-GS primitive topology type. @@ -883,23 +1458,23 @@ ID3D12PipelineState* PipelineCache::CreatePipelineState( } // Pixel shader. 
- if (description.pixel_shader != nullptr) { - if (!description.pixel_shader->is_translated()) { + if (runtime_description.pixel_shader != nullptr) { + if (!runtime_description.pixel_shader->is_translated()) { XELOGE("Pixel shader %.16" PRIX64 " not translated", - description.pixel_shader->ucode_data_hash()); + runtime_description.pixel_shader->ucode_data_hash()); assert_always(); return nullptr; } const auto& forced_early_z_shader = - description.pixel_shader->GetForcedEarlyZShaderObject(); + runtime_description.pixel_shader->GetForcedEarlyZShaderObject(); if (description.force_early_z && forced_early_z_shader.size() != 0) { state_desc.PS.pShaderBytecode = forced_early_z_shader.data(); state_desc.PS.BytecodeLength = forced_early_z_shader.size(); } else { state_desc.PS.pShaderBytecode = - description.pixel_shader->translated_binary().data(); + runtime_description.pixel_shader->translated_binary().data(); state_desc.PS.BytecodeLength = - description.pixel_shader->translated_binary().size(); + runtime_description.pixel_shader->translated_binary().size(); } } else if (edram_rov_used_) { state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data(); @@ -1048,72 +1623,159 @@ ID3D12PipelineState* PipelineCache::CreatePipelineState( } } - // Create the pipeline. + // Create the pipeline state object. 
auto device = command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice(); ID3D12PipelineState* state; if (FAILED(device->CreateGraphicsPipelineState(&state_desc, IID_PPV_ARGS(&state)))) { - if (description.pixel_shader != nullptr) { + if (runtime_description.pixel_shader != nullptr) { XELOGE("Failed to create graphics pipeline state with VS %.16" PRIX64 ", PS %.16" PRIX64, - description.vertex_shader->ucode_data_hash(), - description.pixel_shader->ucode_data_hash()); + runtime_description.vertex_shader->ucode_data_hash(), + runtime_description.pixel_shader->ucode_data_hash()); } else { XELOGE("Failed to create graphics pipeline state with VS %.16" PRIX64, - description.vertex_shader->ucode_data_hash()); + runtime_description.vertex_shader->ucode_data_hash()); } return nullptr; } std::wstring name; - if (description.pixel_shader != nullptr) { - name = xe::format_string(L"VS %.16I64X, PS %.16I64X", - description.vertex_shader->ucode_data_hash(), - description.pixel_shader->ucode_data_hash()); + if (runtime_description.pixel_shader != nullptr) { + name = + xe::format_string(L"VS %.16I64X, PS %.16I64X", + runtime_description.vertex_shader->ucode_data_hash(), + runtime_description.pixel_shader->ucode_data_hash()); } else { - name = xe::format_string(L"VS %.16I64X", - description.vertex_shader->ucode_data_hash()); + name = xe::format_string( + L"VS %.16I64X", runtime_description.vertex_shader->ucode_data_hash()); } state->SetName(name.c_str()); return state; } -void PipelineCache::CreationThread() { +void PipelineCache::StorageWriteThread() { + ShaderStoredHeader shader_header; + // Don't leak anything in unused bits. 
+ std::memset(&shader_header, 0, sizeof(shader_header)); + + std::vector ucode_guest_endian; + ucode_guest_endian.reserve(0xFFFF); + + bool flush_shaders = false; + bool flush_pipeline_states = false; + while (true) { - Pipeline* pipeline_to_create = nullptr; + if (flush_shaders) { + flush_shaders = false; + assert_not_null(shader_storage_file_); + fflush(shader_storage_file_); + } + if (flush_pipeline_states) { + flush_pipeline_states = false; + assert_not_null(pipeline_state_storage_file_); + fflush(pipeline_state_storage_file_); + } + + std::pair shader_pair = {}; + PipelineStoredDescription pipeline_description; + bool write_pipeline_state = false; + { + std::unique_lock lock(storage_write_request_lock_); + if (storage_write_thread_shutdown_) { + return; + } + if (!storage_write_shader_queue_.empty()) { + shader_pair = storage_write_shader_queue_.front(); + storage_write_shader_queue_.pop_front(); + } else if (storage_write_flush_shaders_) { + storage_write_flush_shaders_ = false; + flush_shaders = true; + } + if (!storage_write_pipeline_state_queue_.empty()) { + std::memcpy(&pipeline_description, + &storage_write_pipeline_state_queue_.front(), + sizeof(pipeline_description)); + storage_write_pipeline_state_queue_.pop_front(); + write_pipeline_state = true; + } else if (storage_write_flush_pipeline_states_) { + storage_write_flush_pipeline_states_ = false; + flush_pipeline_states = true; + } + if (!shader_pair.first && !write_pipeline_state) { + storage_write_request_cond_.wait(lock); + continue; + } + } + + const Shader* shader = shader_pair.first; + if (shader) { + shader_header.ucode_data_hash = shader->ucode_data_hash(); + shader_header.ucode_dword_count = shader->ucode_dword_count(); + shader_header.type = shader->type(); + shader_header.patch_primitive_type = shader->patch_primitive_type(); + shader_header.sq_program_cntl = shader_pair.second; + assert_not_null(shader_storage_file_); + fwrite(&shader_header, sizeof(shader_header), 1, 
shader_storage_file_); + if (shader_header.ucode_dword_count) { + ucode_guest_endian.resize(shader_header.ucode_dword_count); + // Need to swap because the hash is calculated for the shader with guest + // endianness. + xe::copy_and_swap(ucode_guest_endian.data(), shader->ucode_dwords(), + shader_header.ucode_dword_count); + fwrite(ucode_guest_endian.data(), + shader_header.ucode_dword_count * sizeof(uint32_t), 1, + shader_storage_file_); + } + } + + if (write_pipeline_state) { + assert_not_null(pipeline_state_storage_file_); + fwrite(&pipeline_description, sizeof(pipeline_description), 1, + pipeline_state_storage_file_); + } + } +} + +void PipelineCache::CreationThread(size_t thread_index) { + while (true) { + PipelineState* pipeline_state_to_create = nullptr; // Check if need to shut down or set the completion event and dequeue the - // pipeline if there is any. + // pipeline state if there is any. { std::unique_lock lock(creation_request_lock_); - if (creation_threads_shutdown_ || creation_queue_.empty()) { + if (thread_index >= creation_threads_shutdown_from_ || + creation_queue_.empty()) { if (creation_completion_set_event_ && creation_threads_busy_ == 0) { - // Last pipeline in the queue created - signal the event if requested. + // Last pipeline state object in the queue created - signal the event + // if requested. creation_completion_set_event_ = false; creation_completion_event_->Set(); } - if (creation_threads_shutdown_) { + if (thread_index >= creation_threads_shutdown_from_) { return; } creation_request_cond_.wait(lock); continue; } - // Take the pipeline from the queue and increment the busy thread count - // until the pipeline in created - other threads must be able to dequeue - // requests, but can't set the completion event until the pipelines are - // fully created (rather than just started creating). 
- pipeline_to_create = creation_queue_.front(); + // Take the pipeline state from the queue and increment the busy thread + // count until the pipeline state object is created - other threads must + // be able to dequeue requests, but can't set the completion event until + // the pipeline state objects are fully created (rather than just started + // creating). + pipeline_state_to_create = creation_queue_.front(); creation_queue_.pop_front(); ++creation_threads_busy_; } - // Create the pipeline. - pipeline_to_create->state = - CreatePipelineState(pipeline_to_create->description); + // Create the D3D12 pipeline state object. + pipeline_state_to_create->state = + CreateD3D12PipelineState(pipeline_state_to_create->description); - // Pipeline created - the thread is not busy anymore, safe to set the - // completion event if needed (at the next iteration, or in some other - // thread). + // Pipeline state object created - the thread is not busy anymore, safe to + // set the completion event if needed (at the next iteration, or in some + // other thread). 
{ std::unique_lock lock(creation_request_lock_); --creation_threads_busy_; @@ -1121,6 +1783,23 @@ void PipelineCache::CreationThread() { } } +void PipelineCache::CreateQueuedPipelineStatesOnProcessorThread() { + assert_false(creation_threads_.empty()); + while (true) { + PipelineState* pipeline_state_to_create; + { + std::unique_lock lock(creation_request_lock_); + if (creation_queue_.empty()) { + break; + } + pipeline_state_to_create = creation_queue_.front(); + creation_queue_.pop_front(); + } + pipeline_state_to_create->state = + CreateD3D12PipelineState(pipeline_state_to_create->description); + } +} + } // namespace d3d12 } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index fe1ff1bc1..7166185f7 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -11,13 +11,17 @@ #define XENIA_GPU_D3D12_PIPELINE_CACHE_H_ #include +#include #include #include #include +#include #include #include +#include #include +#include "xenia/base/platform.h" #include "xenia/base/threading.h" #include "xenia/gpu/d3d12/d3d12_shader.h" #include "xenia/gpu/d3d12/render_target_cache.h" @@ -40,10 +44,14 @@ class PipelineCache { bool Initialize(); void Shutdown(); - void ClearCache(); + void ClearCache(bool shutting_down = false); + + void InitializeShaderStorage(const std::wstring& storage_root, + uint32_t title_id, bool blocking); + void ShutdownShaderStorage(); void EndSubmission(); - bool IsCreatingPipelines(); + bool IsCreatingPipelineStates(); D3D12Shader* LoadShader(ShaderType shader_type, uint32_t guest_address, const uint32_t* host_address, uint32_t dword_count); @@ -57,15 +65,32 @@ class PipelineCache { D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, bool tessellated, PrimitiveType primitive_type, IndexFormat index_format, bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], - void** pipeline_handle_out, ID3D12RootSignature** 
root_signature_out); + void** pipeline_state_handle_out, + ID3D12RootSignature** root_signature_out); - // Returns a pipeline with deferred creation by its handle. May return nullptr - // if failed to create the pipeline. - inline ID3D12PipelineState* GetPipelineStateByHandle(void* handle) const { - return reinterpret_cast(handle)->state; + // Returns a pipeline state object with deferred creation by its handle. May + // return nullptr if failed to create the pipeline state object. + inline ID3D12PipelineState* GetD3D12PipelineStateByHandle( + void* handle) const { + return reinterpret_cast(handle)->state; } private: + XEPACKEDSTRUCT(ShaderStoredHeader, { + uint64_t ucode_data_hash; + + uint32_t ucode_dword_count : 16; + ShaderType type : 1; + PrimitiveType patch_primitive_type : 6; + + reg::SQ_PROGRAM_CNTL sq_program_cntl; + + static constexpr uint32_t kVersion = 0x20200301; + }); + + // Update PipelineDescription::kVersion if any of the Pipeline* enums are + // changed! + enum class PipelineStripCutIndex : uint32_t { kNone, kFFFF, @@ -122,7 +147,8 @@ class PipelineCache { kSrcAlphaSat, }; - struct PipelineRenderTarget { + // Update PipelineDescription::kVersion if anything is changed! + XEPACKEDSTRUCT(PipelineRenderTarget, { uint32_t used : 1; // 1 ColorRenderTargetFormat format : 4; // 5 PipelineBlendFactor src_blend : 4; // 9 @@ -132,12 +158,12 @@ class PipelineCache { PipelineBlendFactor dest_blend_alpha : 4; // 24 BlendOp blend_op_alpha : 3; // 27 uint32_t write_mask : 4; // 31 - }; + }); - struct PipelineDescription { - ID3D12RootSignature* root_signature; - D3D12Shader* vertex_shader; - D3D12Shader* pixel_shader; + XEPACKEDSTRUCT(PipelineDescription, { + uint64_t vertex_shader_hash; + // 0 if drawing without a pixel shader. 
+ uint64_t pixel_shader_hash; int32_t depth_bias; float depth_bias_slope_scaled; @@ -170,19 +196,34 @@ class PipelineCache { CompareFunction stencil_back_func : 3; // 32 PipelineRenderTarget render_targets[4]; + + static constexpr uint32_t kVersion = 0x20200309; + }); + + XEPACKEDSTRUCT(PipelineStoredDescription, { + uint64_t description_hash; + PipelineDescription description; + }); + + struct PipelineRuntimeDescription { + ID3D12RootSignature* root_signature; + D3D12Shader* vertex_shader; + D3D12Shader* pixel_shader; + PipelineDescription description; }; - bool TranslateShader(D3D12Shader* shader, reg::SQ_PROGRAM_CNTL cntl, - bool tessellated, PrimitiveType primitive_type); + bool TranslateShader(DxbcShaderTranslator& translator, D3D12Shader* shader, + reg::SQ_PROGRAM_CNTL cntl, + PrimitiveType patch_primitive_type); bool GetCurrentStateDescription( D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, bool tessellated, PrimitiveType primitive_type, IndexFormat index_format, bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], - PipelineDescription& description_out); + PipelineRuntimeDescription& runtime_description_out); - ID3D12PipelineState* CreatePipelineState( - const PipelineDescription& description); + ID3D12PipelineState* CreateD3D12PipelineState( + const PipelineRuntimeDescription& runtime_description); D3D12CommandProcessor* command_processor_; RegisterFile* register_file_; @@ -200,40 +241,71 @@ class PipelineCache { // Xenos pixel shader provided. std::vector depth_only_pixel_shader_; - struct Pipeline { + struct PipelineState { // nullptr if creation has failed. ID3D12PipelineState* state; - PipelineDescription description; + PipelineRuntimeDescription description; }; - // All previously generated pipelines identified by hash and the description. - std::unordered_multimap pipelines_; + // All previously generated pipeline state objects identified by hash and the + // description. 
+ std::unordered_multimap pipeline_states_; - // Previously used pipeline. This matches our current state settings - // and allows us to quickly(ish) reuse the pipeline if no registers have - // changed. - Pipeline* current_pipeline_ = nullptr; + // Previously used pipeline state object. This matches our current state + // settings and allows us to quickly(ish) reuse the pipeline state if no + // registers have changed. + PipelineState* current_pipeline_state_ = nullptr; - // Pipeline creation threads. - void CreationThread(); + // Currently open shader storage path. + std::wstring shader_storage_root_; + uint32_t shader_storage_title_id_ = 0; + + // Shader storage output stream, for preload in the next emulator runs. + FILE* shader_storage_file_ = nullptr; + bool shader_storage_file_flush_needed_ = false; + + // Pipeline state storage output stream, for preload in the next emulator + // runs. + FILE* pipeline_state_storage_file_ = nullptr; + bool pipeline_state_storage_file_flush_needed_ = false; + + // Thread for asynchronous writing to the storage streams. + void StorageWriteThread(); + std::mutex storage_write_request_lock_; + std::condition_variable storage_write_request_cond_; + // Storage thread input is protected with storage_write_request_lock_, and the + // thread is notified about its change via storage_write_request_cond_. + std::deque> + storage_write_shader_queue_; + std::deque storage_write_pipeline_state_queue_; + bool storage_write_flush_shaders_ = false; + bool storage_write_flush_pipeline_states_ = false; + bool storage_write_thread_shutdown_ = false; + std::unique_ptr storage_write_thread_; + + // Pipeline state object creation threads. + void CreationThread(size_t thread_index); + void CreateQueuedPipelineStatesOnProcessorThread(); std::mutex creation_request_lock_; std::condition_variable creation_request_cond_; // Protected with creation_request_lock_, notify_one creation_request_cond_ // when set. 
- std::deque creation_queue_; - // Number of threads that are currently creating a pipeline - incremented when - // a pipeline is dequeued (the completion event can't be triggered before this - // is zero). Protected with creation_request_lock_. - uint32_t creation_threads_busy_ = 0; - // Manual-reset event set when the last queued pipeline is created and there - // are no more pipelines to create. This is triggered by the thread creating - // the last pipeline. + std::deque creation_queue_; + // Number of threads that are currently creating a pipeline state object - + // incremented when a pipeline state object is dequeued (the completion event + // can't be triggered before this is zero). Protected with + // creation_request_lock_. + size_t creation_threads_busy_ = 0; + // Manual-reset event set when the last queued pipeline state object is + // created and there are no more pipeline state objects to create. This is + // triggered by the thread creating the last pipeline state object. std::unique_ptr creation_completion_event_ = nullptr; // Whether setting the event on completion is queued. Protected with // creation_request_lock_, notify_one creation_request_cond_ when set. bool creation_completion_set_event_ = false; - // Whether to shut down the creation threads as soon as possible. Protected - // with creation_request_lock_, notify_all creation_request_cond_ when set. - bool creation_threads_shutdown_ = false; + // Creation threads with this index or above need to be shut down as soon as + // possible. Protected with creation_request_lock_, notify_all + // creation_request_cond_ when set. 
+ size_t creation_threads_shutdown_from_ = SIZE_MAX; std::vector> creation_threads_; }; diff --git a/src/xenia/gpu/gpu_flags.cc b/src/xenia/gpu/gpu_flags.cc index 90c7b4598..41490a7bd 100644 --- a/src/xenia/gpu/gpu_flags.cc +++ b/src/xenia/gpu/gpu_flags.cc @@ -13,8 +13,10 @@ DEFINE_string(trace_gpu_prefix, "scratch/gpu/", "Prefix path for GPU trace files.", "GPU"); DEFINE_bool(trace_gpu_stream, false, "Trace all GPU packets.", "GPU"); -DEFINE_string(dump_shaders, "", - "Path to write GPU shaders to as they are compiled.", "GPU"); +DEFINE_string( + dump_shaders, "", + "For shader debugging, path to dump GPU shaders to as they are compiled.", + "GPU"); DEFINE_bool(vsync, true, "Enable VSYNC.", "GPU"); diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index 0e1c008d8..cbfc2cc9b 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -20,6 +20,12 @@ #include "xenia/ui/graphics_provider.h" #include "xenia/ui/loop.h" +DEFINE_bool( + store_shaders, true, + "Store shaders persistently and load them when loading games to avoid " + "runtime spikes and freezes when playing the game not for the first time.", + "GPU"); + namespace xe { namespace gpu { @@ -269,6 +275,34 @@ void GraphicsSystem::ClearCaches() { [&]() { command_processor_->ClearCaches(); }); } +void GraphicsSystem::InitializeShaderStorage(const std::wstring& storage_root, + uint32_t title_id, bool blocking) { + if (!cvars::store_shaders) { + return; + } + if (blocking) { + if (command_processor_->is_paused()) { + // Safe to run on any thread while the command processor is paused, no + // race condition. 
+ command_processor_->InitializeShaderStorage(storage_root, title_id, true); + } else { + xe::threading::Fence fence; + command_processor_->CallInThread( + [this, storage_root, title_id, &fence]() { + command_processor_->InitializeShaderStorage(storage_root, title_id, + true); + fence.Signal(); + }); + fence.Wait(); + } + } else { + command_processor_->CallInThread([this, storage_root, title_id]() { + command_processor_->InitializeShaderStorage(storage_root, title_id, + false); + }); + } +} + void GraphicsSystem::RequestFrameTrace() { command_processor_->RequestFrameTrace( xe::to_wstring(cvars::trace_gpu_prefix)); diff --git a/src/xenia/gpu/graphics_system.h b/src/xenia/gpu/graphics_system.h index 459dd7d3f..42bfe87d4 100644 --- a/src/xenia/gpu/graphics_system.h +++ b/src/xenia/gpu/graphics_system.h @@ -12,6 +12,7 @@ #include #include +#include #include #include "xenia/cpu/processor.h" @@ -62,6 +63,9 @@ class GraphicsSystem { virtual void ClearCaches(); + void InitializeShaderStorage(const std::wstring& storage_root, + uint32_t title_id, bool blocking); + void RequestFrameTrace(); void BeginTracing(); void EndTracing(); diff --git a/src/xenia/gpu/trace_dump.cc b/src/xenia/gpu/trace_dump.cc index b7b27df1a..0915613b1 100644 --- a/src/xenia/gpu/trace_dump.cc +++ b/src/xenia/gpu/trace_dump.cc @@ -102,7 +102,7 @@ int TraceDump::Main(const std::vector& args) { bool TraceDump::Setup() { // Create the emulator but don't initialize so we can setup the window. 
- emulator_ = std::make_unique(L"", L""); + emulator_ = std::make_unique(L"", L"", L""); X_STATUS result = emulator_->Setup( nullptr, nullptr, [this]() { return CreateGraphicsSystem(); }, nullptr); if (XFAILED(result)) { diff --git a/src/xenia/gpu/trace_viewer.cc b/src/xenia/gpu/trace_viewer.cc index d11faaecc..db3fc2da8 100644 --- a/src/xenia/gpu/trace_viewer.cc +++ b/src/xenia/gpu/trace_viewer.cc @@ -122,7 +122,7 @@ bool TraceViewer::Setup() { window_->Resize(1920, 1200); // Create the emulator but don't initialize so we can setup the window. - emulator_ = std::make_unique(L"", L""); + emulator_ = std::make_unique(L"", L"", L""); X_STATUS result = emulator_->Setup( window_.get(), nullptr, [this]() { return CreateGraphicsSystem(); }, nullptr); From 4f8cdd99675029bd8181af0c905fc7fa26b74824 Mon Sep 17 00:00:00 2001 From: Gliniak Date: Sun, 22 Mar 2020 21:19:21 +0100 Subject: [PATCH 2/5] [Kernel/XAM] XNotifyGetNext: Check for nullptr for param_ptr --- src/xenia/kernel/xam/xam_notify.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/xenia/kernel/xam/xam_notify.cc b/src/xenia/kernel/xam/xam_notify.cc index d65b0906a..9fb4eda25 100644 --- a/src/xenia/kernel/xam/xam_notify.cc +++ b/src/xenia/kernel/xam/xam_notify.cc @@ -67,10 +67,14 @@ dword_result_t XNotifyGetNext(dword_t handle, dword_t match_id, if (dequeued) { *id_ptr = id; - *param_ptr = param; + if (param_ptr) { + *param_ptr = param; + } } else { *id_ptr = 0; - *param_ptr = 0; + if (param_ptr) { + *param_ptr = 0; + } } return dequeued ? 
1 : 0; From a4ffcd51756b5480bd2c9bf46cab0ed1cb2770a8 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Mon, 23 Mar 2020 00:13:54 +0300 Subject: [PATCH 3/5] [D3D12] Update DXBC contribution notes --- src/xenia/gpu/dxbc_shader_translator.cc | 12 +++--- src/xenia/gpu/dxbc_shader_translator.h | 56 ++++++++++++++++++++----- 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 2c0aa1956..b465d2cc3 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -40,17 +40,15 @@ using namespace ucode; // Notes about operands: // // Reading and writing: -// - Writes to 4-component registers must be masked. -// - Reads from 4-component registers can be swizzled, or 1 component can be -// selected. // - r# (temporary registers) are 4-component and can be used anywhere. // - v# (inputs) are 4-component and read-only. // - o# (outputs) are 4-component and write-only. // - oDepth (pixel shader depth output) is 1-component and write-only. -// - x# (indexable temporary registers) are 4-component (though not sure what -// happens if you dcl them as 1-component) and can be accessed either via -// a mov load or a mov store (and those movs are counted as ArrayInstructions -// in STAT, not as MovInstructions). +// - x# (indexable temporary registers) are 4-component and can be accessed +// either via a mov load or a mov store (and those movs are counted as +// ArrayInstructions in STAT, not as MovInstructions), even though the D3D11.3 +// functional specification says x# can be used wherever r# can be used, but +// FXC emits only mov load/store in simple tests. 
// // Indexing: // - Constant buffers use 3D indices in CBx[y][z] format, where x is the ID of diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index e7b02f674..59d24351a 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -30,17 +30,53 @@ namespace gpu { // // IMPORTANT CONTRIBUTION NOTES: // -// Not all DXBC instructions accept all kinds of operands equally! -// Refer to Shader Model 4 and 5 Assembly on MSDN to see if the needed -// swizzle/selection, absolute/negate modifiers and saturation are supported by -// the instruction. +// While DXBC may look like a flexible and high-level representation with highly +// generalized building blocks, actually it has a lot of restrictions on operand +// usage! +// Check the Direct3D 11.3 Functional Specification before adding anything! +// https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm +// (the "7. Common Shader Internals" chapter and the documentation of the +// specific instruction you want to use). +// For instructions, MSDN also provides some information, but it's not as +// detailed as the functional specification: // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/dx9-graphics-reference-asm -// Before adding anything that behaves in a way that doesn't follow patterns -// already used in Xenia, try to write the same logic in HLSL, compile it with -// FXC and see the resulting assembly *and preferably binary bytecode* as some -// instructions may, for example, require selection rather than swizzling for -// certain operands. For bytecode structure, see d3d12TokenizedProgramFormat.hpp -// from the Windows Driver Kit. +// Most important limitations: +// - This is very easy to hit, looks weird at first, and also not very important +// for modern drivers using DXILConv, but still needs to be respected for +// safety! 
One instruction can't accept more than one immediate or constant +// buffer source operand combined in total: +// and r0.x, CB0[0][0].x, l(1) +// and r0.x, CB0[0][0].x, CB0[0][0].y +// are illegal, even though pretty useful. Copy one of the operands to r#. +// - Absolute, negate and saturate are only supported by instructions that +// explicitly support them. +// - Component selection in the general case (ALU instructions - things like +// resource access and flow control mostly explicitly need a specific +// component selection mode defined in the specification of the instruction): +// - 0-component - for operand types with no data (samplers, labels). +// - 1-component - for scalar destination operand types, and for scalar source +// operand types when the destination vector has 1 component masked +// (including scalar immediates). +// - Mask - for vector destination operand types. +// - Swizzle - for both vector and scalar (replicated in this case) source +// operand types, when the destination vector has 2 or more components +// masked. Immediates in this case have XYZW swizzle. +// - Select 1 - for vector source operand types, when the destination has 1 +// component masked or is of a scalar type. +// - Input operands (v#) can be used only as sources, output operands (o#) can +// be used only as destinations. +// - The specification says that x#[] can be used wherever r# can be used, +// however, in tests, FXC only emits load/store mov instructions for x#[] +// (they are also counted in ArrayInstructions rather than MovInstructions in +// STAT), so it's better to only use mov for x#[]. The specification also +// permits using x#[] in relative addressing along with r# (as long as +// relative addressing isn't nested), but it's probably not very safe either. +// Don't do anything that FXC wouldn't do. +// TODO(Triang3l): Fix all places violating these rules - currently there are +// lots of them in Xenia! 
+// +// For bytecode structure, see d3d12TokenizedProgramFormat.hpp from the Windows +// Driver Kit. // // Avoid using uninitialized register components - such as registers written to // in "if" and not in "else", but then used outside unconditionally or with a From 61bcd467a6b63ffeaafb1894384e70a024f24554 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Mon, 23 Mar 2020 10:38:45 +0300 Subject: [PATCH 4/5] [Kernel/XAM] Rationale for XNotifyGetNext param_ptr null check --- src/xenia/kernel/xam/xam_notify.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/xenia/kernel/xam/xam_notify.cc b/src/xenia/kernel/xam/xam_notify.cc index 9fb4eda25..f7b009d9c 100644 --- a/src/xenia/kernel/xam/xam_notify.cc +++ b/src/xenia/kernel/xam/xam_notify.cc @@ -65,6 +65,8 @@ dword_result_t XNotifyGetNext(dword_t handle, dword_t match_id, dequeued = listener->DequeueNotification(&id, &param); } + // param_ptr may be null - Ghost Recon Advanced Warfighter 2 Demo. + // https://github.com/xenia-project/xenia/pull/1577 if (dequeued) { *id_ptr = id; if (param_ptr) { From 821a9897ef9e7c73a3251cd57e2f9c8bfeb0def4 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Mon, 23 Mar 2020 10:46:51 +0300 Subject: [PATCH 5/5] [Kernel/XAM] Even more detailed rationale for XNotifyGetNext param_ptr null check --- src/xenia/kernel/xam/xam_notify.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/xenia/kernel/xam/xam_notify.cc b/src/xenia/kernel/xam/xam_notify.cc index f7b009d9c..a45b29bdf 100644 --- a/src/xenia/kernel/xam/xam_notify.cc +++ b/src/xenia/kernel/xam/xam_notify.cc @@ -65,7 +65,8 @@ dword_result_t XNotifyGetNext(dword_t handle, dword_t match_id, dequeued = listener->DequeueNotification(&id, &param); } - // param_ptr may be null - Ghost Recon Advanced Warfighter 2 Demo. + // param_ptr may be null - Ghost Recon Advanced Warfighter 2 Demo explicitly + // passes nullptr in the code. // https://github.com/xenia-project/xenia/pull/1577 if (dequeued) { *id_ptr = id;