From 890228b6f3480b498007916568fa948ad60228fb Mon Sep 17 00:00:00 2001 From: Triang3l Date: Fri, 4 Jan 2019 00:30:11 +0300 Subject: [PATCH] [D3D12] Prototype multithreaded PSO creation --- .../gpu/d3d12/d3d12_command_processor.cc | 31 ++-- src/xenia/gpu/d3d12/d3d12_command_processor.h | 15 +- src/xenia/gpu/d3d12/deferred_command_list.cc | 48 +++-- src/xenia/gpu/d3d12/deferred_command_list.h | 9 +- src/xenia/gpu/d3d12/pipeline_cache.cc | 168 +++++++++++++++--- src/xenia/gpu/d3d12/pipeline_cache.h | 43 ++++- 6 files changed, 255 insertions(+), 59 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index f75e1912a..f6b53311d 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -587,9 +587,10 @@ void D3D12CommandProcessor::SetSamplePositions(MsaaSamples sample_positions) { } void D3D12CommandProcessor::SetComputePipeline(ID3D12PipelineState* pipeline) { - if (current_pipeline_ != pipeline) { + if (current_external_pipeline_ != pipeline) { deferred_command_list_->D3DSetPipelineState(pipeline); - current_pipeline_ = pipeline; + current_external_pipeline_ = pipeline; + current_cached_pipeline_ = nullptr; } } @@ -600,9 +601,10 @@ void D3D12CommandProcessor::UnbindRenderTargets() { void D3D12CommandProcessor::SetExternalGraphicsPipeline( ID3D12PipelineState* pipeline, bool reset_viewport, bool reset_blend_factor, bool reset_stencil_ref) { - if (current_pipeline_ != pipeline) { + if (current_external_pipeline_ != pipeline) { deferred_command_list_->D3DSetPipelineState(pipeline); - current_pipeline_ = pipeline; + current_external_pipeline_ = pipeline; + current_cached_pipeline_ = nullptr; } current_graphics_root_signature_ = nullptr; current_graphics_root_up_to_date_ = 0; @@ -683,6 +685,10 @@ bool D3D12CommandProcessor::SetupContext() { pipeline_cache_ = std::make_unique(this, register_file_, IsROVUsedForEDRAM()); + if 
(!pipeline_cache_->Initialize()) { + XELOGE("Failed to initialize the graphics pipeline state cache"); + return false; + } primitive_converter_ = std::make_unique(this, register_file_, memory_); @@ -1279,17 +1285,19 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, pixel_shader != nullptr ? pixel_shader->GetUsedTextureMask() : 0); // Create the pipeline if needed and bind it. - ID3D12PipelineState* pipeline; + void* pipeline_handle; ID3D12RootSignature* root_signature; if (!pipeline_cache_->ConfigurePipeline( vertex_shader, pixel_shader, primitive_type_converted, indexed ? index_buffer_info->format : IndexFormat::kInt16, - pipeline_render_targets, &pipeline, &root_signature)) { + pipeline_render_targets, &pipeline_handle, &root_signature)) { return false; } - if (current_pipeline_ != pipeline) { - deferred_command_list_->D3DSetPipelineState(pipeline); - current_pipeline_ = pipeline; + if (current_cached_pipeline_ != pipeline_handle) { + deferred_command_list_->SetPipelineStateHandle( + reinterpret_cast(pipeline_handle)); + current_cached_pipeline_ = pipeline_handle; + current_external_pipeline_ = nullptr; } // Update viewport, scissor, blend factor and stencil reference. @@ -1600,7 +1608,8 @@ bool D3D12CommandProcessor::BeginFrame() { current_sample_positions_ = MsaaSamples::k1X; // Reset bindings, particularly because the buffers backing them are recycled. 
- current_pipeline_ = nullptr; + current_cached_pipeline_ = nullptr; + current_external_pipeline_ = nullptr; current_graphics_root_signature_ = nullptr; current_graphics_root_up_to_date_ = 0; current_view_heap_ = nullptr; @@ -1656,6 +1665,8 @@ bool D3D12CommandProcessor::EndFrame() { primitive_converter_->EndFrame(); + pipeline_cache_->EndFrame(); + render_target_cache_->EndFrame(); texture_cache_->EndFrame(); diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 7788532ef..a8a2b2566 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -114,6 +114,12 @@ class D3D12CommandProcessor : public CommandProcessor { // render targets or copying to depth render targets. void SetSamplePositions(MsaaSamples sample_positions); + // Returns a pipeline with deferred creation by its handle. May return nullptr + // if failed to create the pipeline. + inline ID3D12PipelineState* GetPipelineStateByHandle(void* handle) const { + return pipeline_cache_->GetPipelineStateByHandle(handle); + } + // Sets the current pipeline state to a compute pipeline. This is for cache // invalidation primarily. A frame must be open. void SetComputePipeline(ID3D12PipelineState* pipeline); @@ -292,8 +298,13 @@ class D3D12CommandProcessor : public CommandProcessor { // Current SSAA sample positions (to be updated by the render target cache). MsaaSamples current_sample_positions_; - // Currently bound graphics or compute pipeline. - ID3D12PipelineState* current_pipeline_; + // Currently bound pipeline, either a graphics pipeline from the pipeline + // cache (with potentially deferred creation - current_external_pipeline_ is + // nullptr in this case) or a non-Xenos graphics or compute pipeline + // (current_cached_pipeline_ is nullptr in this case). + void* current_cached_pipeline_; + ID3D12PipelineState* current_external_pipeline_; + // Currently bound graphics root signature. 
ID3D12RootSignature* current_graphics_root_signature_; // Extra parameters which may or may not be present. diff --git a/src/xenia/gpu/d3d12/deferred_command_list.cc b/src/xenia/gpu/d3d12/deferred_command_list.cc index 7d2767eb0..1a1a5a7f8 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.cc +++ b/src/xenia/gpu/d3d12/deferred_command_list.cc @@ -31,6 +31,7 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list, ID3D12GraphicsCommandList1* command_list_1) { const uint8_t* stream = command_stream_.data(); size_t stream_remaining = command_stream_.size(); + ID3D12PipelineState* current_pipeline_state = nullptr; while (stream_remaining != 0) { const uint32_t* header = reinterpret_cast(stream); const size_t header_size = xe::align(2 * sizeof(uint32_t), kAlignment); @@ -53,25 +54,32 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list, command_list->CopyTextureRegion(&args.dst, 0, 0, 0, &args.src, nullptr); } break; case Command::kD3DDispatch: { - auto& args = *reinterpret_cast(stream); - command_list->Dispatch(args.thread_group_count_x, - args.thread_group_count_y, - args.thread_group_count_z); + if (current_pipeline_state != nullptr) { + auto& args = *reinterpret_cast(stream); + command_list->Dispatch(args.thread_group_count_x, + args.thread_group_count_y, + args.thread_group_count_z); + } } break; case Command::kD3DDrawIndexedInstanced: { - auto& args = - *reinterpret_cast(stream); - command_list->DrawIndexedInstanced( - args.index_count_per_instance, args.instance_count, - args.start_index_location, args.base_vertex_location, - args.start_instance_location); + if (current_pipeline_state != nullptr) { + auto& args = + *reinterpret_cast( + stream); + command_list->DrawIndexedInstanced( + args.index_count_per_instance, args.instance_count, + args.start_index_location, args.base_vertex_location, + args.start_instance_location); + } } break; case Command::kD3DDrawInstanced: { - auto& args = - *reinterpret_cast(stream); - 
command_list->DrawInstanced( - args.vertex_count_per_instance, args.instance_count, - args.start_vertex_location, args.start_instance_location); + if (current_pipeline_state != nullptr) { + auto& args = + *reinterpret_cast(stream); + command_list->DrawInstanced( + args.vertex_count_per_instance, args.instance_count, + args.start_vertex_location, args.start_instance_location); + } } break; case Command::kD3DIASetIndexBuffer: { auto view = reinterpret_cast(stream); @@ -176,8 +184,14 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list, descriptor_heaps); } break; case Command::kD3DSetPipelineState: { - command_list->SetPipelineState( - *reinterpret_cast(stream)); + current_pipeline_state = + *reinterpret_cast(stream); + command_list->SetPipelineState(current_pipeline_state); + } break; + case Command::kSetPipelineStateHandle: { + current_pipeline_state = command_processor_->GetPipelineStateByHandle( + *reinterpret_cast(stream)); + command_list->SetPipelineState(current_pipeline_state); } break; case Command::kD3DSetSamplePositions: { if (command_list_1 != nullptr) { diff --git a/src/xenia/gpu/d3d12/deferred_command_list.h b/src/xenia/gpu/d3d12/deferred_command_list.h index 3dd6644b6..a7670eefa 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.h +++ b/src/xenia/gpu/d3d12/deferred_command_list.h @@ -280,6 +280,12 @@ class DeferredCommandList { arg = pipeline_state; } + inline void SetPipelineStateHandle(void* pipeline_state_handle) { + auto& arg = *reinterpret_cast( + WriteCommand(Command::kSetPipelineStateHandle, sizeof(void*))); + arg = pipeline_state_handle; + } + inline void D3DSetSamplePositions( UINT num_samples_per_pixel, UINT num_pixels, const D3D12_SAMPLE_POSITION* sample_positions) { @@ -321,6 +327,7 @@ class DeferredCommandList { kD3DSetGraphicsRootSignature, kSetDescriptorHeaps, kD3DSetPipelineState, + kSetPipelineStateHandle, kD3DSetSamplePositions, }; @@ -368,7 +375,7 @@ class DeferredCommandList { bool 
rts_single_handle_to_descriptor_range; bool depth_stencil; D3D12_CPU_DESCRIPTOR_HANDLE - render_target_descriptors[D3D12_SIMULTANEOUS_RENDER_TARGET_COUNT]; + render_target_descriptors[D3D12_SIMULTANEOUS_RENDER_TARGET_COUNT]; D3D12_CPU_DESCRIPTOR_HANDLE depth_stencil_descriptor; }; diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index 561df6abd..27f421944 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -68,11 +68,85 @@ PipelineCache::PipelineCache(D3D12CommandProcessor* command_processor, depth_only_pixel_shader_ = std::move(shader_translator_->CreateDepthOnlyPixelShader()); } + + creation_completion_event_ = + xe::threading::Event::CreateManualResetEvent(true); } PipelineCache::~PipelineCache() { Shutdown(); } -void PipelineCache::Shutdown() { ClearCache(); } +bool PipelineCache::Initialize() { + creation_threads_busy_ = 0; + creation_completion_set_event_ = false; + creation_threads_shutdown_ = false; + // TODO(Triang3l): Change the thread count to something non-fixed (3 is just + // for testing). + for (uint32_t i = 0; i < 3; ++i) { + std::unique_ptr creation_thread = + xe::threading::Thread::Create({}, [this]() { CreationThread(); }); + creation_thread->set_name("D3D12 Pipelines"); + creation_threads_.push_back(std::move(creation_thread)); + } + return true; +} + +void PipelineCache::Shutdown() { + ClearCache(); + + // Shut down all threads. + { + std::lock_guard lock(creation_request_lock_); + creation_threads_shutdown_ = true; + } + creation_request_cond_.notify_all(); + for (size_t i = 0; i < creation_threads_.size(); ++i) { + xe::threading::Wait(creation_threads_[i].get(), false); + } + creation_threads_.clear(); +} + +void PipelineCache::ClearCache() { + // Remove references to the current pipeline. + current_pipeline_ = nullptr; + + // Empty the pipeline creation queue. 
+ { + std::lock_guard lock(creation_request_lock_); + creation_queue_.clear(); + creation_completion_set_event_ = true; + } + creation_request_cond_.notify_one(); + + // Destroy all pipelines. + for (auto it : pipelines_) { + it.second->state->Release(); + delete it.second; + } + pipelines_.clear(); + COUNT_profile_set("gpu/pipeline_cache/pipelines", 0); + + // Destroy all shaders. + for (auto it : shader_map_) { + delete it.second; + } + shader_map_.clear(); +} + +void PipelineCache::EndFrame() { + // Await creation of all queued pipelines. + bool await_event = false; + { + std::lock_guard lock(creation_request_lock_); + if (!creation_queue_.empty() || creation_threads_busy_ != 0) { + creation_completion_event_->Reset(); + creation_completion_set_event_ = true; + await_event = true; + } + } + if (await_event) { + xe::threading::Wait(creation_completion_event_.get(), false); + } +} D3D12Shader* PipelineCache::LoadShader(ShaderType shader_type, uint32_t guest_address, @@ -127,13 +201,12 @@ bool PipelineCache::ConfigurePipeline( D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, PrimitiveType primitive_type, IndexFormat index_format, const RenderTargetCache::PipelineRenderTarget render_targets[5], - ID3D12PipelineState** pipeline_out, - ID3D12RootSignature** root_signature_out) { + void** pipeline_handle_out, ID3D12RootSignature** root_signature_out) { #if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // FINE_GRAINED_DRAW_SCOPES - assert_not_null(pipeline_out); + assert_not_null(pipeline_handle_out); assert_not_null(root_signature_out); PipelineDescription description; @@ -145,7 +218,7 @@ bool PipelineCache::ConfigurePipeline( if (current_pipeline_ != nullptr && !std::memcmp(&current_pipeline_->description, &description, sizeof(description))) { - *pipeline_out = current_pipeline_->state; + *pipeline_handle_out = current_pipeline_; *root_signature_out = description.root_signature; return true; } @@ -158,12 +231,30 @@ bool PipelineCache::ConfigurePipeline( 
if (!std::memcmp(&found_pipeline->description, &description, sizeof(description))) { current_pipeline_ = found_pipeline; - *pipeline_out = found_pipeline->state; + *pipeline_handle_out = found_pipeline; *root_signature_out = found_pipeline->description.root_signature; return true; } } +#if 1 + if (!EnsureShadersTranslated(vertex_shader, pixel_shader, primitive_type)) { + return false; + } + + Pipeline* new_pipeline = new Pipeline; + new_pipeline->state = nullptr; + std::memcpy(&new_pipeline->description, &description, sizeof(description)); + pipelines_.insert(std::make_pair(hash, new_pipeline)); + COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size()); + + // Submit the pipeline for creation to any available thread. + { + std::lock_guard lock(creation_request_lock_); + creation_queue_.push_back(new_pipeline); + } + creation_request_cond_.notify_one(); +#else // Create a new pipeline if not found and add it to the cache. if (pixel_shader != nullptr) { XELOGGPU("Creating pipeline %.16" PRIX64 ", VS %.16" PRIX64 @@ -187,32 +278,14 @@ bool PipelineCache::ConfigurePipeline( std::memcpy(&new_pipeline->description, &description, sizeof(description)); pipelines_.insert(std::make_pair(hash, new_pipeline)); COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size()); +#endif current_pipeline_ = new_pipeline; - *pipeline_out = new_state; + *pipeline_handle_out = new_pipeline; *root_signature_out = description.root_signature; return true; } -void PipelineCache::ClearCache() { - // Remove references to the current pipeline. - current_pipeline_ = nullptr; - - // Destroy all pipelines. - for (auto it : pipelines_) { - it.second->state->Release(); - delete it.second; - } - pipelines_.clear(); - COUNT_profile_set("gpu/pipeline_cache/pipelines", 0); - - // Destroy all shaders. 
- for (auto it : shader_map_) { - delete it.second; - } - shader_map_.clear(); -} - bool PipelineCache::TranslateShader(D3D12Shader* shader, xenos::xe_gpu_program_cntl_t cntl, PrimitiveType primitive_type) { @@ -942,6 +1015,49 @@ ID3D12PipelineState* PipelineCache::CreatePipelineState( return state; } +void PipelineCache::CreationThread() { + while (true) { + Pipeline* pipeline_to_create = nullptr; + + // Check if need to shut down or set the completion event and dequeue the + // pipeline if there is any. + { + std::unique_lock lock(creation_request_lock_); + if (creation_threads_shutdown_ || creation_queue_.empty()) { + if (creation_completion_set_event_ && creation_threads_busy_ == 0) { + // Last pipeline in the queue created - signal the event if requested. + creation_completion_set_event_ = false; + creation_completion_event_->Set(); + } + if (creation_threads_shutdown_) { + return; + } + creation_request_cond_.wait(lock); + continue; + } + // Take the pipeline from the queue and increment the busy thread count + // until the pipeline is created - other threads must be able to dequeue + // requests, but can't set the completion event until the pipelines are + // fully created (rather than just started creating). + pipeline_to_create = creation_queue_.front(); + creation_queue_.pop_front(); + ++creation_threads_busy_; + } + + // Create the pipeline. + pipeline_to_create->state = + CreatePipelineState(pipeline_to_create->description); + + // Pipeline created - the thread is not busy anymore, safe to set the + // completion event if needed (at the next iteration, or in some other + // thread). 
+ { + std::unique_lock lock(creation_request_lock_); + --creation_threads_busy_; + } + } +} + } // namespace d3d12 } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index 4df2fe7a3..f0edb68fb 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -10,9 +10,15 @@ #ifndef XENIA_GPU_D3D12_PIPELINE_CACHE_H_ #define XENIA_GPU_D3D12_PIPELINE_CACHE_H_ +#include +#include +#include +#include +#include #include #include +#include "xenia/base/threading.h" #include "xenia/gpu/d3d12/d3d12_shader.h" #include "xenia/gpu/d3d12/render_target_cache.h" #include "xenia/gpu/dxbc_shader_translator.h" @@ -31,7 +37,11 @@ class PipelineCache { RegisterFile* register_file, bool edram_rov_used); ~PipelineCache(); + bool Initialize(); void Shutdown(); + void ClearCache(); + + void EndFrame(); D3D12Shader* LoadShader(ShaderType shader_type, uint32_t guest_address, const uint32_t* host_address, uint32_t dword_count); @@ -45,10 +55,13 @@ class PipelineCache { D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, PrimitiveType primitive_type, IndexFormat index_format, const RenderTargetCache::PipelineRenderTarget render_targets[5], - ID3D12PipelineState** pipeline_out, - ID3D12RootSignature** root_signature_out); + void** pipeline_handle_out, ID3D12RootSignature** root_signature_out); - void ClearCache(); + // Returns a pipeline with deferred creation by its handle. May return nullptr + // if failed to create the pipeline. + inline ID3D12PipelineState* GetPipelineStateByHandle(void* handle) const { + return reinterpret_cast(handle)->state; + } private: enum class PipelineStripCutIndex : uint32_t { @@ -184,6 +197,7 @@ class PipelineCache { std::vector depth_only_pixel_shader_; struct Pipeline { + // nullptr if creation has failed. 
ID3D12PipelineState* state; PipelineDescription description; }; @@ -194,6 +208,29 @@ class PipelineCache { // and allows us to quickly(ish) reuse the pipeline if no registers have // changed. Pipeline* current_pipeline_ = nullptr; + + // Pipeline creation threads. + void CreationThread(); + std::mutex creation_request_lock_; + std::condition_variable creation_request_cond_; + // Protected with creation_request_lock_, notify_one creation_request_cond_ + // when set. + std::deque<Pipeline*> creation_queue_; + // Number of threads that are currently creating a pipeline - incremented when + // a pipeline is dequeued (the completion event can't be triggered before this + // is zero). Protected with creation_request_lock_. + uint32_t creation_threads_busy_ = 0; + // Manual-reset event set when the last queued pipeline is created and there + // are no more pipelines to create. This is triggered by the thread creating + // the last pipeline. + std::unique_ptr<xe::threading::Event> creation_completion_event_; + // Whether setting the event on completion is queued. Protected with + // creation_request_lock_, notify_one creation_request_cond_ when set. + bool creation_completion_set_event_ = false; + // Whether to shut down the creation threads as soon as possible. Protected + // with creation_request_lock_, notify_all creation_request_cond_ when set. + bool creation_threads_shutdown_ = false; + std::vector<std::unique_ptr<xe::threading::Thread>> creation_threads_; }; } // namespace d3d12