From 6d2e74325c0f1598e54f659850c6dca691da7502 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Wed, 10 Oct 2018 14:30:29 +0300 Subject: [PATCH] [D3D12] ROV: Check if supported --- .../gpu/d3d12/d3d12_command_processor.cc | 13 ++++--- src/xenia/gpu/d3d12/pipeline_cache.cc | 8 ++-- src/xenia/gpu/d3d12/pipeline_cache.h | 5 ++- src/xenia/gpu/d3d12/render_target_cache.cc | 14 +++++++ src/xenia/gpu/d3d12/render_target_cache.h | 37 +++++++++++++++++++ src/xenia/gpu/dxbc_shader_translator.cc | 3 +- src/xenia/gpu/dxbc_shader_translator.h | 5 ++- src/xenia/ui/d3d12/d3d12_provider.cc | 10 +++-- src/xenia/ui/d3d12/d3d12_provider.h | 4 ++ 9 files changed, 83 insertions(+), 16 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 02c282db7..4a3511d54 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -628,8 +628,6 @@ bool D3D12CommandProcessor::SetupContext() { return false; } - pipeline_cache_ = std::make_unique(this, register_file_); - texture_cache_ = std::make_unique(this, register_file_, shared_memory_.get()); if (!texture_cache_->Initialize()) { @@ -644,6 +642,9 @@ bool D3D12CommandProcessor::SetupContext() { return false; } + pipeline_cache_ = std::make_unique( + this, register_file_, render_target_cache_->IsROVUsedForEDRAM()); + primitive_converter_ = std::make_unique(this, register_file_, memory_); if (!primitive_converter_->Initialize()) { @@ -810,12 +811,12 @@ void D3D12CommandProcessor::ShutdownContext() { primitive_converter_.reset(); + pipeline_cache_.reset(); + render_target_cache_.reset(); texture_cache_.reset(); - pipeline_cache_.reset(); - // Root signatured are used by pipelines, thus freed after the pipelines. 
for (auto it : root_signatures_) { it.second->Release(); @@ -1021,12 +1022,12 @@ void D3D12CommandProcessor::PerformSwap(uint32_t frontbuffer_ptr, primitive_converter_->ClearCache(); + pipeline_cache_->ClearCache(); + render_target_cache_->ClearCache(); texture_cache_->ClearCache(); - pipeline_cache_->ClearCache(); - for (auto it : root_signatures_) { it.second->Release(); } diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index c6f7777fc..860de9aa9 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -36,9 +36,11 @@ namespace d3d12 { #include "xenia/gpu/d3d12/shaders/dxbc/primitive_rectangle_list_gs.h" PipelineCache::PipelineCache(D3D12CommandProcessor* command_processor, - RegisterFile* register_file) - : command_processor_(command_processor), register_file_(register_file) { - shader_translator_ = std::make_unique(); + RegisterFile* register_file, bool edram_rov_used) + : command_processor_(command_processor), + register_file_(register_file), + edram_rov_used_(edram_rov_used) { + shader_translator_ = std::make_unique(edram_rov_used_); // Set pipeline state description values we never change. // Zero out tessellation, stream output, blend state and formats for render diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index 5a01ff671..215950c59 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -35,7 +35,7 @@ class PipelineCache { }; PipelineCache(D3D12CommandProcessor* command_processor, - RegisterFile* register_file); + RegisterFile* register_file, bool edram_rov_used); ~PipelineCache(); void Shutdown(); @@ -85,6 +85,9 @@ class PipelineCache { D3D12CommandProcessor* command_processor_; RegisterFile* register_file_; + // Whether the output merger is emulated in pixel shaders. + bool edram_rov_used_; + // Reusable shader translator. 
std::unique_ptr shader_translator_ = nullptr; // All loaded shaders mapped by their guest hash key. diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index 2227b6f45..d2fa952ab 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -9,6 +9,8 @@ #include "xenia/gpu/d3d12/render_target_cache.h" +#include + #include #include #include @@ -23,6 +25,10 @@ #include "xenia/gpu/texture_util.h" #include "xenia/ui/d3d12/d3d12_util.h" +DEFINE_bool(d3d12_rov, false, + "Use rasterizer-ordered views for render target emulation where " + "available (experimental and currently largely unimplemented)."); + namespace xe { namespace gpu { namespace d3d12 { @@ -330,6 +336,14 @@ void RenderTargetCache::ClearCache() { } } +bool RenderTargetCache::IsROVUsedForEDRAM() const { + if (!FLAGS_d3d12_rov) { + return false; + } + auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); + return provider->AreRasterizerOrderedViewsSupported(); +} + void RenderTargetCache::BeginFrame() { ClearBindings(); diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index f2846f45a..83cc6649a 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -185,6 +185,38 @@ class D3D12CommandProcessor; // multisampled surface is the same as a single-sampled surface with 2x height // and width - however, format size doesn't effect the dimensions. Surface pitch // in the surface info register is single-sampled. 
+// +// ============================================================================= +// Rasterizer-ordered view usage: +// ============================================================================= +// +// There is a separate output merger emulation path currently in development, +// using rasterizer-ordered views for writing directly to the 10 MB EDRAM buffer +// instead of the host output merger for render target output. +// +// The conventional method of implementing Xenos render targets via host render +// targets has various flaws that may be impossible to fix: +// - k_16_16 and k_16_16_16_16 have -32...32 range on Xenos, but there's no +// equivalent format on PC APIs. They may be emulated using snorm16 (by +// dividing shader color output by 32) or float32, however, blending behaves +// incorrectly for both. In the former case, multiplicative blending may not +// work correctly - 1 becomes 1/32, and instead of 1 * 1 = 1, you get +// 1/32 * 1/32 = 1/1024. For 32-bit floats, additive blending result may go up +// to infinity. +// - k_2_10_10_10_FLOAT has similar blending issues, though less prominent, when +// emulated via float16 render targets. In addition to a greater range for +// RGB (values can go up to 65504 and infinity rather than 31.875), alpha is +// represented totally differently - in k_2_10_10_10_FLOAT, it may have only +// 4 values, and adding, for example, 0.1 to 0.333 will still result in 0.333, +// while with float16, it will be increasing, and the limit is infinity. 
+// - Due to simultaneously bound host render targets being independent from each +// other, and because the height is unknown (and the viewport and scissor are +// not always present - D3DPT_RECTLIST is used very commonly, especially for +// clearing (Direct3D 9 Clear is implemented this way on the Xbox 360) and +// copying, and it's usually drawn without a viewport and with 8192x8192 +// scissor), there may be cases of simultaneously bound render targets +// overlapping each other in the EDRAM in a way that is difficult to resolve, +// and stores/loads may destroy data. class RenderTargetCache { public: // Direct3D 12 debug layer does some kaschenit-style trolling by giving errors @@ -208,6 +240,11 @@ class RenderTargetCache { void Shutdown(); void ClearCache(); + // Should a rasterizer-ordered UAV of the EDRAM buffer with format conversion + // and blending performed in pixel shaders be used instead of host render + // targets. + bool IsROVUsedForEDRAM() const; + void BeginFrame(); // Called in the beginning of a draw call - may bind pipelines. bool UpdateRenderTargets(const D3D12Shader* pixel_shader); diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 2d12151c8..23d480d16 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -62,7 +62,8 @@ using namespace ucode; // second buffer in the descriptor array at b2, which is assigned to CB1, the // index would be CB1[3][0]. -DxbcShaderTranslator::DxbcShaderTranslator() { +DxbcShaderTranslator::DxbcShaderTranslator(bool edram_rovs_used) + : edram_rovs_used_(edram_rovs_used) { // Don't allocate again and again for the first shader. 
shader_code_.reserve(8192); shader_object_.reserve(16384); diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 9e84772b5..0595b6ed4 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -23,7 +23,7 @@ namespace gpu { // Generates shader model 5_1 byte code (for Direct3D 12). class DxbcShaderTranslator : public ShaderTranslator { public: - DxbcShaderTranslator(); + DxbcShaderTranslator(bool edram_rovs_used); ~DxbcShaderTranslator() override; // Constant buffer bindings in space 0. @@ -431,6 +431,9 @@ class DxbcShaderTranslator : public ShaderTranslator { // generated in the end of translation. std::vector shader_object_; + // Whether the output merger should be emulated in pixel shaders. + bool edram_rovs_used_; + // Data types used in constants buffers. Listed in dependency order. enum class RdefTypeIndex { kFloat, diff --git a/src/xenia/ui/d3d12/d3d12_provider.cc b/src/xenia/ui/d3d12/d3d12_provider.cc index 7bf7259ad..258dfa0e6 100644 --- a/src/xenia/ui/d3d12/d3d12_provider.cc +++ b/src/xenia/ui/d3d12/d3d12_provider.cc @@ -150,12 +150,13 @@ bool D3D12Provider::Initialize() { descriptor_size_dsv_ = device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_DSV); - // Check if tiled resources and programmable sample positions (programmable - // sample positions added in Creators Update) are supported. + // Check if optional features are supported. + rasterizer_ordered_views_supported_ = false; tiled_resources_tier_ = 0; D3D12_FEATURE_DATA_D3D12_OPTIONS options; if (SUCCEEDED(device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS, &options, sizeof(options)))) { + rasterizer_ordered_views_supported_ = options.ROVsSupported ? 
true : false; tiled_resources_tier_ = uint32_t(options.TiledResourcesTier); } programmable_sample_positions_tier_ = 0; @@ -167,8 +168,9 @@ bool D3D12Provider::Initialize() { } XELOGD3D( "Direct3D 12 device supports tiled resources tier %u, programmable " - "sample positions tier %u", - tiled_resources_tier_, programmable_sample_positions_tier_); + "sample positions tier %u; rasterizer-ordered views %ssupported", + tiled_resources_tier_, programmable_sample_positions_tier_, + rasterizer_ordered_views_supported_ ? "" : "un"); // Get the graphics analysis interface, will silently fail if PIX not // attached. diff --git a/src/xenia/ui/d3d12/d3d12_provider.h b/src/xenia/ui/d3d12/d3d12_provider.h index 99244d9dd..449a09473 100644 --- a/src/xenia/ui/d3d12/d3d12_provider.h +++ b/src/xenia/ui/d3d12/d3d12_provider.h @@ -62,6 +62,9 @@ class D3D12Provider : public GraphicsProvider { return start; } + bool AreRasterizerOrderedViewsSupported() const { + return rasterizer_ordered_views_supported_; + } uint32_t GetTiledResourcesTier() const { return tiled_resources_tier_; } uint32_t GetProgrammableSamplePositionsTier() const { return programmable_sample_positions_tier_; @@ -82,6 +85,7 @@ class D3D12Provider : public GraphicsProvider { uint32_t descriptor_size_rtv_; uint32_t descriptor_size_dsv_; + bool rasterizer_ordered_views_supported_; uint32_t tiled_resources_tier_; uint32_t programmable_sample_positions_tier_; };