From 9194c3f8b0e981de2a4226ed646d9eedc749c26d Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sat, 6 Oct 2018 20:27:48 +0300 Subject: [PATCH] [D3D12] Primitive converter cache and strip restart, texture invalidation acquire/release --- .../gpu/d3d12/d3d12_command_processor.cc | 30 +-- src/xenia/gpu/d3d12/primitive_converter.cc | 206 +++++++++++++++--- src/xenia/gpu/d3d12/primitive_converter.h | 70 ++++-- src/xenia/gpu/d3d12/texture_cache.cc | 4 +- src/xenia/gpu/d3d12/texture_cache.h | 3 +- 5 files changed, 237 insertions(+), 76 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 1a0625b4a..52d0588a8 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -632,8 +632,8 @@ bool D3D12CommandProcessor::SetupContext() { return false; } - primitive_converter_ = std::make_unique( - this, register_file_, memory_, shared_memory_.get()); + primitive_converter_ = + std::make_unique(this, register_file_, memory_); if (!primitive_converter_->Initialize()) { XELOGE("Failed to initialize the geometric primitive converter"); return false; @@ -1060,30 +1060,6 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, return true; } - bool indexed = index_buffer_info != nullptr && index_buffer_info->guest_base; - if (indexed && regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32 & (1 << 21)) { - uint32_t reset_index = regs[XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX].u32; - uint32_t reset_index_expected; - if (index_buffer_info->format == IndexFormat::kInt32) { - reset_index_expected = 0xFFFFFFFFu; - } else { - reset_index_expected = 0xFFFFu; - } - if (reset_index != reset_index_expected) { - // Only 0xFFFF and 0xFFFFFFFF primitive restart indices are supported by - // Direct3D 12 (endianness doesn't matter for them). With shared memory, - // it's impossible to replace the cut index in the buffer without - // affecting the game memory. 
- XELOGE( - "The game uses the primitive restart index 0x%X that isn't 0xFFFF or " - "0xFFFFFFFF. Report the game to Xenia developers so geometry shaders " - "will be added to handle this!", - reset_index); - assert_always(); - return false; - } - } - // Shaders will have already been defined by previous loads. // We need them to do just about anything so validate here. auto vertex_shader = static_cast(active_vertex_shader()); @@ -1122,6 +1098,8 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, const RenderTargetCache::PipelineRenderTarget* pipeline_render_targets = render_target_cache_->GetCurrentPipelineRenderTargets(); + bool indexed = index_buffer_info != nullptr && index_buffer_info->guest_base; + // Set the primitive topology. PrimitiveType primitive_type_converted = PrimitiveConverter::GetReplacementPrimitiveType(primitive_type); diff --git a/src/xenia/gpu/d3d12/primitive_converter.cc b/src/xenia/gpu/d3d12/primitive_converter.cc index 36f7a00bc..db4fa493f 100644 --- a/src/xenia/gpu/d3d12/primitive_converter.cc +++ b/src/xenia/gpu/d3d12/primitive_converter.cc @@ -14,6 +14,7 @@ #include "xenia/base/assert.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" +#include "xenia/base/memory.h" #include "xenia/base/platform.h" #include "xenia/gpu/d3d12/d3d12_command_processor.h" #include "xenia/ui/d3d12/d3d12_util.h" @@ -24,12 +25,12 @@ namespace d3d12 { PrimitiveConverter::PrimitiveConverter(D3D12CommandProcessor* command_processor, RegisterFile* register_file, - Memory* memory, - SharedMemory* shared_memory) + Memory* memory) : command_processor_(command_processor), register_file_(register_file), - memory_(memory), - shared_memory_(shared_memory) {} + memory_(memory) { + system_page_size_ = uint32_t(memory::page_size()); +} PrimitiveConverter::~PrimitiveConverter() { Shutdown(); } @@ -94,10 +95,18 @@ bool PrimitiveConverter::Initialize() { } static_ib_gpu_address_ = static_ib_->GetGPUVirtualAddress(); + 
memory_regions_invalidated_.store(0ull, std::memory_order_relaxed); + physical_write_watch_handle_ = + memory_->RegisterPhysicalWriteWatch(MemoryWriteCallbackThunk, this); + return true; } void PrimitiveConverter::Shutdown() { + if (physical_write_watch_handle_ != nullptr) { + memory_->UnregisterPhysicalWriteWatch(physical_write_watch_handle_); + physical_write_watch_handle_ = nullptr; + } ui::d3d12::util::ReleaseAndNull(static_ib_); ui::d3d12::util::ReleaseAndNull(static_ib_upload_); buffer_pool_.reset(); @@ -106,8 +115,6 @@ void PrimitiveConverter::Shutdown() { void PrimitiveConverter::ClearCache() { buffer_pool_->ClearCache(); } void PrimitiveConverter::BeginFrame() { - buffer_pool_->BeginFrame(); - // Got a command list now - upload and transition the static index buffer if // needed. if (static_ib_upload_ != nullptr) { @@ -126,6 +133,11 @@ void PrimitiveConverter::BeginFrame() { static_ib_upload_ = nullptr; } } + + buffer_pool_->BeginFrame(); + + converted_indices_cache_.clear(); + memory_regions_used_ = 0; } void PrimitiveConverter::EndFrame() { buffer_pool_->EndFrame(); } @@ -142,6 +154,7 @@ PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives( PrimitiveType source_type, uint32_t address, uint32_t index_count, IndexFormat index_format, Endian index_endianness, D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out, uint32_t& index_count_out) { + bool index_32bit = index_format == IndexFormat::kInt32; auto& regs = *register_file_; bool reset = (regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32 & (1 << 21)) != 0; // Swap the reset index because we will be comparing unswapped values to it. @@ -150,40 +163,79 @@ PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives( // If the specified reset index is the same as the one used by Direct3D 12 // (0xFFFF or 0xFFFFFFFF - in the pipeline cache, we use the former for // 16-bit and the latter for 32-bit indices), we can use the buffer directly. 
- uint32_t reset_index_host = - index_format == IndexFormat::kInt32 ? 0xFFFFFFFFu : 0xFFFFu; + uint32_t reset_index_host = index_32bit ? 0xFFFFFFFFu : 0xFFFFu; // Check if need to convert at all. if (source_type != PrimitiveType::kTriangleFan) { if (!reset || reset_index == reset_index_host) { return ConversionResult::kConversionNotNeeded; } - if (source_type != PrimitiveType::kTriangleStrip || + if (source_type != PrimitiveType::kTriangleStrip && source_type != PrimitiveType::kLineStrip) { return ConversionResult::kConversionNotNeeded; } - // TODO(Triang3l): Write conversion for triangle and line strip reset index - // and for indexed line loops. - return ConversionResult::kConversionNotNeeded; } // Exit early for clearly empty draws, without even reading the memory. - if (source_type == PrimitiveType::kTriangleFan || - source_type == PrimitiveType::kTriangleStrip) { - if (index_count < 3) { - return ConversionResult::kPrimitiveEmpty; - } - } else if (source_type == PrimitiveType::kLineStrip || - source_type == PrimitiveType::kLineLoop) { - if (index_count < 2) { - return ConversionResult::kPrimitiveEmpty; - } + uint32_t index_count_min; + if (source_type == PrimitiveType::kLineStrip || + source_type == PrimitiveType::kLineLoop) { + index_count_min = 2; + } else { + index_count_min = 3; + } + if (index_count < index_count_min) { + return ConversionResult::kPrimitiveEmpty; } - // TODO(Triang3l): Find the converted data in the cache. + // Invalidate the cache if data behind any entry was modified. + if (memory_regions_invalidated_.exchange(0ull, std::memory_order_acquire) & + memory_regions_used_) { + converted_indices_cache_.clear(); + memory_regions_used_ = 0; + } - // Calculate the index count, and also check if there's nothing to convert in - // the buffer (for instance, if not using primitive reset). + address &= index_32bit ? 0x1FFFFFFC : 0x1FFFFFFE; + uint32_t index_size = index_32bit ? 
sizeof(uint32_t) : sizeof(uint16_t); + uint32_t address_last = address + index_size * (index_count - 1); + + // Create the cache entry, currently only for the key. + ConvertedIndices converted_indices; + converted_indices.key.address = address; + converted_indices.key.source_type = source_type; + converted_indices.key.format = index_format; + converted_indices.key.count = index_count; + converted_indices.key.reset = reset ? 1 : 0; + converted_indices.reset_index = reset_index; + + // Try to find the previously converted index buffer. + auto found_range = + converted_indices_cache_.equal_range(converted_indices.key.value); + for (auto iter = found_range.first; iter != found_range.second; ++iter) { + const ConvertedIndices& found_converted = iter->second; + if (reset && found_converted.reset_index != reset_index) { + continue; + } + if (found_converted.converted_index_count == 0) { + return ConversionResult::kPrimitiveEmpty; + } + if (!found_converted.gpu_address) { + return ConversionResult::kConversionNotNeeded; + } + gpu_address_out = found_converted.gpu_address; + index_count_out = found_converted.converted_index_count; + return ConversionResult::kConverted; + } + + // Get the memory usage mask for cache invalidation. + // 1 bit = (512 / 64) MB = 8 MB. + uint64_t memory_regions_used_bits = ~((1ull << (address >> 23)) - 1); + if (address_last < (63 << 23)) { + memory_regions_used_bits = (1ull << ((address_last >> 23) + 1)) - 1; + } + + // Calculate the new index count, and also check if there's nothing to convert + // in the buffer (for instance, if not actually using primitive reset). 
uint32_t converted_index_count = 0; bool conversion_needed = false; bool simd = false; @@ -196,22 +248,44 @@ PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives( } else { converted_index_count = 3 * (index_count - 2); } + } else if (source_type == PrimitiveType::kTriangleStrip || + source_type == PrimitiveType::kLineStrip) { + // TODO(Triang3l): Check if the restart index is used at all in this buffer. + conversion_needed = true; + converted_index_count = index_count; + simd = true; + } + converted_indices.converted_index_count = converted_index_count; + + // If nothing to convert, store this result so the check won't be happening + // again and again and exit. + if (!conversion_needed || converted_index_count == 0) { + converted_indices.gpu_address = 0; + converted_indices_cache_.insert( + std::make_pair(converted_indices.key.value, converted_indices)); + memory_regions_used_ |= memory_regions_used_bits; + return converted_index_count == 0 ? ConversionResult::kPrimitiveEmpty + : ConversionResult::kConversionNotNeeded; } + // Convert. + union { - void* source; - uint16_t* source_16; - uint32_t* source_32; + const void* source; + const uint8_t* source_8; + const uint16_t* source_16; + const uint32_t* source_32; }; source = memory_->TranslatePhysical(address); union { void* target; + uint8_t* target_8; uint16_t* target_16; uint32_t* target_32; }; D3D12_GPU_VIRTUAL_ADDRESS gpu_address; - target = AllocateIndices(index_format, index_count, simd ? address & 15 : 0, - gpu_address); + target = AllocateIndices(index_format, converted_index_count, + simd ? 
address & 15 : 0, gpu_address); if (target == nullptr) { return ConversionResult::kFailed; } @@ -237,11 +311,62 @@ PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives( } } } + } else if (source_type == PrimitiveType::kTriangleStrip || + source_type == PrimitiveType::kLineStrip) { + // Replace the reset index with the maximum representable value - vector OR + // gives 0 or 0xFFFF/0xFFFFFFFF, which is exactly what is needed. + // Allocations in the target index buffer are aligned with 16-byte + // granularity, and within 16-byte vectors, both the source and the target + // start at the same offset. +#if XE_ARCH_AMD64 + source = reinterpret_cast(reinterpret_cast(source) & + ~(uintptr_t(15))); + target = reinterpret_cast(reinterpret_cast(target) & + ~(uintptr_t(15))); + uint32_t vector_count = (address_last >> 4) - (address >> 4) + 1; + if (index_format == IndexFormat::kInt32) { + __m128i reset_index_vector = _mm_set1_epi32(reset_index); + for (uint32_t i = 0; i < vector_count; ++i) { + __m128i indices_vector = + _mm_load_si128(reinterpret_cast(&source_8[i << 4])); + __m128i indices_are_reset_vector = + _mm_cmpeq_epi32(indices_vector, reset_index_vector); + _mm_store_si128(reinterpret_cast<__m128i*>(&target_8[i << 4]), + _mm_or_si128(indices_vector, indices_are_reset_vector)); + } + } else { + __m128i reset_index_vector = _mm_set1_epi16(reset_index); + for (uint32_t i = 0; i < vector_count; ++i) { + __m128i indices_vector = + _mm_load_si128(reinterpret_cast(&source_8[i << 4])); + __m128i indices_are_reset_vector = + _mm_cmpeq_epi16(indices_vector, reset_index_vector); + _mm_store_si128(reinterpret_cast<__m128i*>(&target_8[i << 4]), + _mm_or_si128(indices_vector, indices_are_reset_vector)); + } + } +#else + if (index_format == IndexFormat::kInt32) { + for (uint32_t i = 0; i < index_count; ++i) { + uint32_t index = source_32[i]; + target_32[i] = index == reset_index ? 
0xFFFFFFFFu : index; + } + } else { + for (uint32_t i = 0; i < index_count; ++i) { + uint16_t index = source_16[i]; + target_16[i] = index == reset_index ? 0xFFFFu : index; + } + } +#endif } - // TODO(Triang3l): Replace primitive reset index in triangle and line strips. // TODO(Triang3l): Line loops. + // Cache and return the indices. + converted_indices.gpu_address = gpu_address; + converted_indices_cache_.insert( + std::make_pair(converted_indices.key.value, converted_indices)); + memory_regions_used_ |= memory_regions_used_bits; gpu_address_out = gpu_address; index_count_out = converted_index_count; return ConversionResult::kConverted; @@ -277,6 +402,25 @@ void* PrimitiveConverter::AllocateIndices( return mapping + simd_offset; } +void PrimitiveConverter::MemoryWriteCallback(uint32_t page_first, + uint32_t page_last) { + // 1 bit = (512 / 64) MB = 8 MB. Invalidate a region of this size. + uint32_t bit_index_first = (page_first * system_page_size_) >> 23; + uint32_t bit_index_last = (page_last * system_page_size_) >> 23; + uint64_t bits = ~((1ull << bit_index_first) - 1); + if (bit_index_last < 63) { + bits &= (1ull << (bit_index_last + 1)) - 1; + } + memory_regions_invalidated_ |= bits; +} + +void PrimitiveConverter::MemoryWriteCallbackThunk(void* context_ptr, + uint32_t page_first, + uint32_t page_last) { + reinterpret_cast(context_ptr) + ->MemoryWriteCallback(page_first, page_last); +} + D3D12_GPU_VIRTUAL_ADDRESS PrimitiveConverter::GetStaticIndexBuffer( PrimitiveType source_type, uint32_t index_count, uint32_t& index_count_out) const { diff --git a/src/xenia/gpu/d3d12/primitive_converter.h b/src/xenia/gpu/d3d12/primitive_converter.h index cad6260e9..39fb5dcf5 100644 --- a/src/xenia/gpu/d3d12/primitive_converter.h +++ b/src/xenia/gpu/d3d12/primitive_converter.h @@ -14,11 +14,11 @@ #include #include -#include "xenia/gpu/d3d12/shared_memory.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/xenos.h" #include "xenia/memory.h" #include 
"xenia/ui/d3d12/d3d12_context.h" +#include "xenia/ui/d3d12/pools.h" namespace xe { namespace gpu { @@ -35,8 +35,7 @@ class D3D12CommandProcessor; class PrimitiveConverter { public: PrimitiveConverter(D3D12CommandProcessor* command_processor, - RegisterFile* register_file, Memory* memory, - SharedMemory* shared_memory); + RegisterFile* register_file, Memory* memory); ~PrimitiveConverter(); bool Initialize(); @@ -78,8 +77,6 @@ class PrimitiveConverter { D3D12_GPU_VIRTUAL_ADDRESS GetStaticIndexBuffer( PrimitiveType source_type, uint32_t index_count, uint32_t& index_count_out) const; - // TODO(Triang3l): A function that returns a static index buffer for - // non-indexed drawing of unsupported primitives private: // simd_offset is source address & 15 - if SIMD is used, the source and the @@ -89,10 +86,14 @@ class PrimitiveConverter { uint32_t simd_offset, D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out); + // Callback for invalidating buffers mid-frame. + void MemoryWriteCallback(uint32_t page_first, uint32_t page_last); + static void MemoryWriteCallbackThunk(void* context_ptr, uint32_t page_first, + uint32_t page_last); + D3D12CommandProcessor* command_processor_; RegisterFile* register_file_; Memory* memory_; - SharedMemory* shared_memory_; std::unique_ptr buffer_pool_ = nullptr; @@ -113,17 +114,56 @@ class PrimitiveConverter { static constexpr uint32_t kStaticIBTotalCount = kStaticIBTriangleFanOffset + kStaticIBTriangleFanCount; - struct ConvertedIndices { - D3D12_GPU_VIRTUAL_ADDRESS gpu_address; - PrimitiveType primitive_type; - uint32_t index_count; - IndexFormat index_format; - // Index pre-swapped - in guest storage endian. - uint32_t reset_index; - bool reset; + // Not identifying the index buffer uniquely - reset index must also be + // checked if reset is enabled. 
+ union ConvertedIndicesKey { + uint64_t value; + struct { + uint32_t address; // 32 + PrimitiveType source_type : 6; // 38 + IndexFormat format : 1; // 39 + uint32_t count : 16; // 55 + uint32_t reset : 1; // 56 + }; + + // Clearing the unused bits. + ConvertedIndicesKey() : value(0) {} + ConvertedIndicesKey(const ConvertedIndicesKey& key) : value(key.value) {} + ConvertedIndicesKey& operator=(const ConvertedIndicesKey& key) { + value = key.value; + return *this; + } + bool operator==(const ConvertedIndicesKey& key) const { + return value == key.value; + } + bool operator!=(const ConvertedIndicesKey& key) const { + return value != key.value; + } }; + + struct ConvertedIndices { + ConvertedIndicesKey key; + // If reset is enabled, this also must be checked to find cached indices. + uint32_t reset_index; + + // Zero GPU address if conversion not needed or the resulting index buffer + // is empty. + D3D12_GPU_VIRTUAL_ADDRESS gpu_address; + // When conversion is not needed, this must be equal to the original index + // count. + uint32_t converted_index_count; + }; + // Cache for a single frame. - std::unordered_multimap converted_indices_; + std::unordered_multimap converted_indices_cache_; + + // Very coarse cache invalidation - if something is modified in an 8 MB + // portion of the physical memory and converted indices are also there, + // invalidate the whole cache. 
+ uint64_t memory_regions_used_; + std::atomic memory_regions_invalidated_ = 0; + void* physical_write_watch_handle_ = nullptr; + uint32_t system_page_size_; }; } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index 3455529d1..3f783a826 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -440,7 +440,7 @@ void TextureCache::RequestTextures(uint32_t used_vertex_texture_mask, SCOPE_profile_cpu_f("gpu"); #endif // FINE_GRAINED_DRAW_SCOPES - if (texture_invalidated_.exchange(false, std::memory_order_relaxed)) { + if (texture_invalidated_.exchange(false, std::memory_order_acquire)) { // Clear the bindings not only for this draw call, but entirely, because // loading may be needed in some draw call later, which may have the same // key for some binding as before the invalidation, but texture_invalidated_ @@ -1297,7 +1297,7 @@ void TextureCache::WatchCallback(Texture* texture, bool is_mip) { texture->base_in_sync = false; texture->base_watch_handle = nullptr; } - texture_invalidated_.store(true, std::memory_order_relaxed); + texture_invalidated_.store(true, std::memory_order_release); } void TextureCache::ClearBindings() { diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h index 3984f5557..8a4a095af 100644 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ b/src/xenia/gpu/d3d12/texture_cache.h @@ -384,8 +384,7 @@ class TextureCache { // Whether a texture has been invalidated (a watch has been triggered), so // need to try to reload textures, disregarding whether fetch constants have - // been changed. A simple notification (texture validity is protected by a - // mutex), so memory_order_relaxed is enough. + // been changed. std::atomic texture_invalidated_ = false; // Unsupported texture formats used during this frame (for research and