[D3D12] Primitive converter cache and strip restart, texture invalidation acquire/release

This commit is contained in:
Triang3l 2018-10-06 20:27:48 +03:00
parent 128ac2a3f9
commit 9194c3f8b0
5 changed files with 237 additions and 76 deletions

View File

@ -632,8 +632,8 @@ bool D3D12CommandProcessor::SetupContext() {
return false;
}
primitive_converter_ = std::make_unique<PrimitiveConverter>(
this, register_file_, memory_, shared_memory_.get());
primitive_converter_ =
std::make_unique<PrimitiveConverter>(this, register_file_, memory_);
if (!primitive_converter_->Initialize()) {
XELOGE("Failed to initialize the geometric primitive converter");
return false;
@ -1060,30 +1060,6 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type,
return true;
}
bool indexed = index_buffer_info != nullptr && index_buffer_info->guest_base;
if (indexed && regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32 & (1 << 21)) {
uint32_t reset_index = regs[XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX].u32;
uint32_t reset_index_expected;
if (index_buffer_info->format == IndexFormat::kInt32) {
reset_index_expected = 0xFFFFFFFFu;
} else {
reset_index_expected = 0xFFFFu;
}
if (reset_index != reset_index_expected) {
// Only 0xFFFF and 0xFFFFFFFF primitive restart indices are supported by
// Direct3D 12 (endianness doesn't matter for them). With shared memory,
// it's impossible to replace the cut index in the buffer without
// affecting the game memory.
XELOGE(
"The game uses the primitive restart index 0x%X that isn't 0xFFFF or "
"0xFFFFFFFF. Report the game to Xenia developers so geometry shaders "
"will be added to handle this!",
reset_index);
assert_always();
return false;
}
}
// Shaders will have already been defined by previous loads.
// We need them to do just about anything so validate here.
auto vertex_shader = static_cast<D3D12Shader*>(active_vertex_shader());
@ -1122,6 +1098,8 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type,
const RenderTargetCache::PipelineRenderTarget* pipeline_render_targets =
render_target_cache_->GetCurrentPipelineRenderTargets();
bool indexed = index_buffer_info != nullptr && index_buffer_info->guest_base;
// Set the primitive topology.
PrimitiveType primitive_type_converted =
PrimitiveConverter::GetReplacementPrimitiveType(primitive_type);

View File

@ -14,6 +14,7 @@
#include "xenia/base/assert.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/platform.h"
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
#include "xenia/ui/d3d12/d3d12_util.h"
@ -24,12 +25,12 @@ namespace d3d12 {
PrimitiveConverter::PrimitiveConverter(D3D12CommandProcessor* command_processor,
RegisterFile* register_file,
Memory* memory,
SharedMemory* shared_memory)
Memory* memory)
: command_processor_(command_processor),
register_file_(register_file),
memory_(memory),
shared_memory_(shared_memory) {}
memory_(memory) {
system_page_size_ = uint32_t(memory::page_size());
}
// Destruction simply delegates to Shutdown(), so teardown is the same whether
// it is requested explicitly or happens when the object goes away.
PrimitiveConverter::~PrimitiveConverter() {
  Shutdown();
}
@ -94,10 +95,18 @@ bool PrimitiveConverter::Initialize() {
}
static_ib_gpu_address_ = static_ib_->GetGPUVirtualAddress();
memory_regions_invalidated_.store(0ull, std::memory_order_relaxed);
physical_write_watch_handle_ =
memory_->RegisterPhysicalWriteWatch(MemoryWriteCallbackThunk, this);
return true;
}
void PrimitiveConverter::Shutdown() {
if (physical_write_watch_handle_ != nullptr) {
memory_->UnregisterPhysicalWriteWatch(physical_write_watch_handle_);
physical_write_watch_handle_ = nullptr;
}
ui::d3d12::util::ReleaseAndNull(static_ib_);
ui::d3d12::util::ReleaseAndNull(static_ib_upload_);
buffer_pool_.reset();
@ -106,8 +115,6 @@ void PrimitiveConverter::Shutdown() {
// Forwards the cache-clearing request to the upload buffer pool that backs the
// converted index data.
void PrimitiveConverter::ClearCache() {
  buffer_pool_->ClearCache();
}
void PrimitiveConverter::BeginFrame() {
buffer_pool_->BeginFrame();
// Got a command list now - upload and transition the static index buffer if
// needed.
if (static_ib_upload_ != nullptr) {
@ -126,6 +133,11 @@ void PrimitiveConverter::BeginFrame() {
static_ib_upload_ = nullptr;
}
}
buffer_pool_->BeginFrame();
converted_indices_cache_.clear();
memory_regions_used_ = 0;
}
// Notifies the upload buffer pool that the current frame is over.
void PrimitiveConverter::EndFrame() {
  buffer_pool_->EndFrame();
}
@ -142,6 +154,7 @@ PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives(
PrimitiveType source_type, uint32_t address, uint32_t index_count,
IndexFormat index_format, Endian index_endianness,
D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out, uint32_t& index_count_out) {
bool index_32bit = index_format == IndexFormat::kInt32;
auto& regs = *register_file_;
bool reset = (regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32 & (1 << 21)) != 0;
// Swap the reset index because we will be comparing unswapped values to it.
@ -150,40 +163,79 @@ PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives(
// If the specified reset index is the same as the one used by Direct3D 12
// (0xFFFF or 0xFFFFFFFF - in the pipeline cache, we use the former for
// 16-bit and the latter for 32-bit indices), we can use the buffer directly.
uint32_t reset_index_host =
index_format == IndexFormat::kInt32 ? 0xFFFFFFFFu : 0xFFFFu;
uint32_t reset_index_host = index_32bit ? 0xFFFFFFFFu : 0xFFFFu;
// Check if need to convert at all.
if (source_type != PrimitiveType::kTriangleFan) {
if (!reset || reset_index == reset_index_host) {
return ConversionResult::kConversionNotNeeded;
}
if (source_type != PrimitiveType::kTriangleStrip ||
if (source_type != PrimitiveType::kTriangleStrip &&
source_type != PrimitiveType::kLineStrip) {
return ConversionResult::kConversionNotNeeded;
}
// TODO(Triang3l): Write conversion for triangle and line strip reset index
// and for indexed line loops.
return ConversionResult::kConversionNotNeeded;
}
// Exit early for clearly empty draws, without even reading the memory.
if (source_type == PrimitiveType::kTriangleFan ||
source_type == PrimitiveType::kTriangleStrip) {
if (index_count < 3) {
return ConversionResult::kPrimitiveEmpty;
}
} else if (source_type == PrimitiveType::kLineStrip ||
uint32_t index_count_min;
if (source_type == PrimitiveType::kLineStrip ||
source_type == PrimitiveType::kLineLoop) {
if (index_count < 2) {
index_count_min = 2;
} else {
index_count_min = 3;
}
if (index_count < index_count_min) {
return ConversionResult::kPrimitiveEmpty;
}
// Invalidate the cache if data behind any entry was modified.
if (memory_regions_invalidated_.exchange(0ull, std::memory_order_acquire) &
memory_regions_used_) {
converted_indices_cache_.clear();
memory_regions_used_ = 0;
}
// TODO(Triang3l): Find the converted data in the cache.
address &= index_32bit ? 0x1FFFFFFC : 0x1FFFFFFE;
uint32_t index_size = index_32bit ? sizeof(uint32_t) : sizeof(uint16_t);
uint32_t address_last = address + index_size * (index_count - 1);
// Calculate the index count, and also check if there's nothing to convert in
// the buffer (for instance, if not using primitive reset).
// Create the cache entry, currently only for the key.
ConvertedIndices converted_indices;
converted_indices.key.address = address;
converted_indices.key.source_type = source_type;
converted_indices.key.format = index_format;
converted_indices.key.count = index_count;
converted_indices.key.reset = reset ? 1 : 0;
converted_indices.reset_index = reset_index;
// Try to find the previously converted index buffer.
auto found_range =
converted_indices_cache_.equal_range(converted_indices.key.value);
for (auto iter = found_range.first; iter != found_range.second; ++iter) {
const ConvertedIndices& found_converted = iter->second;
if (reset && found_converted.reset_index != reset_index) {
continue;
}
if (found_converted.converted_index_count == 0) {
return ConversionResult::kPrimitiveEmpty;
}
if (!found_converted.gpu_address) {
return ConversionResult::kConversionNotNeeded;
}
gpu_address_out = found_converted.gpu_address;
index_count_out = found_converted.converted_index_count;
return ConversionResult::kConverted;
}
// Get the memory usage mask for cache invalidation.
// 1 bit = (512 / 64) MB = 8 MB.
// Start with every region from the one holding the first index upwards...
uint64_t memory_regions_used_bits = ~((1ull << (address >> 23)) - 1);
if (address_last < (63 << 23)) {
  // ...then drop the regions above the one holding the last index. This has to
  // narrow the mask (&=) rather than replace it, otherwise the lower bound
  // computed above is lost and every region below the buffer is treated as
  // used, causing needless cache invalidation. When the last index is in the
  // topmost region (bit 63), there is nothing above it to drop.
  memory_regions_used_bits &= (1ull << ((address_last >> 23) + 1)) - 1;
}
// Calculate the new index count, and also check if there's nothing to convert
// in the buffer (for instance, if not actually using primitive reset).
uint32_t converted_index_count = 0;
bool conversion_needed = false;
bool simd = false;
@ -196,22 +248,44 @@ PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives(
} else {
converted_index_count = 3 * (index_count - 2);
}
} else if (source_type == PrimitiveType::kTriangleStrip ||
source_type == PrimitiveType::kLineStrip) {
// TODO(Triang3l): Check if the restart index is used at all in this buffer.
conversion_needed = true;
converted_index_count = index_count;
simd = true;
}
converted_indices.converted_index_count = converted_index_count;
// If nothing to convert, store this result so the check won't be happening
// again and again and exit.
if (!conversion_needed || converted_index_count == 0) {
converted_indices.gpu_address = 0;
converted_indices_cache_.insert(
std::make_pair(converted_indices.key.value, converted_indices));
memory_regions_used_ |= memory_regions_used_bits;
return converted_index_count == 0 ? ConversionResult::kPrimitiveEmpty
: ConversionResult::kConversionNotNeeded;
}
// Convert.
union {
void* source;
uint16_t* source_16;
uint32_t* source_32;
const void* source;
const uint8_t* source_8;
const uint16_t* source_16;
const uint32_t* source_32;
};
source = memory_->TranslatePhysical(address);
union {
void* target;
uint8_t* target_8;
uint16_t* target_16;
uint32_t* target_32;
};
D3D12_GPU_VIRTUAL_ADDRESS gpu_address;
target = AllocateIndices(index_format, index_count, simd ? address & 15 : 0,
gpu_address);
target = AllocateIndices(index_format, converted_index_count,
simd ? address & 15 : 0, gpu_address);
if (target == nullptr) {
return ConversionResult::kFailed;
}
@ -237,11 +311,62 @@ PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives(
}
}
}
} else if (source_type == PrimitiveType::kTriangleStrip ||
source_type == PrimitiveType::kLineStrip) {
// Replace the reset index with the maximum representable value - vector OR
// gives 0 or 0xFFFF/0xFFFFFFFF, which is exactly what is needed.
// Allocations in the target index buffer are aligned with 16-byte
// granularity, and within 16-byte vectors, both the source and the target
// start at the same offset.
#if XE_ARCH_AMD64
source = reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(source) &
~(uintptr_t(15)));
target = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(target) &
~(uintptr_t(15)));
uint32_t vector_count = (address_last >> 4) - (address >> 4) + 1;
if (index_format == IndexFormat::kInt32) {
__m128i reset_index_vector = _mm_set1_epi32(reset_index);
for (uint32_t i = 0; i < vector_count; ++i) {
__m128i indices_vector =
_mm_load_si128(reinterpret_cast<const __m128i*>(&source_8[i << 4]));
__m128i indices_are_reset_vector =
_mm_cmpeq_epi32(indices_vector, reset_index_vector);
_mm_store_si128(reinterpret_cast<__m128i*>(&target_8[i << 4]),
_mm_or_si128(indices_vector, indices_are_reset_vector));
}
} else {
__m128i reset_index_vector = _mm_set1_epi16(reset_index);
for (uint32_t i = 0; i < vector_count; ++i) {
__m128i indices_vector =
_mm_load_si128(reinterpret_cast<const __m128i*>(&source_8[i << 4]));
__m128i indices_are_reset_vector =
_mm_cmpeq_epi16(indices_vector, reset_index_vector);
_mm_store_si128(reinterpret_cast<__m128i*>(&target_8[i << 4]),
_mm_or_si128(indices_vector, indices_are_reset_vector));
}
}
#else
if (index_format == IndexFormat::kInt32) {
for (uint32_t i = 0; i < index_count; ++i) {
uint32_t index = source_32[i];
target_32[i] = index == reset_index ? 0xFFFFFFFFu : index;
}
} else {
for (uint32_t i = 0; i < index_count; ++i) {
uint16_t index = source_16[i];
target_16[i] = index == reset_index ? 0xFFFFu : index;
}
}
#endif
}
// TODO(Triang3l): Replace primitive reset index in triangle and line strips.
// TODO(Triang3l): Line loops.
// Cache and return the indices.
converted_indices.gpu_address = gpu_address;
converted_indices_cache_.insert(
std::make_pair(converted_indices.key.value, converted_indices));
memory_regions_used_ |= memory_regions_used_bits;
gpu_address_out = gpu_address;
index_count_out = converted_index_count;
return ConversionResult::kConverted;
@ -277,6 +402,25 @@ void* PrimitiveConverter::AllocateIndices(
return mapping + simd_offset;
}
// Handles a physical memory write notification for the pages from page_first
// to page_last by flagging every 8 MB region they touch as invalidated. The
// flags are picked up (and reset) by ConvertPrimitives, which drops the
// converted index cache when a flagged region overlaps one it has used.
void PrimitiveConverter::MemoryWriteCallback(uint32_t page_first,
                                             uint32_t page_last) {
  // One mask bit covers (512 / 64) MB = 8 MB, hence the shift by 23 after
  // turning the page numbers into byte addresses.
  const uint32_t region_first = (page_first * system_page_size_) >> 23;
  const uint32_t region_last = (page_last * system_page_size_) >> 23;

  // Everything from the first touched region upwards...
  const uint64_t below_first = (1ull << region_first) - 1;
  uint64_t invalidated = ~below_first;
  // ...limited to the last touched region, unless that already is the top bit.
  if (region_last < 63) {
    const uint64_t up_to_last = (1ull << (region_last + 1)) - 1;
    invalidated &= up_to_last;
  }

  // Accumulate into the shared mask atomically.
  memory_regions_invalidated_.fetch_or(invalidated);
}
// Static trampoline handed to the memory system when the write watch is
// registered. context_ptr is the PrimitiveConverter that was supplied at
// registration; the call is forwarded to its MemoryWriteCallback unchanged.
void PrimitiveConverter::MemoryWriteCallbackThunk(void* context_ptr,
                                                  uint32_t page_first,
                                                  uint32_t page_last) {
  auto* converter = static_cast<PrimitiveConverter*>(context_ptr);
  converter->MemoryWriteCallback(page_first, page_last);
}
D3D12_GPU_VIRTUAL_ADDRESS PrimitiveConverter::GetStaticIndexBuffer(
PrimitiveType source_type, uint32_t index_count,
uint32_t& index_count_out) const {

View File

@ -14,11 +14,11 @@
#include <memory>
#include <unordered_map>
#include "xenia/gpu/d3d12/shared_memory.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"
#include "xenia/ui/d3d12/d3d12_context.h"
#include "xenia/ui/d3d12/pools.h"
namespace xe {
namespace gpu {
@ -35,8 +35,7 @@ class D3D12CommandProcessor;
class PrimitiveConverter {
public:
PrimitiveConverter(D3D12CommandProcessor* command_processor,
RegisterFile* register_file, Memory* memory,
SharedMemory* shared_memory);
RegisterFile* register_file, Memory* memory);
~PrimitiveConverter();
bool Initialize();
@ -78,8 +77,6 @@ class PrimitiveConverter {
D3D12_GPU_VIRTUAL_ADDRESS GetStaticIndexBuffer(
PrimitiveType source_type, uint32_t index_count,
uint32_t& index_count_out) const;
// TODO(Triang3l): A function that returns a static index buffer for
// non-indexed drawing of unsupported primitives
private:
// simd_offset is source address & 15 - if SIMD is used, the source and the
@ -89,10 +86,14 @@ class PrimitiveConverter {
uint32_t simd_offset,
D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out);
// Callback for invalidating buffers mid-frame.
void MemoryWriteCallback(uint32_t page_first, uint32_t page_last);
static void MemoryWriteCallbackThunk(void* context_ptr, uint32_t page_first,
uint32_t page_last);
D3D12CommandProcessor* command_processor_;
RegisterFile* register_file_;
Memory* memory_;
SharedMemory* shared_memory_;
std::unique_ptr<ui::d3d12::UploadBufferPool> buffer_pool_ = nullptr;
@ -113,17 +114,56 @@ class PrimitiveConverter {
static constexpr uint32_t kStaticIBTotalCount =
kStaticIBTriangleFanOffset + kStaticIBTriangleFanCount;
struct ConvertedIndices {
D3D12_GPU_VIRTUAL_ADDRESS gpu_address;
PrimitiveType primitive_type;
uint32_t index_count;
IndexFormat index_format;
// Index pre-swapped - in guest storage endian.
uint32_t reset_index;
bool reset;
// Key under which converted index buffers are stored in the per-frame cache.
// All fields are packed into a single 64-bit `value`, which is what is actually
// used for hashing and comparison.
// Not identifying the index buffer uniquely - reset index must also be
// checked if reset is enabled.
union ConvertedIndicesKey {
// The whole key as one integer.
uint64_t value;
struct {
// The trailing numbers are the running total of bits used so far.
// Source address of the index data.
uint32_t address; // 32
// Primitive type before conversion.
PrimitiveType source_type : 6; // 38
// Index format of the source indices.
IndexFormat format : 1; // 39
// Source index count.
// NOTE(review): only 16 bits are stored - assumes the index count of a draw
// always fits in 16 bits; confirm against the callers.
uint32_t count : 16; // 55
// 1 if primitive reset is enabled for the draw, 0 otherwise.
uint32_t reset : 1; // 56
};
// Clearing the unused bits, so keys with identical fields always have
// identical `value`s.
ConvertedIndicesKey() : value(0) {}
// Copying and comparison operate on the packed value as a whole.
ConvertedIndicesKey(const ConvertedIndicesKey& key) : value(key.value) {}
ConvertedIndicesKey& operator=(const ConvertedIndicesKey& key) {
value = key.value;
return *this;
}
bool operator==(const ConvertedIndicesKey& key) const {
return value == key.value;
}
bool operator!=(const ConvertedIndicesKey& key) const {
return value != key.value;
}
};
// One entry of the converted index buffer cache: the outcome of a
// ConvertPrimitives call for a particular source index buffer.
struct ConvertedIndices {
// Packed description of the source buffer; its `value` is also the key the
// entry is stored under in the cache.
ConvertedIndicesKey key;
// If reset is enabled, this also must be checked to find cached indices.
uint32_t reset_index;
// Zero GPU address if conversion not needed or the resulting index buffer
// is empty.
D3D12_GPU_VIRTUAL_ADDRESS gpu_address;
// Number of indices in the converted buffer. Zero means the draw is empty.
// When conversion is not needed, this must be equal to the original index
// count.
uint32_t converted_index_count;
};
// Cache for a single frame.
std::unordered_multimap<uint32_t, ConvertedIndices> converted_indices_;
std::unordered_multimap<uint64_t, ConvertedIndices> converted_indices_cache_;
// Very coarse cache invalidation - if something is modified in an 8 MB portion
// of the physical memory and converted indices are also there, invalidate the
// whole cache.
uint64_t memory_regions_used_;
std::atomic<uint64_t> memory_regions_invalidated_ = 0;
void* physical_write_watch_handle_ = nullptr;
uint32_t system_page_size_;
};
} // namespace d3d12

View File

@ -440,7 +440,7 @@ void TextureCache::RequestTextures(uint32_t used_vertex_texture_mask,
SCOPE_profile_cpu_f("gpu");
#endif // FINE_GRAINED_DRAW_SCOPES
if (texture_invalidated_.exchange(false, std::memory_order_relaxed)) {
if (texture_invalidated_.exchange(false, std::memory_order_acquire)) {
// Clear the bindings not only for this draw call, but entirely, because
// loading may be needed in some draw call later, which may have the same
// key for some binding as before the invalidation, but texture_invalidated_
@ -1297,7 +1297,7 @@ void TextureCache::WatchCallback(Texture* texture, bool is_mip) {
texture->base_in_sync = false;
texture->base_watch_handle = nullptr;
}
texture_invalidated_.store(true, std::memory_order_relaxed);
texture_invalidated_.store(true, std::memory_order_release);
}
void TextureCache::ClearBindings() {

View File

@ -384,8 +384,7 @@ class TextureCache {
// Whether a texture has been invalidated (a watch has been triggered), so
// need to try to reload textures, disregarding whether fetch constants have
// been changed. A simple notification (texture validity is protected by a
// mutex), so memory_order_relaxed is enough.
// been changed.
std::atomic<bool> texture_invalidated_ = false;
// Unsupported texture formats used during this frame (for research and