Merge branch 'vtx_cache'

DrChat 2018-02-18 16:49:32 -06:00
commit 79308b477f
12 changed files with 247 additions and 82 deletions

View File

@ -240,12 +240,20 @@ bool MMIOHandler::IsRangeWatched(uint32_t physical_address, size_t length) {
for (auto it = access_watches_.begin(); it != access_watches_.end(); ++it) {
auto entry = *it;
if ((entry->address <= physical_address &&
entry->address + entry->length > physical_address) ||
(entry->address >= physical_address &&
entry->address < physical_address + length)) {
// This watch lies within the range.
entry->address + entry->length > physical_address + length)) {
// This range lies entirely within this watch.
return true;
}
// TODO(DrChat): Check if the range is partially covered, and subtract the
// covered portion if it is.
if ((entry->address <= physical_address &&
entry->address + entry->length > physical_address)) {
// The beginning of this range lies partially within this watch.
} else if ((entry->address < physical_address + length &&
entry->address + entry->length > physical_address + length)) {
// The ending of this range lies partially within this watch.
}
}
return false;
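
For clarity, here is a minimal standalone sketch of the interval test the rewritten check performs (an illustrative helper, not part of the commit): a half-open range [address, address + length) counts as fully watched only when a single watch [base, base + size) starts at or before it and ends at or after it; partial overlaps no longer return true.

#include <cstddef>
#include <cstdint>

// Illustrative: true only when [address, address + length) lies entirely
// inside the watch [base, base + size); partial overlaps return false.
static bool RangeContainedInWatch(uint32_t base, size_t size,
                                  uint32_t address, size_t length) {
  return base <= address && base + size >= address + length;
}
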

View File

@ -77,7 +77,7 @@ class MMIOHandler {
// Fires and clears any access watches that overlap this range.
void InvalidateRange(uint32_t physical_address, size_t length);
// Returns true if /any/ part of this range is watched.
// Returns true if /all/ of this range is watched.
bool IsRangeWatched(uint32_t physical_address, size_t length);
protected:

View File

@ -47,6 +47,10 @@ XE_GPU_REGISTER(0x0D04, kDword, SQ_EO_RT)
XE_GPU_REGISTER(0x0C85, kDword, PA_CL_ENHANCE)
// Set with WAIT_UNTIL = WAIT_3D_IDLECLEAN
XE_GPU_REGISTER(0x0E00, kDword, UNKNOWN_0E00)
XE_GPU_REGISTER(0x0E40, kDword, UNKNOWN_0E40)
XE_GPU_REGISTER(0x0E42, kDword, UNKNOWN_0E42)
XE_GPU_REGISTER(0x0F01, kDword, RB_BC_CONTROL)

View File

@ -16,10 +16,80 @@
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/vulkan/vulkan_gpu_flags.h"
#include "third_party/vulkan/vk_mem_alloc.h"
namespace xe {
namespace gpu {
namespace vulkan {
#if XE_ARCH_AMD64
void copy_cmp_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
uint16_t cmp_value, size_t count) {
auto dest = reinterpret_cast<uint16_t*>(dest_ptr);
auto src = reinterpret_cast<const uint16_t*>(src_ptr);
__m128i shufmask =
_mm_set_epi8(0x0E, 0x0F, 0x0C, 0x0D, 0x0A, 0x0B, 0x08, 0x09, 0x06, 0x07,
0x04, 0x05, 0x02, 0x03, 0x00, 0x01);
__m128i cmpval = _mm_set1_epi16(cmp_value);
size_t i;
for (i = 0; i + 8 <= count; i += 8) {
__m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
__m128i output = _mm_shuffle_epi8(input, shufmask);
__m128i mask = _mm_cmpeq_epi16(output, cmpval);
output = _mm_or_si128(output, mask);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
}
for (; i < count; ++i) { // handle residual elements
uint16_t value = byte_swap(src[i]);
dest[i] = value == cmp_value ? 0xFFFF : value;
}
}
void copy_cmp_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
uint32_t cmp_value, size_t count) {
auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
auto src = reinterpret_cast<const uint32_t*>(src_ptr);
__m128i shufmask =
_mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
__m128i cmpval = _mm_set1_epi32(cmp_value);
size_t i;
for (i = 0; i + 4 <= count; i += 4) {
__m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
__m128i output = _mm_shuffle_epi8(input, shufmask);
__m128i mask = _mm_cmpeq_epi32(output, cmpval);
output = _mm_or_si128(output, mask);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
}
for (; i < count; ++i) { // handle residual elements
uint32_t value = byte_swap(src[i]);
dest[i] = value == cmp_value ? 0xFFFFFFFF : value;
}
}
#else
void copy_cmp_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
uint16_t cmp_value, size_t count) {
auto dest = reinterpret_cast<uint16_t*>(dest_ptr);
auto src = reinterpret_cast<const uint16_t*>(src_ptr);
for (size_t i = 0; i < count; ++i) {
uint16_t value = byte_swap(src[i]);
dest[i] = value == cmp_value ? 0xFFFF : value;
}
}
void copy_cmp_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
uint32_t cmp_value, size_t count) {
auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
auto src = reinterpret_cast<const uint32_t*>(src_ptr);
for (size_t i = 0; i < count; ++i) {
uint32_t value = byte_swap(src[i]);
dest[i] = value == cmp_value ? 0xFFFFFFFF : value;
}
}
#endif
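
To make the compare-and-swap behaviour concrete, here is a small worked example (a sketch only; it assumes the helper above is visible from the calling code). Vulkan's primitive restart always uses the all-ones value for the bound index type, while the guest's VGT_MULTI_PRIM_IB_RESET_INDX can be arbitrary, so matching indices are rewritten to 0xFFFF (or 0xFFFFFFFF for 32-bit indices) while the stream is byte-swapped:

#include <cstdint>
#include <cstdio>

int main() {
  // Guest memory holds big-endian 16-bit indices; on a little-endian host the
  // raw uint16_t reads are byte-reversed. 0xFF00 below is the guest's reset
  // index 0x00FF as it appears before swapping.
  const uint16_t guest[] = {0x0000, 0x0100, 0x0200, 0xFF00, 0x0300};
  uint16_t host[5];
  copy_cmp_swap_16_unaligned(host, guest, /*cmp_value=*/0x00FF, 5);
  // host is now {0x0000, 0x0001, 0x0002, 0xFFFF, 0x0003}: byte-swapped, with
  // the reset index rewritten to Vulkan's 16-bit restart value.
  for (uint16_t v : host) {
    std::printf("0x%04X ", v);
  }
  return 0;
}
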
using xe::ui::vulkan::CheckResult;
constexpr VkDeviceSize kConstantRegisterUniformRange =
@ -32,7 +102,7 @@ BufferCache::BufferCache(RegisterFile* register_file, Memory* memory,
device_,
VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
capacity);
capacity, 4096);
}
BufferCache::~BufferCache() { Shutdown(); }
@ -47,6 +117,15 @@ VkResult BufferCache::Initialize() {
return status;
}
// Create a memory allocator for cached buffers.
VmaAllocatorCreateInfo alloc_info = {
0, *device_, *device_, 0, 0, nullptr, nullptr,
};
status = vmaCreateAllocator(&alloc_info, &mem_allocator_);
if (status != VK_SUCCESS) {
return status;
}
// Descriptor pool used for all of our cached descriptors.
// In the steady state we don't allocate anything, so these are all manually
// managed.
@ -150,28 +229,23 @@ VkResult BufferCache::Initialize() {
}
void BufferCache::Shutdown() {
if (mem_allocator_) {
vmaDestroyAllocator(mem_allocator_);
mem_allocator_ = nullptr;
}
if (transient_descriptor_set_) {
vkFreeDescriptorSets(*device_, descriptor_pool_, 1,
&transient_descriptor_set_);
transient_descriptor_set_ = nullptr;
}
if (descriptor_set_layout_) {
vkDestroyDescriptorSetLayout(*device_, descriptor_set_layout_, nullptr);
descriptor_set_layout_ = nullptr;
}
if (descriptor_pool_) {
vkDestroyDescriptorPool(*device_, descriptor_pool_, nullptr);
descriptor_pool_ = nullptr;
}
VK_SAFE_DESTROY(vkDestroyDescriptorSetLayout, *device_,
descriptor_set_layout_, nullptr);
VK_SAFE_DESTROY(vkDestroyDescriptorPool, *device_, descriptor_pool_, nullptr);
transient_buffer_->Shutdown();
if (gpu_memory_pool_) {
vkFreeMemory(*device_, gpu_memory_pool_, nullptr);
gpu_memory_pool_ = nullptr;
}
VK_SAFE_DESTROY(vkFreeMemory, *device_, gpu_memory_pool_, nullptr);
}
std::pair<VkDeviceSize, VkDeviceSize> BufferCache::UploadConstantRegisters(
@ -278,13 +352,8 @@ std::pair<VkDeviceSize, VkDeviceSize> BufferCache::UploadConstantRegisters(
std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
VkCommandBuffer command_buffer, uint32_t source_addr,
uint32_t source_length, IndexFormat format, VkFence fence) {
auto offset = FindCachedTransientData(source_addr, source_length);
if (offset != VK_WHOLE_SIZE) {
return {transient_buffer_->gpu_buffer(), offset};
}
// Allocate space in the buffer for our data.
offset = AllocateTransientData(source_length, fence);
auto offset = AllocateTransientData(source_length, fence);
if (offset == VK_WHOLE_SIZE) {
// OOM.
return {nullptr, VK_WHOLE_SIZE};
@ -292,17 +361,36 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
const void* source_ptr = memory_->TranslatePhysical(source_addr);
// Copy data into the buffer.
// TODO(benvanik): get min/max indices and pass back?
uint32_t prim_reset_index =
register_file_->values[XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX].u32;
bool prim_reset_enabled =
!!(register_file_->values[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32 & (1 << 21));
// Copy data into the buffer. If primitive reset is enabled, translate any
// primitive reset indices to something Vulkan understands.
// TODO(benvanik): memcpy then use compute shaders to swap?
if (format == IndexFormat::kInt16) {
// Endian::k8in16, swap half-words.
xe::copy_and_swap_16_unaligned(transient_buffer_->host_base() + offset,
source_ptr, source_length / 2);
} else if (format == IndexFormat::kInt32) {
// Endian::k8in32, swap words.
xe::copy_and_swap_32_unaligned(transient_buffer_->host_base() + offset,
source_ptr, source_length / 4);
if (prim_reset_enabled) {
if (format == IndexFormat::kInt16) {
// Endian::k8in16, swap half-words.
copy_cmp_swap_16_unaligned(
transient_buffer_->host_base() + offset, source_ptr,
static_cast<uint16_t>(prim_reset_index), source_length / 2);
} else if (format == IndexFormat::kInt32) {
// Endian::k8in32, swap words.
copy_cmp_swap_32_unaligned(transient_buffer_->host_base() + offset,
source_ptr, prim_reset_index,
source_length / 4);
}
} else {
if (format == IndexFormat::kInt16) {
// Endian::k8in16, swap half-words.
xe::copy_and_swap_16_unaligned(transient_buffer_->host_base() + offset,
source_ptr, source_length / 2);
} else if (format == IndexFormat::kInt32) {
// Endian::k8in32, swap words.
xe::copy_and_swap_32_unaligned(transient_buffer_->host_base() + offset,
source_ptr, source_length / 4);
}
}
transient_buffer_->Flush(offset, source_length);
@ -323,7 +411,6 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, 0, nullptr, 1,
&barrier, 0, nullptr);
CacheTransientData(source_addr, source_length, offset);
return {transient_buffer_->gpu_buffer(), offset};
}
@ -335,29 +422,41 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer(
return {transient_buffer_->gpu_buffer(), offset};
}
// Slow path :)
// Expand the region up to the allocation boundary
auto physical_heap = memory_->GetPhysicalHeap();
uint32_t upload_base = source_addr;
uint32_t upload_size = source_length;
// Ping the memory subsystem for allocation size.
// TODO(DrChat): Artifacting occurring in GripShift with this enabled.
// physical_heap->QueryBaseAndSize(&upload_base, &upload_size);
assert(upload_base <= source_addr);
uint32_t source_offset = source_addr - upload_base;
// Allocate space in the buffer for our data.
offset = AllocateTransientData(source_length, fence);
offset = AllocateTransientData(upload_size, fence);
if (offset == VK_WHOLE_SIZE) {
// OOM.
return {nullptr, VK_WHOLE_SIZE};
}
const void* source_ptr = memory_->TranslatePhysical(source_addr);
const void* upload_ptr = memory_->TranslatePhysical(upload_base);
// Copy data into the buffer.
// TODO(benvanik): memcpy then use compute shaders to swap?
if (endian == Endian::k8in32) {
// Endian::k8in32, swap words.
xe::copy_and_swap_32_unaligned(transient_buffer_->host_base() + offset,
source_ptr, source_length / 4);
upload_ptr, source_length / 4);
} else if (endian == Endian::k16in32) {
xe::copy_and_swap_16_in_32_unaligned(
transient_buffer_->host_base() + offset, source_ptr, source_length / 4);
transient_buffer_->host_base() + offset, upload_ptr, source_length / 4);
} else {
assert_always();
}
transient_buffer_->Flush(offset, source_length);
transient_buffer_->Flush(offset, upload_size);
// Append a barrier to the command buffer.
VkBufferMemoryBarrier barrier = {
@ -369,14 +468,14 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer(
VK_QUEUE_FAMILY_IGNORED,
transient_buffer_->gpu_buffer(),
offset,
source_length,
upload_size,
};
vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_HOST_BIT,
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, 0, nullptr, 1,
&barrier, 0, nullptr);
CacheTransientData(source_addr, source_length, offset);
return {transient_buffer_->gpu_buffer(), offset};
CacheTransientData(upload_base, upload_size, offset);
return {transient_buffer_->gpu_buffer(), offset + source_offset};
}
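
The offset arithmetic at the end is easy to misread, so here is a tiny worked example with made-up numbers (purely illustrative): the whole guest allocation is uploaded and cached at its base, and the caller gets back the position of its fetch address within that upload.

#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical values for illustration only.
  uint32_t source_addr = 0x00102340;  // fetch address requested by the draw
  uint32_t upload_base = 0x00102000;  // allocation base (QueryBaseAndSize)
  uint64_t offset = 0x8000;           // where the upload landed in the
                                      // transient buffer
  uint32_t source_offset = source_addr - upload_base;  // 0x340
  // The vertex buffer is bound at offset + source_offset within the
  // transient buffer.
  assert(offset + source_offset == 0x8340);
  return 0;
}
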
VkDeviceSize BufferCache::AllocateTransientData(VkDeviceSize length,
@ -409,10 +508,22 @@ VkDeviceSize BufferCache::TryAllocateTransientData(VkDeviceSize length,
VkDeviceSize BufferCache::FindCachedTransientData(uint32_t guest_address,
uint32_t guest_length) {
uint64_t key = uint64_t(guest_length) << 32 | uint64_t(guest_address);
auto it = transient_cache_.find(key);
if (it != transient_cache_.end()) {
return it->second;
if (transient_cache_.empty()) {
// Short-circuit exit.
return VK_WHOLE_SIZE;
}
// Find the first element > guest_address
auto it = transient_cache_.upper_bound(guest_address);
if (it != transient_cache_.begin()) {
// it = first element <= guest_address
--it;
if ((it->first + it->second.first) >= (guest_address + guest_length)) {
// This data is contained within some existing transient data.
auto source_offset = static_cast<VkDeviceSize>(guest_address - it->first);
return it->second.second + source_offset;
}
}
return VK_WHOLE_SIZE;
@ -421,8 +532,17 @@ VkDeviceSize BufferCache::FindCachedTransientData(uint32_t guest_address,
void BufferCache::CacheTransientData(uint32_t guest_address,
uint32_t guest_length,
VkDeviceSize offset) {
uint64_t key = uint64_t(guest_length) << 32 | uint64_t(guest_address);
transient_cache_[key] = offset;
transient_cache_[guest_address] = {guest_length, offset};
// Erase any existing entries that are fully contained within the new entry.
auto it = transient_cache_.upper_bound(guest_address);
while (it != transient_cache_.end()) {
if ((guest_address + guest_length) >= (it->first + it->second.first)) {
it = transient_cache_.erase(it);
} else {
break;
}
}
}
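
The cache rework above replaces the old packed {length, address} key with an address-keyed std::map, so a lookup can hit a sub-range of an earlier upload. A condensed, self-contained sketch of the lookup logic (stand-in types and names, not the emulator's code):

#include <cstdint>
#include <map>
#include <utility>

using DeviceSize = uint64_t;              // stand-in for VkDeviceSize
constexpr DeviceSize kWholeSize = ~0ull;  // stand-in for VK_WHOLE_SIZE

// guest_address -> {guest_length, offset into the transient buffer}
std::map<uint32_t, std::pair<uint32_t, DeviceSize>> cache;

DeviceSize Find(uint32_t guest_address, uint32_t guest_length) {
  auto it = cache.upper_bound(guest_address);
  if (it == cache.begin()) {
    return kWholeSize;
  }
  --it;  // now the greatest entry with base <= guest_address
  if (it->first + it->second.first >= guest_address + guest_length) {
    // Fully contained: return the offset of the requested sub-range.
    return it->second.second + (guest_address - it->first);
  }
  return kWholeSize;
}

int main() {
  cache[0x1000] = {0x100, 0};            // 0x100 bytes at 0x1000, offset 0
  DeviceSize hit = Find(0x1040, 0x20);   // sub-range hit -> 0x40
  DeviceSize miss = Find(0x10F0, 0x20);  // spills past the entry -> kWholeSize
  (void)hit;
  (void)miss;
  return 0;
}
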
void BufferCache::Flush(VkCommandBuffer command_buffer) {

View File

@ -18,6 +18,8 @@
#include "xenia/ui/vulkan/vulkan.h"
#include "xenia/ui/vulkan/vulkan_device.h"
#include "third_party/vulkan/vk_mem_alloc.h"
#include <map>
namespace xe {
@ -95,6 +97,15 @@ class BufferCache {
void Scavenge();
private:
// This represents an uploaded vertex buffer.
struct VertexBuffer {
uint32_t guest_address;
uint32_t size;
VmaAllocation alloc;
VmaAllocationInfo alloc_info;
};
// Allocates a block of memory in the transient buffer.
// When memory is not available fences are checked and space is reclaimed.
// Returns VK_WHOLE_SIZE if requested amount of memory is not available.
@ -115,11 +126,12 @@ class BufferCache {
ui::vulkan::VulkanDevice* device_ = nullptr;
VkDeviceMemory gpu_memory_pool_ = nullptr;
VmaAllocator mem_allocator_ = nullptr;
// Staging ringbuffer we cycle through fast. Used for data we don't
// plan on keeping past the current frame.
std::unique_ptr<ui::vulkan::CircularBuffer> transient_buffer_ = nullptr;
std::map<uint64_t, VkDeviceSize> transient_cache_;
std::map<uint32_t, std::pair<uint32_t, VkDeviceSize>> transient_cache_;
VkDescriptorPool descriptor_pool_ = nullptr;
VkDescriptorSetLayout descriptor_set_layout_ = nullptr;

View File

@ -1202,16 +1202,12 @@ PipelineCache::UpdateStatus PipelineCache::UpdateInputAssemblyState(
// glProvokingVertex(GL_FIRST_VERTEX_CONVENTION);
// }
// Primitive restart index is handled in the buffer cache.
if (regs.pa_su_sc_mode_cntl & (1 << 21)) {
state_info.primitiveRestartEnable = VK_TRUE;
} else {
state_info.primitiveRestartEnable = VK_FALSE;
}
// TODO(benvanik): no way to specify in Vulkan?
assert_true(regs.multi_prim_ib_reset_index == 0xFFFF ||
regs.multi_prim_ib_reset_index == 0xFFFFFF ||
regs.multi_prim_ib_reset_index == 0xFFFFFFFF);
// glPrimitiveRestartIndex(regs.multi_prim_ib_reset_index);
return UpdateStatus::kMismatch;
}

View File

@ -860,14 +860,13 @@ bool VulkanCommandProcessor::PopulateVertexBuffers(
// TODO: Make the buffer cache ... actually cache buffers. We can have
// a list of buffers that were cached, and store those in chunks in a
// multiple of the host's page size.
// WRITE WATCHES: We need to invalidate vertex buffers if they're written
// to. Since most vertex buffers aren't aligned to a page boundary, this
// means a watch may cover more than one vertex buffer.
// We need to maintain a list of write watches, and what memory ranges
// they cover. If a vertex buffer lies within a write watch's range, assign
// it to the watch. If there's partial alignment where a buffer lies within
// one watch and outside of it, should we create a new watch or extend the
// existing watch?
// So, we need to track all vertex buffers in a sorted map, and track all
// write watches in a sorted map. When a vertex buffer is uploaded, track
// all untracked pages with 1-page write watches. In the callback,
// invalidate any overlapping vertex buffers.
//
// We would keep the old transient buffer as a staging buffer, and upload
// to a GPU-only buffer that tracks all cached vertex buffers.
auto buffer_ref = buffer_cache_->UploadVertexBuffer(
current_setup_buffer_, physical_address, source_length,
static_cast<Endian>(fetch->endian), current_batch_fence_);

View File

@ -273,14 +273,11 @@ dword_result_t NtQueryVirtualMemory(
return X_STATUS_INVALID_PARAMETER;
}
memory_basic_information_ptr->base_address =
static_cast<uint32_t>(alloc_info.base_address);
memory_basic_information_ptr->allocation_base =
static_cast<uint32_t>(alloc_info.allocation_base);
memory_basic_information_ptr->base_address = alloc_info.base_address;
memory_basic_information_ptr->allocation_base = alloc_info.allocation_base;
memory_basic_information_ptr->allocation_protect =
ToXdkProtectFlags(alloc_info.allocation_protect);
memory_basic_information_ptr->region_size =
static_cast<uint32_t>(alloc_info.region_size);
memory_basic_information_ptr->region_size = alloc_info.region_size;
uint32_t x_state = 0;
if (alloc_info.state & kMemoryAllocationReserve) {
x_state |= X_MEM_RESERVE;
@ -290,7 +287,7 @@ dword_result_t NtQueryVirtualMemory(
}
memory_basic_information_ptr->state = x_state;
memory_basic_information_ptr->protect = ToXdkProtectFlags(alloc_info.protect);
memory_basic_information_ptr->type = alloc_info.type;
memory_basic_information_ptr->type = X_MEM_PRIVATE;
return X_STATUS_SUCCESS;
}

View File

@ -339,6 +339,8 @@ BaseHeap* Memory::LookupHeapByType(bool physical, uint32_t page_size) {
}
}
VirtualHeap* Memory::GetPhysicalHeap() { return &heaps_.physical; }
void Memory::Zero(uint32_t address, uint32_t size) {
std::memset(TranslateVirtual(address), 0, size);
}
@ -1096,16 +1098,19 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address,
out_info->region_size = 0;
out_info->state = 0;
out_info->protect = 0;
out_info->type = 0;
if (start_page_entry.state) {
// Committed/reserved region.
out_info->allocation_base = start_page_entry.base_address * page_size_;
out_info->allocation_protect = start_page_entry.allocation_protect;
out_info->allocation_size = start_page_entry.region_page_count * page_size_;
out_info->state = start_page_entry.state;
out_info->protect = start_page_entry.current_protect;
out_info->type = 0x20000;
// Scan forward and report the size of the region matching the initial
// base address's attributes.
for (uint32_t page_number = start_page_number;
page_number < start_page_number + start_page_entry.region_page_count;
page_number <
start_page_entry.base_address + start_page_entry.region_page_count;
++page_number) {
auto page_entry = page_table_[page_number];
if (page_entry.base_address != start_page_entry.base_address ||
@ -1144,6 +1149,20 @@ bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) {
return true;
}
bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) {
uint32_t page_number = (*in_out_address - heap_base_) / page_size_;
if (page_number > page_table_.size()) {
XELOGE("BaseHeap::QuerySize base page out of range");
*out_size = 0;
return false;
}
auto global_lock = global_critical_region_.Acquire();
auto page_entry = page_table_[page_number];
*in_out_address = (page_entry.base_address * page_size_);
*out_size = (page_entry.region_page_count * page_size_);
return true;
}
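
A minimal usage sketch of the new query (the commented-out call in UploadVertexBuffer above is the intended consumer; the helper name and parameters here are ours, not the emulator's): the address of interest is passed in-out and comes back as the base of its containing allocation.

#include "xenia/memory.h"

// Hypothetical helper showing the intended call pattern: expand a physical
// address to the allocation that contains it.
void ExpandToAllocation(xe::Memory* memory, uint32_t fetch_addr,
                        uint32_t* out_base, uint32_t* out_size) {
  *out_base = fetch_addr;
  *out_size = 0;
  if (memory->GetPhysicalHeap()->QueryBaseAndSize(out_base, out_size)) {
    // *out_base <= fetch_addr, and [*out_base, *out_base + *out_size)
    // covers the whole allocation containing fetch_addr.
  }
}
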
bool BaseHeap::QueryProtect(uint32_t address, uint32_t* out_protect) {
uint32_t page_number = (address - heap_base_) / page_size_;
if (page_number > page_table_.size()) {

View File

@ -56,6 +56,8 @@ struct HeapAllocationInfo {
uint32_t allocation_base;
// The memory protection option when the region was initially allocated.
uint32_t allocation_protect;
// The size specified when the region was initially allocated, in bytes.
uint32_t allocation_size;
// The size of the region beginning at the base address in which all pages
// have identical attributes, in bytes.
uint32_t region_size;
@ -63,8 +65,6 @@ struct HeapAllocationInfo {
uint32_t state;
// The access protection of the pages in the region.
uint32_t protect;
// The type of pages in the region (private).
uint32_t type;
};
// Describes a single page in the page table.
@ -144,6 +144,9 @@ class BaseHeap {
// Queries the size of the region containing the given address.
bool QuerySize(uint32_t address, uint32_t* out_size);
// Queries the base and size of a region containing the given address.
bool QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size);
// Queries the current protection mode of the region containing the given
// address.
bool QueryProtect(uint32_t address, uint32_t* out_protect);
@ -332,6 +335,9 @@ class Memory {
// Gets the heap with the given properties.
BaseHeap* LookupHeapByType(bool physical, uint32_t page_size);
// Gets the physical base heap.
VirtualHeap* GetPhysicalHeap();
// Dumps a map of all allocated memory to the log.
void DumpMap();

View File

@ -42,7 +42,7 @@ CircularBuffer::CircularBuffer(VulkanDevice* device, VkBufferUsageFlags usage,
VkMemoryRequirements reqs;
vkGetBufferMemoryRequirements(*device_, gpu_buffer_, &reqs);
alignment_ = reqs.alignment;
alignment_ = xe::round_up(alignment, reqs.alignment);
}
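
The alignment change takes the larger of the caller-requested alignment (the buffer cache now passes 4096) and the device's reported requirement; since both are powers of two, rounding one up to a multiple of the other yields exactly that. A quick standalone check of the arithmetic, using a local round_up with the same semantics for illustration:

#include <cassert>
#include <cstdint>

// Same behaviour as xe::round_up for the power-of-two values used here.
constexpr uint64_t round_up(uint64_t value, uint64_t multiple) {
  return (value + multiple - 1) / multiple * multiple;
}

int main() {
  assert(round_up(4096, 256) == 4096);     // caller request dominates
  assert(round_up(4096, 65536) == 65536);  // device requirement dominates
  return 0;
}
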
CircularBuffer::~CircularBuffer() { Shutdown(); }

View File

@ -26,10 +26,14 @@ namespace ui {
namespace vulkan {
#define VK_SAFE_DESTROY(fn, dev, obj, alloc) \
if (obj) { \
fn(dev, obj, alloc); \
obj = nullptr; \
}
\
do { \
if (obj) { \
fn(dev, obj, alloc); \
obj = nullptr; \
} \
\
} while (0)
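
The do { ... } while (0) wrapper is the standard way to make a multi-statement macro behave as a single statement, so a call followed by a semicolon composes safely with if/else. A sketch of the failure mode it avoids (destroy_layout and HandleKeepAlive are hypothetical names):

// With the old if-based macro, the expansion ends in "};", so the semicolon
// after the call closes the outer 'if' and the 'else' no longer parses (or
// binds to the macro's inner 'if' when the semicolon is dropped). The
// do/while(0) form keeps this legal and unambiguous.
if (destroy_layout)
  VK_SAFE_DESTROY(vkDestroyDescriptorSetLayout, *device_,
                  descriptor_set_layout_, nullptr);
else
  HandleKeepAlive();
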
class Fence {
public: