Merge branch 'vtx_cache'
commit 79308b477f
@@ -240,12 +240,20 @@ bool MMIOHandler::IsRangeWatched(uint32_t physical_address, size_t length) {
   for (auto it = access_watches_.begin(); it != access_watches_.end(); ++it) {
     auto entry = *it;
     if ((entry->address <= physical_address &&
-         entry->address + entry->length > physical_address) ||
-        (entry->address >= physical_address &&
-         entry->address < physical_address + length)) {
-      // This watch lies within the range.
+         entry->address + entry->length > physical_address + length)) {
+      // This range lies entirely within this watch.
       return true;
     }
+
+    // TODO(DrChat): Check if the range is partially covered, and subtract the
+    // covered portion if it is.
+    if ((entry->address <= physical_address &&
+         entry->address + entry->length > physical_address)) {
+      // The beginning of range lies partially within this watch.
+    } else if ((entry->address < physical_address + length &&
+                entry->address + entry->length > physical_address + length)) {
+      // The ending of this range lies partially within this watch.
+    }
   }

   return false;
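Note: the rewritten check now reports a range as watched only when it is fully
contained in a single watch, matching the updated header comment in the next
hunk. A minimal sketch of the containment predicate, with made-up numbers:

    // Illustration only; the values are hypothetical.
    uint32_t watch_addr = 0x1000, watch_len = 0x2000;  // watch covers [0x1000, 0x3000)
    uint32_t query_addr = 0x1800, query_len = 0x400;   // query covers [0x1800, 0x1C00)
    bool fully_within = watch_addr <= query_addr &&
                        watch_addr + watch_len > query_addr + query_len;  // true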
@@ -77,7 +77,7 @@ class MMIOHandler {
   // Fires and clears any access watches that overlap this range.
   void InvalidateRange(uint32_t physical_address, size_t length);

-  // Returns true if /any/ part of this range is watched.
+  // Returns true if /all/ of this range is watched.
   bool IsRangeWatched(uint32_t physical_address, size_t length);

 protected:
@@ -47,6 +47,10 @@ XE_GPU_REGISTER(0x0D04, kDword, SQ_EO_RT)

 XE_GPU_REGISTER(0x0C85, kDword, PA_CL_ENHANCE)

+// Set with WAIT_UNTIL = WAIT_3D_IDLECLEAN
+XE_GPU_REGISTER(0x0E00, kDword, UNKNOWN_0E00)
+XE_GPU_REGISTER(0x0E40, kDword, UNKNOWN_0E40)
+
 XE_GPU_REGISTER(0x0E42, kDword, UNKNOWN_0E42)

 XE_GPU_REGISTER(0x0F01, kDword, RB_BC_CONTROL)
@@ -16,10 +16,80 @@
 #include "xenia/gpu/gpu_flags.h"
 #include "xenia/gpu/vulkan/vulkan_gpu_flags.h"

+#include "third_party/vulkan/vk_mem_alloc.h"
+
 namespace xe {
 namespace gpu {
 namespace vulkan {

+#if XE_ARCH_AMD64
+void copy_cmp_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
+                                uint16_t cmp_value, size_t count) {
+  auto dest = reinterpret_cast<uint16_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint16_t*>(src_ptr);
+  __m128i shufmask =
+      _mm_set_epi8(0x0E, 0x0F, 0x0C, 0x0D, 0x0A, 0x0B, 0x08, 0x09, 0x06, 0x07,
+                   0x04, 0x05, 0x02, 0x03, 0x00, 0x01);
+  __m128i cmpval = _mm_set1_epi16(cmp_value);
+
+  size_t i;
+  for (i = 0; i + 8 <= count; i += 8) {
+    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    __m128i output = _mm_shuffle_epi8(input, shufmask);
+
+    __m128i mask = _mm_cmpeq_epi16(output, cmpval);
+    output = _mm_or_si128(output, mask);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+  }
+  for (; i < count; ++i) {  // handle residual elements
+    dest[i] = byte_swap(src[i]);
+  }
+}
+
+void copy_cmp_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
+                                uint32_t cmp_value, size_t count) {
+  auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint32_t*>(src_ptr);
+  __m128i shufmask =
+      _mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
+                   0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
+  __m128i cmpval = _mm_set1_epi32(cmp_value);
+
+  size_t i;
+  for (i = 0; i + 4 <= count; i += 4) {
+    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    __m128i output = _mm_shuffle_epi8(input, shufmask);
+
+    __m128i mask = _mm_cmpeq_epi32(output, cmpval);
+    output = _mm_or_si128(output, mask);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+  }
+  for (; i < count; ++i) {  // handle residual elements
+    dest[i] = byte_swap(src[i]);
+  }
+}
+#else
+void copy_cmp_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
+                                uint16_t cmp_value, size_t count) {
+  auto dest = reinterpret_cast<uint16_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint16_t*>(src_ptr);
+  for (size_t i = 0; i < count; ++i) {
+    uint16_t value = byte_swap(src[i]);
+    dest[i] = value == cmp_value ? 0xFFFF : value;
+  }
+}
+
+void copy_cmp_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
+                                uint32_t cmp_value, size_t count) {
+  auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint32_t*>(src_ptr);
+  for (size_t i = 0; i < count; ++i) {
+    uint32_t value = byte_swap(src[i]);
+    dest[i] = value == cmp_value ? 0xFFFFFFFF : value;
+  }
+}
+#endif
+
 using xe::ui::vulkan::CheckResult;

 constexpr VkDeviceSize kConstantRegisterUniformRange =
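Note: both helpers byte-swap each element and force any element equal to
cmp_value to all ones, so a guest primitive-reset index survives the endian
swap as the fixed index Vulkan expects. A scalar reference of the 16-bit
variant, shown only to document the intended behavior (not the SSE path
above):

    void copy_cmp_swap_16_reference(uint16_t* dest, const uint16_t* src,
                                    uint16_t cmp_value, size_t count) {
      for (size_t i = 0; i < count; ++i) {
        uint16_t value = uint16_t((src[i] << 8) | (src[i] >> 8));  // byte swap
        dest[i] = value == cmp_value ? 0xFFFF : value;
      }
    }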
@@ -32,7 +102,7 @@ BufferCache::BufferCache(RegisterFile* register_file, Memory* memory,
       device_,
       VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
           VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
-      capacity);
+      capacity, 4096);
 }

 BufferCache::~BufferCache() { Shutdown(); }
@@ -47,6 +117,15 @@ VkResult BufferCache::Initialize() {
     return status;
   }

+  // Create a memory allocator for textures.
+  VmaAllocatorCreateInfo alloc_info = {
+      0, *device_, *device_, 0, 0, nullptr, nullptr,
+  };
+  status = vmaCreateAllocator(&alloc_info, &mem_allocator_);
+  if (status != VK_SUCCESS) {
+    return status;
+  }
+
   // Descriptor pool used for all of our cached descriptors.
   // In the steady state we don't allocate anything, so these are all manually
   // managed.
@@ -150,28 +229,23 @@ VkResult BufferCache::Initialize() {
 }

 void BufferCache::Shutdown() {
+  if (mem_allocator_) {
+    vmaDestroyAllocator(mem_allocator_);
+    mem_allocator_ = nullptr;
+  }
+
   if (transient_descriptor_set_) {
     vkFreeDescriptorSets(*device_, descriptor_pool_, 1,
                          &transient_descriptor_set_);
     transient_descriptor_set_ = nullptr;
   }

-  if (descriptor_set_layout_) {
-    vkDestroyDescriptorSetLayout(*device_, descriptor_set_layout_, nullptr);
-    descriptor_set_layout_ = nullptr;
-  }
-
-  if (descriptor_pool_) {
-    vkDestroyDescriptorPool(*device_, descriptor_pool_, nullptr);
-    descriptor_pool_ = nullptr;
-  }
+  VK_SAFE_DESTROY(vkDestroyDescriptorSetLayout, *device_,
+                  descriptor_set_layout_, nullptr);
+  VK_SAFE_DESTROY(vkDestroyDescriptorPool, *device_, descriptor_pool_, nullptr);

   transient_buffer_->Shutdown();
-  if (gpu_memory_pool_) {
-    vkFreeMemory(*device_, gpu_memory_pool_, nullptr);
-    gpu_memory_pool_ = nullptr;
-  }
+  VK_SAFE_DESTROY(vkFreeMemory, *device_, gpu_memory_pool_, nullptr);
 }

 std::pair<VkDeviceSize, VkDeviceSize> BufferCache::UploadConstantRegisters(
@@ -278,13 +352,8 @@ std::pair<VkDeviceSize, VkDeviceSize> BufferCache::UploadConstantRegisters(
 std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
     VkCommandBuffer command_buffer, uint32_t source_addr,
     uint32_t source_length, IndexFormat format, VkFence fence) {
-  auto offset = FindCachedTransientData(source_addr, source_length);
-  if (offset != VK_WHOLE_SIZE) {
-    return {transient_buffer_->gpu_buffer(), offset};
-  }
-
   // Allocate space in the buffer for our data.
-  offset = AllocateTransientData(source_length, fence);
+  auto offset = AllocateTransientData(source_length, fence);
   if (offset == VK_WHOLE_SIZE) {
     // OOM.
     return {nullptr, VK_WHOLE_SIZE};
@@ -292,9 +361,27 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(

   const void* source_ptr = memory_->TranslatePhysical(source_addr);

-  // Copy data into the buffer.
-  // TODO(benvanik): get min/max indices and pass back?
+  uint32_t prim_reset_index =
+      register_file_->values[XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX].u32;
+  bool prim_reset_enabled =
+      !!(register_file_->values[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32 & (1 << 21));
+
+  // Copy data into the buffer. If primitive reset is enabled, translate any
+  // primitive reset indices to something Vulkan understands.
   // TODO(benvanik): memcpy then use compute shaders to swap?
+  if (prim_reset_enabled) {
+    if (format == IndexFormat::kInt16) {
+      // Endian::k8in16, swap half-words.
+      copy_cmp_swap_16_unaligned(
+          transient_buffer_->host_base() + offset, source_ptr,
+          static_cast<uint16_t>(prim_reset_index), source_length / 2);
+    } else if (format == IndexFormat::kInt32) {
+      // Endian::k8in32, swap words.
+      copy_cmp_swap_32_unaligned(transient_buffer_->host_base() + offset,
+                                 source_ptr, prim_reset_index,
+                                 source_length / 4);
+    }
+  } else {
     if (format == IndexFormat::kInt16) {
       // Endian::k8in16, swap half-words.
       xe::copy_and_swap_16_unaligned(transient_buffer_->host_base() + offset,
@@ -304,6 +391,7 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
       xe::copy_and_swap_32_unaligned(transient_buffer_->host_base() + offset,
                                      source_ptr, source_length / 4);
     }
+  }

   transient_buffer_->Flush(offset, source_length);

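Note: a worked example of the translation above, assuming a title that uses a
non-standard reset index of 0 (all values are hypothetical):

    uint16_t guest_indices_be[] = {0x0100, 0x0000, 0x0200};  // big-endian 1, 0, 2 on a
    uint16_t host_indices[3];                                // little-endian host
    copy_cmp_swap_16_unaligned(host_indices, guest_indices_be,
                               /*cmp_value=*/0, /*count=*/3);
    // host_indices == {1, 0xFFFF, 2}; the guest reset index now matches the
    // all-ones restart value Vulkan uses with VK_INDEX_TYPE_UINT16.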
@@ -323,7 +411,6 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
                        VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, 0, nullptr, 1,
                        &barrier, 0, nullptr);

-  CacheTransientData(source_addr, source_length, offset);
   return {transient_buffer_->gpu_buffer(), offset};
 }

@@ -335,29 +422,41 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer(
     return {transient_buffer_->gpu_buffer(), offset};
   }

+  // Slow path :)
+  // Expand the region up to the allocation boundary
+  auto physical_heap = memory_->GetPhysicalHeap();
+  uint32_t upload_base = source_addr;
+  uint32_t upload_size = source_length;
+
+  // Ping the memory subsystem for allocation size.
+  // TODO(DrChat): Artifacting occurring in GripShift with this enabled.
+  // physical_heap->QueryBaseAndSize(&upload_base, &upload_size);
+  assert(upload_base <= source_addr);
+  uint32_t source_offset = source_addr - upload_base;
+
   // Allocate space in the buffer for our data.
-  offset = AllocateTransientData(source_length, fence);
+  offset = AllocateTransientData(upload_size, fence);
   if (offset == VK_WHOLE_SIZE) {
     // OOM.
     return {nullptr, VK_WHOLE_SIZE};
   }

-  const void* source_ptr = memory_->TranslatePhysical(source_addr);
+  const void* upload_ptr = memory_->TranslatePhysical(upload_base);

   // Copy data into the buffer.
   // TODO(benvanik): memcpy then use compute shaders to swap?
   if (endian == Endian::k8in32) {
     // Endian::k8in32, swap words.
     xe::copy_and_swap_32_unaligned(transient_buffer_->host_base() + offset,
-                                   source_ptr, source_length / 4);
+                                   upload_ptr, source_length / 4);
   } else if (endian == Endian::k16in32) {
     xe::copy_and_swap_16_in_32_unaligned(
-        transient_buffer_->host_base() + offset, source_ptr, source_length / 4);
+        transient_buffer_->host_base() + offset, upload_ptr, source_length / 4);
   } else {
     assert_always();
   }

-  transient_buffer_->Flush(offset, source_length);
+  transient_buffer_->Flush(offset, upload_size);

   // Append a barrier to the command buffer.
   VkBufferMemoryBarrier barrier = {
@@ -369,14 +468,14 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer(
       VK_QUEUE_FAMILY_IGNORED,
       transient_buffer_->gpu_buffer(),
       offset,
-      source_length,
+      upload_size,
   };
   vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_HOST_BIT,
                        VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, 0, nullptr, 1,
                        &barrier, 0, nullptr);

-  CacheTransientData(source_addr, source_length, offset);
-  return {transient_buffer_->gpu_buffer(), offset};
+  CacheTransientData(upload_base, upload_size, offset);
+  return {transient_buffer_->gpu_buffer(), offset + source_offset};
 }

 VkDeviceSize BufferCache::AllocateTransientData(VkDeviceSize length,
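Note: when the heap query is enabled, the whole containing allocation is
uploaded and the returned offset is adjusted so the caller still points at the
requested data. A sketch of the arithmetic with hypothetical addresses:

    uint32_t source_addr = 0x00101200;                   // address from the fetch constant
    uint32_t upload_base = 0x00100000;                   // base of the containing allocation
    uint32_t source_offset = source_addr - upload_base;  // 0x1200
    // The allocation lands at `offset` in the transient buffer, so the caller
    // binds offset + source_offset to reach the requested vertex data.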
@@ -409,10 +508,22 @@ VkDeviceSize BufferCache::TryAllocateTransientData(VkDeviceSize length,

 VkDeviceSize BufferCache::FindCachedTransientData(uint32_t guest_address,
                                                   uint32_t guest_length) {
-  uint64_t key = uint64_t(guest_length) << 32 | uint64_t(guest_address);
-  auto it = transient_cache_.find(key);
-  if (it != transient_cache_.end()) {
-    return it->second;
+  if (transient_cache_.empty()) {
+    // Short-circuit exit.
+    return VK_WHOLE_SIZE;
+  }
+
+  // Find the first element > guest_address
+  auto it = transient_cache_.upper_bound(guest_address);
+  if (it != transient_cache_.begin()) {
+    // it = first element <= guest_address
+    --it;
+
+    if ((it->first + it->second.first) >= (guest_address + guest_length)) {
+      // This data is contained within some existing transient data.
+      auto source_offset = static_cast<VkDeviceSize>(guest_address - it->first);
+      return it->second.second + source_offset;
+    }
   }

   return VK_WHOLE_SIZE;
@@ -421,8 +532,17 @@ VkDeviceSize BufferCache::FindCachedTransientData(uint32_t guest_address,
 void BufferCache::CacheTransientData(uint32_t guest_address,
                                      uint32_t guest_length,
                                      VkDeviceSize offset) {
-  uint64_t key = uint64_t(guest_length) << 32 | uint64_t(guest_address);
-  transient_cache_[key] = offset;
+  transient_cache_[guest_address] = {guest_length, offset};
+
+  // Erase any entries contained within
+  auto it = transient_cache_.upper_bound(guest_address);
+  while (it != transient_cache_.end()) {
+    if ((guest_address + guest_length) >= (it->first + it->second.first)) {
+      it = transient_cache_.erase(it);
+    } else {
+      break;
+    }
+  }
 }

 void BufferCache::Flush(VkCommandBuffer command_buffer) {
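Note: transient_cache_ is now keyed by guest address and stores
{guest_length, buffer offset}, so a lookup can hit on any cached range that
fully contains the request, and newly cached ranges evict entries they cover.
A standalone sketch of the lookup with example values (uint64_t stands in for
VkDeviceSize so the snippet has no Vulkan dependency):

    #include <cstdint>
    #include <map>
    #include <utility>

    uint64_t example_lookup() {
      std::map<uint32_t, std::pair<uint32_t, uint64_t>> cache;
      cache[0x2000] = {0x1000, 0x500};  // guest [0x2000, 0x3000) cached at offset 0x500

      uint32_t guest_address = 0x2800, guest_length = 0x100;
      auto it = cache.upper_bound(guest_address);  // first entry keyed above the address
      if (it != cache.begin()) {
        --it;  // last entry keyed at or below the address
        if (it->first + it->second.first >= guest_address + guest_length) {
          return it->second.second + (guest_address - it->first);  // 0xD00
        }
      }
      return ~0ull;  // miss (VK_WHOLE_SIZE in the real code)
    }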
@@ -18,6 +18,8 @@
 #include "xenia/ui/vulkan/vulkan.h"
 #include "xenia/ui/vulkan/vulkan_device.h"

+#include "third_party/vulkan/vk_mem_alloc.h"
+
 #include <map>

 namespace xe {
@@ -95,6 +97,15 @@ class BufferCache {
   void Scavenge();

  private:
+  // This represents an uploaded vertex buffer.
+  struct VertexBuffer {
+    uint32_t guest_address;
+    uint32_t size;
+
+    VmaAllocation alloc;
+    VmaAllocationInfo alloc_info;
+  };
+
   // Allocates a block of memory in the transient buffer.
   // When memory is not available fences are checked and space is reclaimed.
   // Returns VK_WHOLE_SIZE if requested amount of memory is not available.
@@ -115,11 +126,12 @@ class BufferCache {
   ui::vulkan::VulkanDevice* device_ = nullptr;

   VkDeviceMemory gpu_memory_pool_ = nullptr;
+  VmaAllocator mem_allocator_ = nullptr;

   // Staging ringbuffer we cycle through fast. Used for data we don't
   // plan on keeping past the current frame.
   std::unique_ptr<ui::vulkan::CircularBuffer> transient_buffer_ = nullptr;
-  std::map<uint64_t, VkDeviceSize> transient_cache_;
+  std::map<uint32_t, std::pair<uint32_t, VkDeviceSize>> transient_cache_;

   VkDescriptorPool descriptor_pool_ = nullptr;
   VkDescriptorSetLayout descriptor_set_layout_ = nullptr;
@@ -1202,16 +1202,12 @@ PipelineCache::UpdateStatus PipelineCache::UpdateInputAssemblyState(
   //   glProvokingVertex(GL_FIRST_VERTEX_CONVENTION);
   // }

+  // Primitive restart index is handled in the buffer cache.
   if (regs.pa_su_sc_mode_cntl & (1 << 21)) {
     state_info.primitiveRestartEnable = VK_TRUE;
   } else {
     state_info.primitiveRestartEnable = VK_FALSE;
   }
-  // TODO(benvanik): no way to specify in Vulkan?
-  assert_true(regs.multi_prim_ib_reset_index == 0xFFFF ||
-              regs.multi_prim_ib_reset_index == 0xFFFFFF ||
-              regs.multi_prim_ib_reset_index == 0xFFFFFFFF);
-  // glPrimitiveRestartIndex(regs.multi_prim_ib_reset_index);

   return UpdateStatus::kMismatch;
 }
@@ -860,14 +860,13 @@ bool VulkanCommandProcessor::PopulateVertexBuffers(
     // TODO: Make the buffer cache ... actually cache buffers. We can have
     // a list of buffers that were cached, and store those in chunks in a
     // multiple of the host's page size.
-    // WRITE WATCHES: We need to invalidate vertex buffers if they're written
-    // to. Since most vertex buffers aren't aligned to a page boundary, this
-    // means a watch may cover more than one vertex buffer.
-    // We need to maintain a list of write watches, and what memory ranges
-    // they cover. If a vertex buffer lies within a write watch's range, assign
-    // it to the watch. If there's partial alignment where a buffer lies within
-    // one watch and outside of it, should we create a new watch or extend the
-    // existing watch?
+    // So, we need to track all vertex buffers in a sorted map, and track all
+    // write watches in a sorted map. When a vertex buffer is uploaded, track
+    // all untracked pages with 1-page write watches. In the callback,
+    // invalidate any overlapping vertex buffers.
+    //
+    // We would keep the old transient buffer as a staging buffer, and upload
+    // to a GPU-only buffer that tracks all cached vertex buffers.
     auto buffer_ref = buffer_cache_->UploadVertexBuffer(
         current_setup_buffer_, physical_address, source_length,
         static_cast<Endian>(fetch->endian), current_batch_fence_);
@@ -273,14 +273,11 @@ dword_result_t NtQueryVirtualMemory(
     return X_STATUS_INVALID_PARAMETER;
   }

-  memory_basic_information_ptr->base_address =
-      static_cast<uint32_t>(alloc_info.base_address);
-  memory_basic_information_ptr->allocation_base =
-      static_cast<uint32_t>(alloc_info.allocation_base);
+  memory_basic_information_ptr->base_address = alloc_info.base_address;
+  memory_basic_information_ptr->allocation_base = alloc_info.allocation_base;
   memory_basic_information_ptr->allocation_protect =
       ToXdkProtectFlags(alloc_info.allocation_protect);
-  memory_basic_information_ptr->region_size =
-      static_cast<uint32_t>(alloc_info.region_size);
+  memory_basic_information_ptr->region_size = alloc_info.region_size;
   uint32_t x_state = 0;
   if (alloc_info.state & kMemoryAllocationReserve) {
     x_state |= X_MEM_RESERVE;
@@ -290,7 +287,7 @@ dword_result_t NtQueryVirtualMemory(
   }
   memory_basic_information_ptr->state = x_state;
   memory_basic_information_ptr->protect = ToXdkProtectFlags(alloc_info.protect);
-  memory_basic_information_ptr->type = alloc_info.type;
+  memory_basic_information_ptr->type = X_MEM_PRIVATE;

   return X_STATUS_SUCCESS;
 }
@@ -339,6 +339,8 @@ BaseHeap* Memory::LookupHeapByType(bool physical, uint32_t page_size) {
   }
 }

+VirtualHeap* Memory::GetPhysicalHeap() { return &heaps_.physical; }
+
 void Memory::Zero(uint32_t address, uint32_t size) {
   std::memset(TranslateVirtual(address), 0, size);
 }
@@ -1096,16 +1098,19 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address,
   out_info->region_size = 0;
   out_info->state = 0;
   out_info->protect = 0;
-  out_info->type = 0;
   if (start_page_entry.state) {
     // Committed/reserved region.
     out_info->allocation_base = start_page_entry.base_address * page_size_;
     out_info->allocation_protect = start_page_entry.allocation_protect;
+    out_info->allocation_size = start_page_entry.region_page_count * page_size_;
     out_info->state = start_page_entry.state;
     out_info->protect = start_page_entry.current_protect;
-    out_info->type = 0x20000;
+
+    // Scan forward and report the size of the region matching the initial
+    // base address's attributes.
     for (uint32_t page_number = start_page_number;
-         page_number < start_page_number + start_page_entry.region_page_count;
+         page_number <
+         start_page_entry.base_address + start_page_entry.region_page_count;
          ++page_number) {
       auto page_entry = page_table_[page_number];
       if (page_entry.base_address != start_page_entry.base_address ||
@@ -1144,6 +1149,20 @@ bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) {
   return true;
 }

+bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) {
+  uint32_t page_number = (*in_out_address - heap_base_) / page_size_;
+  if (page_number > page_table_.size()) {
+    XELOGE("BaseHeap::QuerySize base page out of range");
+    *out_size = 0;
+    return false;
+  }
+  auto global_lock = global_critical_region_.Acquire();
+  auto page_entry = page_table_[page_number];
+  *in_out_address = (page_entry.base_address * page_size_);
+  *out_size = (page_entry.region_page_count * page_size_);
+  return true;
+}
+
 bool BaseHeap::QueryProtect(uint32_t address, uint32_t* out_protect) {
   uint32_t page_number = (address - heap_base_) / page_size_;
   if (page_number > page_table_.size()) {
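Note: the new query widens an address to its containing allocation; the
(currently commented-out) slow path in BufferCache::UploadVertexBuffer is the
intended caller. A hedged usage sketch, reusing names from that call site:

    uint32_t upload_base = source_addr;
    uint32_t upload_size = source_length;
    if (memory_->GetPhysicalHeap()->QueryBaseAndSize(&upload_base, &upload_size)) {
      // upload_base/upload_size now describe the whole allocation containing
      // source_addr, so the full region can be uploaded and cached once.
    }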
@@ -56,6 +56,8 @@ struct HeapAllocationInfo {
   uint32_t allocation_base;
   // The memory protection option when the region was initially allocated.
   uint32_t allocation_protect;
+  // The size specified when the region was initially allocated, in bytes.
+  uint32_t allocation_size;
   // The size of the region beginning at the base address in which all pages
   // have identical attributes, in bytes.
   uint32_t region_size;
@@ -63,8 +65,6 @@ struct HeapAllocationInfo {
   uint32_t state;
   // The access protection of the pages in the region.
   uint32_t protect;
-  // The type of pages in the region (private).
-  uint32_t type;
 };

 // Describes a single page in the page table.
@@ -144,6 +144,9 @@ class BaseHeap {
   // Queries the size of the region containing the given address.
   bool QuerySize(uint32_t address, uint32_t* out_size);

+  // Queries the base and size of a region containing the given address.
+  bool QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size);
+
   // Queries the current protection mode of the region containing the given
   // address.
   bool QueryProtect(uint32_t address, uint32_t* out_protect);
@@ -332,6 +335,9 @@ class Memory {
   // Gets the heap with the given properties.
   BaseHeap* LookupHeapByType(bool physical, uint32_t page_size);

+  // Gets the physical base heap.
+  VirtualHeap* GetPhysicalHeap();
+
   // Dumps a map of all allocated memory to the log.
   void DumpMap();

@@ -42,7 +42,7 @@ CircularBuffer::CircularBuffer(VulkanDevice* device, VkBufferUsageFlags usage,

   VkMemoryRequirements reqs;
   vkGetBufferMemoryRequirements(*device_, gpu_buffer_, &reqs);
-  alignment_ = reqs.alignment;
+  alignment_ = xe::round_up(alignment, reqs.alignment);
 }
 CircularBuffer::~CircularBuffer() { Shutdown(); }
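Note: the constructor now honors the caller-supplied alignment (4096 from
BufferCache) as well as the device requirement; xe::round_up keeps whichever
is stricter when one divides the other. With hypothetical values:

    size_t alignment = 4096, device_requirement = 256;
    size_t rounded = ((alignment + device_requirement - 1) / device_requirement) *
                     device_requirement;  // 4096, same as xe::round_up(4096, 256)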
@@ -26,10 +26,14 @@ namespace ui {
 namespace vulkan {

 #define VK_SAFE_DESTROY(fn, dev, obj, alloc) \
+  do {                                       \
     if (obj) {                               \
       fn(dev, obj, alloc);                   \
       obj = nullptr;                         \
-    }
+    }                                        \
+  } while (0)

 class Fence {
  public:
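Note: wrapping the macro body in do { ... } while (0) makes each use expand to
a single statement, so the VK_SAFE_DESTROY calls added in
BufferCache::Shutdown compose safely with if/else. Illustration (else_branch()
is a hypothetical function):

    if (shutting_down)
      VK_SAFE_DESTROY(vkDestroyDescriptorPool, *device_, descriptor_pool_, nullptr);
    else
      else_branch();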