[D3D12] Unify UploadBufferPool page size (2 MB), add alignment parameter

This commit is contained in:
Triang3l 2020-09-15 22:13:53 +03:00
parent dfbe36a8aa
commit 2cebd3cabe
9 changed files with 111 additions and 79 deletions

View File

@ -26,23 +26,28 @@
namespace xe {
// Returns the number of elements N of the built-in array `arr`.
// Only the array's type is inspected (via std::extent); its contents are never
// read.
template <typename T, size_t N>
size_t countof(T (&arr)[N]) {
constexpr size_t countof(T (&arr)[N]) {
return std::extent<T[N]>::value;
}
// Checks whether `value` is an exact power of two (1, 2, 4, ...).
// Zero and negative values are not powers of two. The sign test comes first so
// that, for the most negative signed value, the overflowing `value - 1` is
// never evaluated.
template <typename T>
constexpr bool is_pow2(T value) {
  return value > T(0) && (value & (value - 1)) == 0;
}
// Rounds `value` up to the nearest multiple of `alignment`.
// The rounding is done with a bit mask (`& ~(alignment - 1)`), so the result is
// only a multiple of `alignment` when `alignment` is a power of two. A value
// that is already aligned is returned unchanged.
template <typename T>
T align(T value, T alignment) {
constexpr T align(T value, T alignment) {
return (value + alignment - 1) & ~(alignment - 1);
}
// Rounds `value` up to the next multiple of `multiple`.
// Uses division rather than masking, so `multiple` does not have to be a power
// of two, but it must not be zero. A `value` of zero yields `multiple`, not 0.
template <typename T, typename V>
T round_up(T value, V multiple) {
constexpr T round_up(T value, V multiple) {
return value ? (((value + multiple - 1) / multiple) * multiple) : multiple;
}
// Clamps `value` to the closed range [-1.0f, 1.0f].
// NOTE(review): "saturate" usually means clamping to [0, 1] in shader
// terminology, while this clamps to the signed range - confirm callers expect
// that.
inline float saturate(float value) {
constexpr float saturate(float value) {
return std::max(std::min(1.0f, value), -1.0f);
}
@ -62,7 +67,7 @@ T next_pow2(T value) {
#if __cpp_lib_gcd_lcm
// Returns the greatest common divisor of `a` and `b` by forwarding to
// std::gcd. This variant is selected by the preceding __cpp_lib_gcd_lcm check,
// i.e. only when the standard library advertises std::gcd.
template <typename T>
inline constexpr T greatest_common_divisor(T a, T b) {
constexpr T greatest_common_divisor(T a, T b) {
return std::gcd(a, b);
}
#else
@ -77,14 +82,14 @@ constexpr T greatest_common_divisor(T a, T b) {
#endif
// Reduces a fraction to lowest terms in place by dividing both `numerator` and
// `denominator` by their greatest common divisor.
// NOTE(review): if greatest_common_divisor returns 0 (presumably when both
// inputs are 0 - confirm), the divisions below divide by zero.
template <typename T>
inline constexpr void reduce_fraction(T& numerator, T& denominator) {
constexpr void reduce_fraction(T& numerator, T& denominator) {
auto gcd = greatest_common_divisor(numerator, denominator);
numerator /= gcd;
denominator /= gcd;
}
// Convenience overload: reduces the fraction `fraction.first / fraction.second`
// in place by forwarding both members to the two-reference overload.
template <typename T>
inline constexpr void reduce_fraction(std::pair<T, T>& fraction) {
constexpr void reduce_fraction(std::pair<T, T>& fraction) {
reduce_fraction<T>(fraction.first, fraction.second);
}

View File

@ -890,8 +890,10 @@ bool D3D12CommandProcessor::SetupContext() {
cvars::d3d12_edram_rov && provider.AreRasterizerOrderedViewsSupported();
// Initialize resource binding.
constant_buffer_pool_ =
std::make_unique<ui::d3d12::UploadBufferPool>(provider, 1024 * 1024);
constant_buffer_pool_ = std::make_unique<ui::d3d12::UploadBufferPool>(
provider, std::max(ui::d3d12::UploadBufferPool::kDefaultPageSize,
uint32_t(D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * 4 *
sizeof(float))));
if (bindless_resources_used_) {
D3D12_DESCRIPTOR_HEAP_DESC view_bindless_heap_desc;
view_bindless_heap_desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
@ -3519,13 +3521,6 @@ bool D3D12CommandProcessor::UpdateBindings(
const Shader::ConstantRegisterMap& float_constant_map_vertex =
vertex_shader->constant_register_map();
uint32_t float_constant_count_vertex = float_constant_map_vertex.float_count;
// Even if the shader doesn't need any float constants, a valid binding must
// still be provided, so if the first draw in the frame with the current root
// signature doesn't have float constants at all, still allocate an empty
// buffer.
uint32_t float_constant_size_vertex = xe::align(
uint32_t(std::max(float_constant_count_vertex, 1u) * 4 * sizeof(float)),
256u);
for (uint32_t i = 0; i < 4; ++i) {
if (current_float_constant_map_vertex_[i] !=
float_constant_map_vertex.float_bitmap[i]) {
@ -3557,15 +3552,13 @@ bool D3D12CommandProcessor::UpdateBindings(
std::memset(current_float_constant_map_pixel_, 0,
sizeof(current_float_constant_map_pixel_));
}
uint32_t float_constant_size_pixel = xe::align(
uint32_t(std::max(float_constant_count_pixel, 1u) * 4 * sizeof(float)),
256u);
// Write the constant buffer data.
if (!cbuffer_binding_system_.up_to_date) {
uint8_t* system_constants = constant_buffer_pool_->Request(
frame_current_, xe::align(uint32_t(sizeof(system_constants_)), 256u),
nullptr, nullptr, &cbuffer_binding_system_.address);
frame_current_, sizeof(system_constants_),
D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr,
&cbuffer_binding_system_.address);
if (system_constants == nullptr) {
return false;
}
@ -3576,8 +3569,15 @@ bool D3D12CommandProcessor::UpdateBindings(
~(1u << root_parameter_system_constants);
}
if (!cbuffer_binding_float_vertex_.up_to_date) {
// Even if the shader doesn't need any float constants, a valid binding must
// still be provided, so if the first draw in the frame with the current
// root signature doesn't have float constants at all, still allocate an
// empty buffer.
uint8_t* float_constants = constant_buffer_pool_->Request(
frame_current_, float_constant_size_vertex, nullptr, nullptr,
frame_current_,
uint32_t(std::max(float_constant_count_vertex, uint32_t(1)) * 4 *
sizeof(float)),
D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr,
&cbuffer_binding_float_vertex_.address);
if (float_constants == nullptr) {
return false;
@ -3603,7 +3603,10 @@ bool D3D12CommandProcessor::UpdateBindings(
}
if (!cbuffer_binding_float_pixel_.up_to_date) {
uint8_t* float_constants = constant_buffer_pool_->Request(
frame_current_, float_constant_size_pixel, nullptr, nullptr,
frame_current_,
uint32_t(std::max(float_constant_count_pixel, uint32_t(1)) * 4 *
sizeof(float)),
D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr,
&cbuffer_binding_float_pixel_.address);
if (float_constants == nullptr) {
return false;
@ -3632,28 +3635,33 @@ bool D3D12CommandProcessor::UpdateBindings(
~(1u << root_parameter_float_constants_pixel);
}
if (!cbuffer_binding_bool_loop_.up_to_date) {
uint8_t* bool_loop_constants =
constant_buffer_pool_->Request(frame_current_, 256, nullptr, nullptr,
&cbuffer_binding_bool_loop_.address);
constexpr uint32_t kBoolLoopConstantsSize = (8 + 32) * sizeof(uint32_t);
uint8_t* bool_loop_constants = constant_buffer_pool_->Request(
frame_current_, kBoolLoopConstantsSize,
D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr,
&cbuffer_binding_bool_loop_.address);
if (bool_loop_constants == nullptr) {
return false;
}
std::memcpy(bool_loop_constants,
&regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32,
(8 + 32) * sizeof(uint32_t));
kBoolLoopConstantsSize);
cbuffer_binding_bool_loop_.up_to_date = true;
current_graphics_root_up_to_date_ &=
~(1u << root_parameter_bool_loop_constants);
}
if (!cbuffer_binding_fetch_.up_to_date) {
constexpr uint32_t kFetchConstantsSize = 32 * 6 * sizeof(uint32_t);
uint8_t* fetch_constants = constant_buffer_pool_->Request(
frame_current_, 768, nullptr, nullptr, &cbuffer_binding_fetch_.address);
frame_current_, kFetchConstantsSize,
D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr,
&cbuffer_binding_fetch_.address);
if (fetch_constants == nullptr) {
return false;
}
std::memcpy(fetch_constants,
&regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32,
32 * 6 * sizeof(uint32_t));
kFetchConstantsSize);
cbuffer_binding_fetch_.up_to_date = true;
current_graphics_root_up_to_date_ &=
~(1u << root_parameter_fetch_constants);
@ -3885,12 +3893,10 @@ bool D3D12CommandProcessor::UpdateBindings(
uint32_t* descriptor_indices =
reinterpret_cast<uint32_t*>(constant_buffer_pool_->Request(
frame_current_,
xe::align(
uint32_t(std::max(texture_count_vertex + sampler_count_vertex,
uint32_t(1)) *
sizeof(uint32_t)),
uint32_t(256)),
nullptr, nullptr,
uint32_t(std::max(texture_count_vertex + sampler_count_vertex,
uint32_t(1)) *
sizeof(uint32_t)),
D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr,
&cbuffer_binding_descriptor_indices_vertex_.address));
if (!descriptor_indices) {
return false;
@ -3923,12 +3929,10 @@ bool D3D12CommandProcessor::UpdateBindings(
uint32_t* descriptor_indices =
reinterpret_cast<uint32_t*>(constant_buffer_pool_->Request(
frame_current_,
xe::align(
uint32_t(std::max(texture_count_pixel + sampler_count_pixel,
uint32_t(1)) *
sizeof(uint32_t)),
uint32_t(256)),
nullptr, nullptr,
uint32_t(std::max(texture_count_pixel + sampler_count_pixel,
uint32_t(1)) *
sizeof(uint32_t)),
D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr,
&cbuffer_binding_descriptor_indices_pixel_.address));
if (!descriptor_indices) {
return false;

View File

@ -52,11 +52,13 @@ bool PrimitiveConverter::Initialize() {
D3D12_HEAP_FLAGS heap_flag_create_not_zeroed =
provider.GetHeapFlagCreateNotZeroed();
// There can be at most 65535 indices in a Xenos draw call, but they can be up
// to 4 bytes large, and conversion can add more indices (almost triple the
// count for triangle strips, for instance).
buffer_pool_ =
std::make_unique<ui::d3d12::UploadBufferPool>(provider, 4 * 1024 * 1024);
// There can be at most 65535 indices in a Xenos draw call (16 bit index
// count), but they can be up to 4 bytes large, and conversion can add more
// indices (almost triple the count for triangle strips or fans, for
// instance).
buffer_pool_ = std::make_unique<ui::d3d12::UploadBufferPool>(
provider, std::max(uint32_t(65535 * 3 * sizeof(uint32_t)),
ui::d3d12::UploadBufferPool::kDefaultPageSize));
// Create the static index buffer for non-indexed drawing.
D3D12_RESOURCE_DESC static_ib_desc;
@ -697,8 +699,8 @@ void* PrimitiveConverter::AllocateIndices(
}
D3D12_GPU_VIRTUAL_ADDRESS gpu_address;
uint8_t* mapping =
buffer_pool_->Request(command_processor_.GetCurrentFrame(), size, nullptr,
nullptr, &gpu_address);
buffer_pool_->Request(command_processor_.GetCurrentFrame(), size, 16,
nullptr, nullptr, &gpu_address);
if (mapping == nullptr) {
XELOGE("Failed to allocate space for {} converted {}-bit vertex indices",
count, format == xenos::IndexFormat::kInt32 ? 32 : 16);

View File

@ -1507,7 +1507,7 @@ void RenderTargetCache::RestoreEdramSnapshot(const void* snapshot) {
ID3D12Resource* upload_buffer;
uint32_t upload_buffer_offset;
void* upload_buffer_mapping = edram_snapshot_restore_pool_->Request(
command_processor_.GetCurrentSubmission(), xenos::kEdramSizeBytes,
command_processor_.GetCurrentSubmission(), xenos::kEdramSizeBytes, 1,
&upload_buffer, &upload_buffer_offset, nullptr);
if (!upload_buffer_mapping) {
XELOGE("Failed to get a buffer for restoring a EDRAM snapshot");

View File

@ -154,8 +154,8 @@ bool SharedMemory::Initialize() {
system_page_flags_.resize((page_count_ + 63) / 64);
upload_buffer_pool_ = std::make_unique<ui::d3d12::UploadBufferPool>(
provider,
xe::align(uint32_t(4 * 1024 * 1024), uint32_t(1) << page_size_log2_));
provider, xe::align(ui::d3d12::UploadBufferPool::kDefaultPageSize,
uint32_t(1) << page_size_log2_));
memory_invalidation_callback_handle_ =
memory_.RegisterPhysicalMemoryInvalidationCallback(
@ -442,8 +442,9 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length) {
uint32_t upload_buffer_offset, upload_buffer_size;
uint8_t* upload_buffer_mapping = upload_buffer_pool_->RequestPartial(
command_processor_.GetCurrentSubmission(),
upload_range_length << page_size_log2_, &upload_buffer,
&upload_buffer_offset, &upload_buffer_size, nullptr);
upload_range_length << page_size_log2_,
uint32_t(1) << page_size_log2_, &upload_buffer, &upload_buffer_offset,
&upload_buffer_size, nullptr);
if (upload_buffer_mapping == nullptr) {
XELOGE("Shared memory: Failed to get an upload buffer");
return false;

View File

@ -2396,9 +2396,9 @@ bool TextureCache::LoadTextureData(Texture* texture) {
}
D3D12_GPU_VIRTUAL_ADDRESS cbuffer_gpu_address;
uint8_t* cbuffer_mapping = cbuffer_pool.Request(
command_processor_.GetCurrentFrame(),
xe::align(uint32_t(sizeof(load_constants)), uint32_t(256)), nullptr,
nullptr, &cbuffer_gpu_address);
command_processor_.GetCurrentFrame(), sizeof(load_constants),
D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr,
&cbuffer_gpu_address);
if (cbuffer_mapping == nullptr) {
command_processor_.ReleaseScratchGPUBuffer(copy_buffer,
copy_buffer_state);

View File

@ -287,8 +287,7 @@ bool D3D12ImmediateDrawer::Initialize() {
device->CreateSampler(&sampler_desc, sampler_handle);
// Create pools for draws.
vertex_buffer_pool_ =
std::make_unique<UploadBufferPool>(provider, 2 * 1024 * 1024);
vertex_buffer_pool_ = std::make_unique<UploadBufferPool>(provider);
texture_descriptor_pool_ = std::make_unique<DescriptorHeapPool>(
device, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, 2048);
texture_descriptor_pool_heap_index_ = DescriptorHeapPool::kHeapIndexInvalid;
@ -506,8 +505,8 @@ void D3D12ImmediateDrawer::BeginDrawBatch(const ImmediateDrawBatch& batch) {
vertex_buffer_view.SizeInBytes =
batch.vertex_count * uint32_t(sizeof(ImmediateVertex));
void* vertex_buffer_mapping = vertex_buffer_pool_->Request(
current_fence_value, vertex_buffer_view.SizeInBytes, nullptr, nullptr,
&vertex_buffer_view.BufferLocation);
current_fence_value, vertex_buffer_view.SizeInBytes, sizeof(uint32_t),
nullptr, nullptr, &vertex_buffer_view.BufferLocation);
if (vertex_buffer_mapping == nullptr) {
XELOGE("Failed to get a buffer for {} vertices in the immediate drawer",
batch.vertex_count);
@ -524,8 +523,7 @@ void D3D12ImmediateDrawer::BeginDrawBatch(const ImmediateDrawBatch& batch) {
index_buffer_view.SizeInBytes = batch.index_count * sizeof(uint16_t);
index_buffer_view.Format = DXGI_FORMAT_R16_UINT;
void* index_buffer_mapping = vertex_buffer_pool_->Request(
current_fence_value,
xe::align(index_buffer_view.SizeInBytes, UINT(sizeof(uint32_t))),
current_fence_value, index_buffer_view.SizeInBytes, sizeof(uint16_t),
nullptr, nullptr, &index_buffer_view.BufferLocation);
if (index_buffer_mapping == nullptr) {
XELOGE("Failed to get a buffer for {} indices in the immediate drawer",

View File

@ -13,14 +13,20 @@
#include "xenia/base/assert.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/ui/d3d12/d3d12_util.h"
namespace xe {
namespace ui {
namespace d3d12 {
// Stores the provider reference and the page size used by the pool.
// The requested page size is rounded up to
// D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT so that no space is wasted when
// the request is not already a multiple of it (the size of the heap backing the
// buffer will be aligned to D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT anyway).
UploadBufferPool::UploadBufferPool(D3D12Provider& provider, uint32_t page_size)
: provider_(provider), page_size_(page_size) {}
: provider_(provider),
page_size_(xe::align(
page_size, uint32_t(D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT))) {}
// Tears the pool down by delegating all cleanup to ClearCache().
UploadBufferPool::~UploadBufferPool() {
  ClearCache();
}
@ -68,9 +74,13 @@ void UploadBufferPool::ClearCache() {
}
uint8_t* UploadBufferPool::Request(uint64_t submission_index, uint32_t size,
uint32_t alignment,
ID3D12Resource** buffer_out,
uint32_t* offset_out,
D3D12_GPU_VIRTUAL_ADDRESS* gpu_address_out) {
assert_not_zero(alignment);
assert_true(xe::is_pow2(alignment));
size = xe::align(size, alignment);
assert_true(size <= page_size_);
if (size > page_size_) {
return nullptr;
@ -79,7 +89,8 @@ uint8_t* UploadBufferPool::Request(uint64_t submission_index, uint32_t size,
submission_index >= writable_first_->last_submission_index);
assert_true(!submitted_last_ ||
submission_index >= submitted_last_->last_submission_index);
if (page_size_ - current_page_used_ < size || !writable_first_) {
uint32_t current_page_used_aligned = xe::align(current_page_used_, alignment);
if (current_page_used_aligned + size > page_size_ || !writable_first_) {
// Start a new page if the request doesn't fit in the current one or if no page is open.
if (writable_first_) {
// Close the page that was current.
@ -128,33 +139,39 @@ uint8_t* UploadBufferPool::Request(uint64_t submission_index, uint32_t size,
writable_last_ = writable_first_;
}
current_page_used_ = 0;
current_page_used_aligned = 0;
}
writable_first_->last_submission_index = submission_index;
if (buffer_out) {
*buffer_out = writable_first_->buffer;
}
if (offset_out) {
*offset_out = current_page_used_;
*offset_out = current_page_used_aligned;
}
if (gpu_address_out) {
*gpu_address_out = writable_first_->gpu_address + current_page_used_;
*gpu_address_out = writable_first_->gpu_address + current_page_used_aligned;
}
uint8_t* mapping =
reinterpret_cast<uint8_t*>(writable_first_->mapping) + current_page_used_;
current_page_used_ += size;
uint8_t* mapping = reinterpret_cast<uint8_t*>(writable_first_->mapping) +
current_page_used_aligned;
current_page_used_ = current_page_used_aligned + size;
return mapping;
}
uint8_t* UploadBufferPool::RequestPartial(
uint64_t submission_index, uint32_t size, ID3D12Resource** buffer_out,
uint32_t* offset_out, uint32_t* size_out,
uint64_t submission_index, uint32_t size, uint32_t alignment,
ID3D12Resource** buffer_out, uint32_t* offset_out, uint32_t* size_out,
D3D12_GPU_VIRTUAL_ADDRESS* gpu_address_out) {
assert_not_zero(alignment);
assert_true(xe::is_pow2(alignment));
size = xe::align(size, alignment);
size = std::min(size, page_size_);
if (current_page_used_ < page_size_) {
size = std::min(size, page_size_ - current_page_used_);
uint32_t current_page_used_aligned = xe::align(current_page_used_, alignment);
if (current_page_used_aligned + alignment <= page_size_) {
size = std::min(
size, (page_size_ - current_page_used_aligned) & ~(alignment - 1));
}
uint8_t* mapping =
Request(submission_index, size, buffer_out, offset_out, gpu_address_out);
uint8_t* mapping = Request(submission_index, size, alignment, buffer_out,
offset_out, gpu_address_out);
if (!mapping) {
return nullptr;
}

View File

@ -23,7 +23,12 @@ namespace d3d12 {
class UploadBufferPool {
public:
UploadBufferPool(D3D12Provider& provider, uint32_t page_size);
// Taken from the Direct3D 12 MiniEngine sample (LinearAllocator
// kCpuAllocatorPageSize). Large enough for most cases.
static constexpr uint32_t kDefaultPageSize = 2 * 1024 * 1024;
UploadBufferPool(D3D12Provider& provider,
uint32_t page_size = kDefaultPageSize);
~UploadBufferPool();
void Reclaim(uint64_t completed_submission_index);
@ -31,13 +36,13 @@ class UploadBufferPool {
// Request to write data in a single piece, creating a new page if the current
// one doesn't have enough free space.
uint8_t* Request(uint64_t submission_index, uint32_t size,
uint8_t* Request(uint64_t submission_index, uint32_t size, uint32_t alignment,
ID3D12Resource** buffer_out, uint32_t* offset_out,
D3D12_GPU_VIRTUAL_ADDRESS* gpu_address_out);
// Request to write data in multiple parts, filling the buffer entirely.
uint8_t* RequestPartial(uint64_t submission_index, uint32_t size,
ID3D12Resource** buffer_out, uint32_t* offset_out,
uint32_t* size_out,
uint32_t alignment, ID3D12Resource** buffer_out,
uint32_t* offset_out, uint32_t* size_out,
D3D12_GPU_VIRTUAL_ADDRESS* gpu_address_out);
private: