diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h index c33e27019..e2d321702 100644 --- a/src/xenia/base/math.h +++ b/src/xenia/base/math.h @@ -26,23 +26,28 @@ namespace xe { template -size_t countof(T (&arr)[N]) { +constexpr size_t countof(T (&arr)[N]) { return std::extent::value; } +template +constexpr bool is_pow2(T value) { + return (value & (value - 1)) == 0; +} + // Rounds up the given value to the given alignment. template -T align(T value, T alignment) { +constexpr T align(T value, T alignment) { return (value + alignment - 1) & ~(alignment - 1); } // Rounds the given number up to the next highest multiple. template -T round_up(T value, V multiple) { +constexpr T round_up(T value, V multiple) { return value ? (((value + multiple - 1) / multiple) * multiple) : multiple; } -inline float saturate(float value) { +constexpr float saturate(float value) { return std::max(std::min(1.0f, value), -1.0f); } @@ -62,7 +67,7 @@ T next_pow2(T value) { #if __cpp_lib_gcd_lcm template -inline constexpr T greatest_common_divisor(T a, T b) { +constexpr T greatest_common_divisor(T a, T b) { return std::gcd(a, b); } #else @@ -77,14 +82,14 @@ constexpr T greatest_common_divisor(T a, T b) { #endif template -inline constexpr void reduce_fraction(T& numerator, T& denominator) { +constexpr void reduce_fraction(T& numerator, T& denominator) { auto gcd = greatest_common_divisor(numerator, denominator); numerator /= gcd; denominator /= gcd; } template -inline constexpr void reduce_fraction(std::pair& fraction) { +constexpr void reduce_fraction(std::pair& fraction) { reduce_fraction(fraction.first, fraction.second); } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 23163a609..0ce75ea77 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -890,8 +890,10 @@ bool D3D12CommandProcessor::SetupContext() { cvars::d3d12_edram_rov && 
provider.AreRasterizerOrderedViewsSupported(); // Initialize resource binding. - constant_buffer_pool_ = - std::make_unique(provider, 1024 * 1024); + constant_buffer_pool_ = std::make_unique( + provider, std::max(ui::d3d12::UploadBufferPool::kDefaultPageSize, + uint32_t(D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * 4 * + sizeof(float)))); if (bindless_resources_used_) { D3D12_DESCRIPTOR_HEAP_DESC view_bindless_heap_desc; view_bindless_heap_desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; @@ -3519,13 +3521,6 @@ bool D3D12CommandProcessor::UpdateBindings( const Shader::ConstantRegisterMap& float_constant_map_vertex = vertex_shader->constant_register_map(); uint32_t float_constant_count_vertex = float_constant_map_vertex.float_count; - // Even if the shader doesn't need any float constants, a valid binding must - // still be provided, so if the first draw in the frame with the current root - // signature doesn't have float constants at all, still allocate an empty - // buffer. - uint32_t float_constant_size_vertex = xe::align( - uint32_t(std::max(float_constant_count_vertex, 1u) * 4 * sizeof(float)), - 256u); for (uint32_t i = 0; i < 4; ++i) { if (current_float_constant_map_vertex_[i] != float_constant_map_vertex.float_bitmap[i]) { @@ -3557,15 +3552,13 @@ bool D3D12CommandProcessor::UpdateBindings( std::memset(current_float_constant_map_pixel_, 0, sizeof(current_float_constant_map_pixel_)); } - uint32_t float_constant_size_pixel = xe::align( - uint32_t(std::max(float_constant_count_pixel, 1u) * 4 * sizeof(float)), - 256u); // Write the constant buffer data. 
if (!cbuffer_binding_system_.up_to_date) { uint8_t* system_constants = constant_buffer_pool_->Request( - frame_current_, xe::align(uint32_t(sizeof(system_constants_)), 256u), - nullptr, nullptr, &cbuffer_binding_system_.address); + frame_current_, sizeof(system_constants_), + D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr, + &cbuffer_binding_system_.address); if (system_constants == nullptr) { return false; } @@ -3576,8 +3569,15 @@ bool D3D12CommandProcessor::UpdateBindings( ~(1u << root_parameter_system_constants); } if (!cbuffer_binding_float_vertex_.up_to_date) { + // Even if the shader doesn't need any float constants, a valid binding must + // still be provided, so if the first draw in the frame with the current + // root signature doesn't have float constants at all, still allocate an + // empty buffer. uint8_t* float_constants = constant_buffer_pool_->Request( - frame_current_, float_constant_size_vertex, nullptr, nullptr, + frame_current_, + uint32_t(std::max(float_constant_count_vertex, uint32_t(1)) * 4 * + sizeof(float)), + D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr, &cbuffer_binding_float_vertex_.address); if (float_constants == nullptr) { return false; @@ -3603,7 +3603,10 @@ bool D3D12CommandProcessor::UpdateBindings( } if (!cbuffer_binding_float_pixel_.up_to_date) { uint8_t* float_constants = constant_buffer_pool_->Request( - frame_current_, float_constant_size_pixel, nullptr, nullptr, + frame_current_, + uint32_t(std::max(float_constant_count_pixel, uint32_t(1)) * 4 * + sizeof(float)), + D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr, &cbuffer_binding_float_pixel_.address); if (float_constants == nullptr) { return false; @@ -3632,28 +3635,33 @@ bool D3D12CommandProcessor::UpdateBindings( ~(1u << root_parameter_float_constants_pixel); } if (!cbuffer_binding_bool_loop_.up_to_date) { - uint8_t* bool_loop_constants = - constant_buffer_pool_->Request(frame_current_, 256, nullptr, nullptr, - 
&cbuffer_binding_bool_loop_.address); + constexpr uint32_t kBoolLoopConstantsSize = (8 + 32) * sizeof(uint32_t); + uint8_t* bool_loop_constants = constant_buffer_pool_->Request( + frame_current_, kBoolLoopConstantsSize, + D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr, + &cbuffer_binding_bool_loop_.address); if (bool_loop_constants == nullptr) { return false; } std::memcpy(bool_loop_constants, &regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32, - (8 + 32) * sizeof(uint32_t)); + kBoolLoopConstantsSize); cbuffer_binding_bool_loop_.up_to_date = true; current_graphics_root_up_to_date_ &= ~(1u << root_parameter_bool_loop_constants); } if (!cbuffer_binding_fetch_.up_to_date) { + constexpr uint32_t kFetchConstantsSize = 32 * 6 * sizeof(uint32_t); uint8_t* fetch_constants = constant_buffer_pool_->Request( - frame_current_, 768, nullptr, nullptr, &cbuffer_binding_fetch_.address); + frame_current_, kFetchConstantsSize, + D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr, + &cbuffer_binding_fetch_.address); if (fetch_constants == nullptr) { return false; } std::memcpy(fetch_constants, &regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32, - 32 * 6 * sizeof(uint32_t)); + kFetchConstantsSize); cbuffer_binding_fetch_.up_to_date = true; current_graphics_root_up_to_date_ &= ~(1u << root_parameter_fetch_constants); @@ -3885,12 +3893,10 @@ bool D3D12CommandProcessor::UpdateBindings( uint32_t* descriptor_indices = reinterpret_cast(constant_buffer_pool_->Request( frame_current_, - xe::align( - uint32_t(std::max(texture_count_vertex + sampler_count_vertex, - uint32_t(1)) * - sizeof(uint32_t)), - uint32_t(256)), - nullptr, nullptr, + uint32_t(std::max(texture_count_vertex + sampler_count_vertex, + uint32_t(1)) * + sizeof(uint32_t)), + D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr, &cbuffer_binding_descriptor_indices_vertex_.address)); if (!descriptor_indices) { return false; @@ -3923,12 +3929,10 @@ bool D3D12CommandProcessor::UpdateBindings( 
uint32_t* descriptor_indices = reinterpret_cast(constant_buffer_pool_->Request( frame_current_, - xe::align( - uint32_t(std::max(texture_count_pixel + sampler_count_pixel, - uint32_t(1)) * - sizeof(uint32_t)), - uint32_t(256)), - nullptr, nullptr, + uint32_t(std::max(texture_count_pixel + sampler_count_pixel, + uint32_t(1)) * + sizeof(uint32_t)), + D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr, &cbuffer_binding_descriptor_indices_pixel_.address)); if (!descriptor_indices) { return false; diff --git a/src/xenia/gpu/d3d12/primitive_converter.cc b/src/xenia/gpu/d3d12/primitive_converter.cc index ab2138b47..947d0666f 100644 --- a/src/xenia/gpu/d3d12/primitive_converter.cc +++ b/src/xenia/gpu/d3d12/primitive_converter.cc @@ -52,11 +52,13 @@ bool PrimitiveConverter::Initialize() { D3D12_HEAP_FLAGS heap_flag_create_not_zeroed = provider.GetHeapFlagCreateNotZeroed(); - // There can be at most 65535 indices in a Xenos draw call, but they can be up - // to 4 bytes large, and conversion can add more indices (almost triple the - // count for triangle strips, for instance). - buffer_pool_ = - std::make_unique(provider, 4 * 1024 * 1024); + // There can be at most 65535 indices in a Xenos draw call (16 bit index + // count), but they can be up to 4 bytes large, and conversion can add more + // indices (almost triple the count for triangle strips or fans, for + // instance). + buffer_pool_ = std::make_unique( + provider, std::max(uint32_t(65535 * 3 * sizeof(uint32_t)), + ui::d3d12::UploadBufferPool::kDefaultPageSize)); // Create the static index buffer for non-indexed drawing. 
D3D12_RESOURCE_DESC static_ib_desc; @@ -697,8 +699,8 @@ void* PrimitiveConverter::AllocateIndices( } D3D12_GPU_VIRTUAL_ADDRESS gpu_address; uint8_t* mapping = - buffer_pool_->Request(command_processor_.GetCurrentFrame(), size, nullptr, - nullptr, &gpu_address); + buffer_pool_->Request(command_processor_.GetCurrentFrame(), size, 16, + nullptr, nullptr, &gpu_address); if (mapping == nullptr) { XELOGE("Failed to allocate space for {} converted {}-bit vertex indices", count, format == xenos::IndexFormat::kInt32 ? 32 : 16); diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index cfc7f6f47..2cb4cfc61 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -1507,7 +1507,7 @@ void RenderTargetCache::RestoreEdramSnapshot(const void* snapshot) { ID3D12Resource* upload_buffer; uint32_t upload_buffer_offset; void* upload_buffer_mapping = edram_snapshot_restore_pool_->Request( - command_processor_.GetCurrentSubmission(), xenos::kEdramSizeBytes, + command_processor_.GetCurrentSubmission(), xenos::kEdramSizeBytes, 1, &upload_buffer, &upload_buffer_offset, nullptr); if (!upload_buffer_mapping) { XELOGE("Failed to get a buffer for restoring a EDRAM snapshot"); diff --git a/src/xenia/gpu/d3d12/shared_memory.cc b/src/xenia/gpu/d3d12/shared_memory.cc index c24336664..6c9c735b1 100644 --- a/src/xenia/gpu/d3d12/shared_memory.cc +++ b/src/xenia/gpu/d3d12/shared_memory.cc @@ -154,8 +154,8 @@ bool SharedMemory::Initialize() { system_page_flags_.resize((page_count_ + 63) / 64); upload_buffer_pool_ = std::make_unique( - provider, - xe::align(uint32_t(4 * 1024 * 1024), uint32_t(1) << page_size_log2_)); + provider, xe::align(ui::d3d12::UploadBufferPool::kDefaultPageSize, + uint32_t(1) << page_size_log2_)); memory_invalidation_callback_handle_ = memory_.RegisterPhysicalMemoryInvalidationCallback( @@ -442,8 +442,9 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length) { uint32_t 
upload_buffer_offset, upload_buffer_size; uint8_t* upload_buffer_mapping = upload_buffer_pool_->RequestPartial( command_processor_.GetCurrentSubmission(), - upload_range_length << page_size_log2_, &upload_buffer, - &upload_buffer_offset, &upload_buffer_size, nullptr); + upload_range_length << page_size_log2_, + uint32_t(1) << page_size_log2_, &upload_buffer, &upload_buffer_offset, + &upload_buffer_size, nullptr); if (upload_buffer_mapping == nullptr) { XELOGE("Shared memory: Failed to get an upload buffer"); return false; diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index 821a0136f..5556c2d8d 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -2396,9 +2396,9 @@ bool TextureCache::LoadTextureData(Texture* texture) { } D3D12_GPU_VIRTUAL_ADDRESS cbuffer_gpu_address; uint8_t* cbuffer_mapping = cbuffer_pool.Request( - command_processor_.GetCurrentFrame(), - xe::align(uint32_t(sizeof(load_constants)), uint32_t(256)), nullptr, - nullptr, &cbuffer_gpu_address); + command_processor_.GetCurrentFrame(), sizeof(load_constants), + D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr, + &cbuffer_gpu_address); if (cbuffer_mapping == nullptr) { command_processor_.ReleaseScratchGPUBuffer(copy_buffer, copy_buffer_state); diff --git a/src/xenia/ui/d3d12/d3d12_immediate_drawer.cc b/src/xenia/ui/d3d12/d3d12_immediate_drawer.cc index 3c0ea8a3a..625b3ce69 100644 --- a/src/xenia/ui/d3d12/d3d12_immediate_drawer.cc +++ b/src/xenia/ui/d3d12/d3d12_immediate_drawer.cc @@ -287,8 +287,7 @@ bool D3D12ImmediateDrawer::Initialize() { device->CreateSampler(&sampler_desc, sampler_handle); // Create pools for draws. 
- vertex_buffer_pool_ = - std::make_unique(provider, 2 * 1024 * 1024); + vertex_buffer_pool_ = std::make_unique(provider); texture_descriptor_pool_ = std::make_unique( device, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, 2048); texture_descriptor_pool_heap_index_ = DescriptorHeapPool::kHeapIndexInvalid; @@ -506,8 +505,8 @@ void D3D12ImmediateDrawer::BeginDrawBatch(const ImmediateDrawBatch& batch) { vertex_buffer_view.SizeInBytes = batch.vertex_count * uint32_t(sizeof(ImmediateVertex)); void* vertex_buffer_mapping = vertex_buffer_pool_->Request( - current_fence_value, vertex_buffer_view.SizeInBytes, nullptr, nullptr, - &vertex_buffer_view.BufferLocation); + current_fence_value, vertex_buffer_view.SizeInBytes, sizeof(uint32_t), + nullptr, nullptr, &vertex_buffer_view.BufferLocation); if (vertex_buffer_mapping == nullptr) { XELOGE("Failed to get a buffer for {} vertices in the immediate drawer", batch.vertex_count); @@ -524,8 +523,7 @@ void D3D12ImmediateDrawer::BeginDrawBatch(const ImmediateDrawBatch& batch) { index_buffer_view.SizeInBytes = batch.index_count * sizeof(uint16_t); index_buffer_view.Format = DXGI_FORMAT_R16_UINT; void* index_buffer_mapping = vertex_buffer_pool_->Request( - current_fence_value, - xe::align(index_buffer_view.SizeInBytes, UINT(sizeof(uint32_t))), + current_fence_value, index_buffer_view.SizeInBytes, sizeof(uint16_t), nullptr, nullptr, &index_buffer_view.BufferLocation); if (index_buffer_mapping == nullptr) { XELOGE("Failed to get a buffer for {} indices in the immediate drawer", diff --git a/src/xenia/ui/d3d12/pools.cc b/src/xenia/ui/d3d12/pools.cc index 7b892caa9..b646b1c6c 100644 --- a/src/xenia/ui/d3d12/pools.cc +++ b/src/xenia/ui/d3d12/pools.cc @@ -13,14 +13,20 @@ #include "xenia/base/assert.h" #include "xenia/base/logging.h" +#include "xenia/base/math.h" #include "xenia/ui/d3d12/d3d12_util.h" namespace xe { namespace ui { namespace d3d12 { +// Align to D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT not to waste any space if +// it's smaller 
(the size of the heap backing the buffer will be aligned to +// D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT anyway). UploadBufferPool::UploadBufferPool(D3D12Provider& provider, uint32_t page_size) - : provider_(provider), page_size_(page_size) {} + : provider_(provider), + page_size_(xe::align( + page_size, uint32_t(D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT))) {} UploadBufferPool::~UploadBufferPool() { ClearCache(); } @@ -68,9 +74,13 @@ void UploadBufferPool::ClearCache() { } uint8_t* UploadBufferPool::Request(uint64_t submission_index, uint32_t size, + uint32_t alignment, ID3D12Resource** buffer_out, uint32_t* offset_out, D3D12_GPU_VIRTUAL_ADDRESS* gpu_address_out) { + assert_not_zero(alignment); + assert_true(xe::is_pow2(alignment)); + size = xe::align(size, alignment); assert_true(size <= page_size_); if (size > page_size_) { return nullptr; @@ -79,7 +89,8 @@ uint8_t* UploadBufferPool::Request(uint64_t submission_index, uint32_t size, submission_index >= writable_first_->last_submission_index); assert_true(!submitted_last_ || submission_index >= submitted_last_->last_submission_index); - if (page_size_ - current_page_used_ < size || !writable_first_) { + uint32_t current_page_used_aligned = xe::align(current_page_used_, alignment); + if (current_page_used_aligned + size > page_size_ || !writable_first_) { // Start a new page if can't fit all the bytes or don't have an open page. if (writable_first_) { // Close the page that was current. 
@@ -128,33 +139,39 @@ uint8_t* UploadBufferPool::Request(uint64_t submission_index, uint32_t size, writable_last_ = writable_first_; } current_page_used_ = 0; + current_page_used_aligned = 0; } writable_first_->last_submission_index = submission_index; if (buffer_out) { *buffer_out = writable_first_->buffer; } if (offset_out) { - *offset_out = current_page_used_; + *offset_out = current_page_used_aligned; } if (gpu_address_out) { - *gpu_address_out = writable_first_->gpu_address + current_page_used_; + *gpu_address_out = writable_first_->gpu_address + current_page_used_aligned; } - uint8_t* mapping = - reinterpret_cast(writable_first_->mapping) + current_page_used_; - current_page_used_ += size; + uint8_t* mapping = reinterpret_cast(writable_first_->mapping) + + current_page_used_aligned; + current_page_used_ = current_page_used_aligned + size; return mapping; } uint8_t* UploadBufferPool::RequestPartial( - uint64_t submission_index, uint32_t size, ID3D12Resource** buffer_out, - uint32_t* offset_out, uint32_t* size_out, + uint64_t submission_index, uint32_t size, uint32_t alignment, + ID3D12Resource** buffer_out, uint32_t* offset_out, uint32_t* size_out, D3D12_GPU_VIRTUAL_ADDRESS* gpu_address_out) { + assert_not_zero(alignment); + assert_true(xe::is_pow2(alignment)); + size = xe::align(size, alignment); size = std::min(size, page_size_); - if (current_page_used_ < page_size_) { - size = std::min(size, page_size_ - current_page_used_); + uint32_t current_page_used_aligned = xe::align(current_page_used_, alignment); + if (current_page_used_aligned + alignment <= page_size_) { + size = std::min( + size, (page_size_ - current_page_used_aligned) & ~(alignment - 1)); } - uint8_t* mapping = - Request(submission_index, size, buffer_out, offset_out, gpu_address_out); + uint8_t* mapping = Request(submission_index, size, alignment, buffer_out, + offset_out, gpu_address_out); if (!mapping) { return nullptr; } diff --git a/src/xenia/ui/d3d12/pools.h b/src/xenia/ui/d3d12/pools.h 
index 21606cc42..e0bb39928 100644 --- a/src/xenia/ui/d3d12/pools.h +++ b/src/xenia/ui/d3d12/pools.h @@ -23,7 +23,12 @@ namespace d3d12 { class UploadBufferPool { public: - UploadBufferPool(D3D12Provider& provider, uint32_t page_size); + // Taken from the Direct3D 12 MiniEngine sample (LinearAllocator + // kCpuAllocatorPageSize). Large enough for most cases. + static constexpr uint32_t kDefaultPageSize = 2 * 1024 * 1024; + + UploadBufferPool(D3D12Provider& provider, + uint32_t page_size = kDefaultPageSize); ~UploadBufferPool(); void Reclaim(uint64_t completed_submission_index); @@ -31,13 +36,13 @@ class UploadBufferPool { // Request to write data in a single piece, creating a new page if the current // one doesn't have enough free space. - uint8_t* Request(uint64_t submission_index, uint32_t size, + uint8_t* Request(uint64_t submission_index, uint32_t size, uint32_t alignment, ID3D12Resource** buffer_out, uint32_t* offset_out, D3D12_GPU_VIRTUAL_ADDRESS* gpu_address_out); // Request to write data in multiple parts, filling the buffer entirely. uint8_t* RequestPartial(uint64_t submission_index, uint32_t size, - ID3D12Resource** buffer_out, uint32_t* offset_out, - uint32_t* size_out, + uint32_t alignment, ID3D12Resource** buffer_out, + uint32_t* offset_out, uint32_t* size_out, D3D12_GPU_VIRTUAL_ADDRESS* gpu_address_out); private: