From 6025599d3b851472e5e09612e31a11d6e2fbf2fb Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sat, 22 Dec 2018 00:57:31 +0300 Subject: [PATCH] [D3D12] Request memory for memexport in shared memory --- .../gpu/d3d12/d3d12_command_processor.cc | 228 +++++++++++++++--- src/xenia/gpu/d3d12/d3d12_command_processor.h | 7 + 2 files changed, 208 insertions(+), 27 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 1eef15b41..af567982f 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1155,12 +1155,8 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, if ((regs[XE_GPU_REG_RB_SURFACE_INFO].u32 & 0x3FFF) == 0) { // Doesn't actually draw. - return true; - } - if ((regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32 & 0x3) == 0x3 && - primitive_type != PrimitiveType::kPointList && - primitive_type != PrimitiveType::kRectangleList) { - // Both sides are culled - can't reproduce this with rasterizer state. + // TODO(Triang3l): Do something so memexport still works in this case maybe? + // Unlikely that zero would even really be legal though. return true; } @@ -1179,17 +1175,36 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, // Need a pixel shader in normal color mode. return false; } - // Translate shaders now because to get the color mask, which is needed by the - // render target cache. + // Translate the shaders now to get memexport configuration and color mask, + // which is needed by the render target cache, and also to get used textures + // and samplers. if (!pipeline_cache_->EnsureShadersTranslated(vertex_shader, pixel_shader, primitive_type)) { return false; } + // Check if memexport is used. If it is, we can't skip draw calls that have no + // visual effect. + bool memexport_used_vertex = + !vertex_shader->memexport_stream_constants().empty(); + bool memexport_used_pixel = + pixel_shader != nullptr && + !pixel_shader->memexport_stream_constants().empty(); + bool memexport_used = memexport_used_vertex || memexport_used_pixel; + + if (!memexport_used_vertex && + (regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32 & 0x3) == 0x3 && + primitive_type != PrimitiveType::kPointList && + primitive_type != PrimitiveType::kRectangleList) { + // Both sides are culled - can't reproduce this with rasterizer state. + return true; + } + uint32_t color_mask = GetCurrentColorMask(pixel_shader); uint32_t rb_depthcontrol = regs[XE_GPU_REG_RB_DEPTHCONTROL].u32; uint32_t rb_stencilrefmask = regs[XE_GPU_REG_RB_STENCILREFMASK].u32; - if (!color_mask && ((rb_depthcontrol & (0x2 | 0x4)) != (0x2 | 0x4)) && + if (!memexport_used && !color_mask && + ((rb_depthcontrol & (0x2 | 0x4)) != (0x2 | 0x4)) && (!(rb_depthcontrol & 0x1) || !(rb_stencilrefmask & (0xFF << 16)))) { // Not writing to color, depth or stencil, so doesn't draw. return true; @@ -1201,11 +1216,14 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, // Set up the render targets - this may bind pipelines. if (!render_target_cache_->UpdateRenderTargets(pixel_shader)) { // Doesn't actually draw. + // TODO(Triang3l): Do something so memexport still works in this case maybe? + // Not distingushing between no operation and a true failure. return true; } const RenderTargetCache::PipelineRenderTarget* pipeline_render_targets = render_target_cache_->GetCurrentPipelineRenderTargets(); + // Set up primitive topology. bool indexed = index_buffer_info != nullptr && index_buffer_info->guest_base; // Adaptive tessellation requires an index buffer, but it contains per-edge // tessellation factors (as floats) in it instead of control point indices. @@ -1250,14 +1268,11 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, } else { adaptive_tessellation = false; } - // TODO(Triang3l): Non-indexed line loops (by movc'ing zero to the vertex // index if it's one beyond the end). if (primitive_type == PrimitiveType::kLineLoop && !indexed) { return false; } - - // Set the primitive topology. PrimitiveType primitive_type_converted = PrimitiveConverter::GetReplacementPrimitiveType(primitive_type); D3D_PRIMITIVE_TOPOLOGY primitive_topology; @@ -1295,7 +1310,12 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, command_list->IASetPrimitiveTopology(primitive_topology); } - // Get the pipeline and translate the shaders so used textures are known. + // Update the textures - this may bind pipelines. + texture_cache_->RequestTextures( + vertex_shader->GetUsedTextureMask(), + pixel_shader != nullptr ? pixel_shader->GetUsedTextureMask() : 0); + + // Create the pipeline if needed and bind it. ID3D12PipelineState* pipeline; ID3D12RootSignature* root_signature; auto pipeline_status = pipeline_cache_->ConfigurePipeline( @@ -1305,21 +1325,14 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, if (pipeline_status == PipelineCache::UpdateStatus::kError) { return false; } - - // Update the textures - this may bind pipelines. - texture_cache_->RequestTextures( - vertex_shader->GetUsedTextureMask(), - pixel_shader != nullptr ? pixel_shader->GetUsedTextureMask() : 0); - - // Update viewport, scissor, blend factor and stencil reference. - UpdateFixedFunctionState(command_list); - - // Bind the pipeline. if (current_pipeline_ != pipeline) { GetCurrentCommandList()->SetPipelineState(pipeline); current_pipeline_ = pipeline; } + // Update viewport, scissor, blend factor and stencil reference. + UpdateFixedFunctionState(command_list); + // Update system constants before uploading them. UpdateSystemConstantValues( primitive_type, @@ -1350,16 +1363,139 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, regs[vfetch_constant_index + 1].u32); return false; } - shared_memory_->RequestRange( - regs[vfetch_constant_index].u32 & 0x1FFFFFFC, - regs[vfetch_constant_index + 1].u32 & 0x3FFFFFC); + if (!shared_memory_->RequestRange( + regs[vfetch_constant_index].u32 & 0x1FFFFFFC, + regs[vfetch_constant_index + 1].u32 & 0x3FFFFFC)) { + XELOGE( + "Failed to request vertex buffer at 0x%.8X (size %u) in the shared " + "memory", + regs[vfetch_constant_index].u32 & 0x1FFFFFFC, + regs[vfetch_constant_index + 1].u32 & 0x3FFFFFC); + return false; + } vertex_buffers_resident[vfetch_index >> 6] |= 1ull << (vfetch_index & 63); } + // Gather memexport ranges and ensure the heaps for them are resident, and + // also load the data surrounding the export and to fill the regions that + // won't be modified by the shaders. + struct MemExportRange { + uint32_t base_address_dwords; + uint32_t size_dwords; + }; + MemExportRange memexport_ranges[512]; + uint32_t memexport_range_count = 0; + if (memexport_used_vertex) { + const std::vector& memexport_stream_constants_vertex = + vertex_shader->memexport_stream_constants(); + for (uint32_t constant_index : memexport_stream_constants_vertex) { + const xenos::xe_gpu_memexport_stream_t* memexport_stream = + reinterpret_cast( + ®s[XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4]); + if (memexport_stream->index_count == 0) { + continue; + } + uint32_t memexport_format_size = + GetSupportedMemExportFormatSize(memexport_stream->format); + if (memexport_format_size == 0) { + XELOGE( + "Unsupported memexport format %s", + FormatInfo::Get(TextureFormat(uint32_t(memexport_stream->format))) + ->name); + return false; + } + uint32_t memexport_base_address = memexport_stream->base_address; + uint32_t memexport_size_dwords = + memexport_stream->index_count * memexport_format_size; + // Try to reduce the number of shared memory operations when writing + // different elements into the same buffer through different exports + // (happens in Halo 3). + bool memexport_range_reused = false; + for (uint32_t i = 0; i < memexport_range_count; ++i) { + MemExportRange& memexport_range = memexport_ranges[i]; + if (memexport_range.base_address_dwords == memexport_base_address) { + memexport_range.size_dwords = + std::max(memexport_range.size_dwords, memexport_size_dwords); + memexport_range_reused = true; + break; + } + } + // Add a new range if haven't expanded an existing one. + if (!memexport_range_reused) { + MemExportRange& memexport_range = + memexport_ranges[memexport_range_count++]; + memexport_range.base_address_dwords = memexport_base_address; + memexport_range.size_dwords = memexport_size_dwords; + } + } + } + if (memexport_used_pixel) { + const std::vector& memexport_stream_constants_pixel = + pixel_shader->memexport_stream_constants(); + for (uint32_t constant_index : memexport_stream_constants_pixel) { + const xenos::xe_gpu_memexport_stream_t* memexport_stream = + reinterpret_cast( + ®s[XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4]); + if (memexport_stream->index_count == 0) { + continue; + } + uint32_t memexport_format_size = + GetSupportedMemExportFormatSize(memexport_stream->format); + if (memexport_format_size == 0) { + XELOGE( + "Unsupported memexport format %s", + FormatInfo::Get(TextureFormat(uint32_t(memexport_stream->format))) + ->name); + return false; + } + uint32_t memexport_base_address = memexport_stream->base_address; + uint32_t memexport_size_dwords = + memexport_stream->index_count * memexport_format_size; + bool memexport_range_reused = false; + for (uint32_t i = 0; i < memexport_range_count; ++i) { + MemExportRange& memexport_range = memexport_ranges[i]; + if (memexport_range.base_address_dwords == memexport_base_address) { + memexport_range.size_dwords = + std::max(memexport_range.size_dwords, memexport_size_dwords); + memexport_range_reused = true; + break; + } + } + if (!memexport_range_reused) { + MemExportRange& memexport_range = + memexport_ranges[memexport_range_count++]; + memexport_range.base_address_dwords = memexport_base_address; + memexport_range.size_dwords = memexport_size_dwords; + } + } + } + for (uint32_t i = 0; i < memexport_range_count; ++i) { + const MemExportRange& memexport_range = memexport_ranges[i]; + if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2, + memexport_range.size_dwords << 2)) { + XELOGE( + "Failed to request memexport stream at 0x%.8X (size %u) in the " + "shared memory", + memexport_range.base_address_dwords << 2, + memexport_range.size_dwords << 2); + return false; + } + } + if (IsROVUsedForEDRAM()) { render_target_cache_->UseEDRAMAsUAV(); } + + // TODO(Triang3l): Copy the index buffer to a scratch buffer if using + // memexport with an index buffer, because a resource can't be an index buffer + // (read-only) and a UAV (read/write) at once. + + // Actually draw. if (indexed) { + if (memexport_used) { + // TODO(Triang3l): Index buffer copying for memexport. + return false; + } uint32_t index_size = index_buffer_info->format == IndexFormat::kInt32 ? sizeof(uint32_t) : sizeof(uint16_t); @@ -1388,7 +1524,13 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, index_count = converted_index_count; } else { uint32_t index_buffer_size = index_buffer_info->count * index_size; - shared_memory_->RequestRange(index_base, index_buffer_size); + if (!shared_memory_->RequestRange(index_base, index_buffer_size)) { + XELOGE( + "Failed to request index buffer at 0x%.8X (size %u) in the shared " + "memory", + index_base, index_buffer_size); + return false; + } index_buffer_view.BufferLocation = shared_memory_->GetGPUAddress() + index_base; index_buffer_view.SizeInBytes = index_buffer_size; @@ -1422,6 +1564,8 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, } } + // TODO(Triang3l): Read back memexported data if the respective gflag is set. + return true; } @@ -2855,6 +2999,36 @@ bool D3D12CommandProcessor::UpdateBindings( return true; } +uint32_t D3D12CommandProcessor::GetSupportedMemExportFormatSize( + ColorFormat format) { + switch (format) { + case ColorFormat::k_8_8_8_8: + case ColorFormat::k_2_10_10_10: + // TODO(Triang3l): Investigate how k_8_8_8_8_A works - not supported in the + // texture cache currently. + // case ColorFormat::k_8_8_8_8_A: + case ColorFormat::k_10_11_11: + case ColorFormat::k_11_11_10: + case ColorFormat::k_16_16: + case ColorFormat::k_16_16_FLOAT: + case ColorFormat::k_32_FLOAT: + case ColorFormat::k_8_8_8_8_AS_16_16_16_16: + case ColorFormat::k_2_10_10_10_AS_16_16_16_16: + case ColorFormat::k_10_11_11_AS_16_16_16_16: + case ColorFormat::k_11_11_10_AS_16_16_16_16: + return 1; + case ColorFormat::k_16_16_16_16: + case ColorFormat::k_16_16_16_16_FLOAT: + case ColorFormat::k_32_32_FLOAT: + return 2; + case ColorFormat::k_32_32_32_32_FLOAT: + return 4; + default: + break; + } + return 0; +} + } // namespace d3d12 } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 84ef78b74..329277ee3 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -209,6 +209,13 @@ class D3D12CommandProcessor : public CommandProcessor { const D3D12Shader* pixel_shader, ID3D12RootSignature* root_signature); + // Returns dword count for one element for a memexport format, or 0 if it's + // not supported by the D3D12 command processor (if it's smaller that 1 dword, + // for instance). + // TODO(Triang3l): Check if any game uses memexport with formats smaller than + // 32 bits per element. + static uint32_t GetSupportedMemExportFormatSize(ColorFormat format); + bool cache_clear_requested_ = false; std::unique_ptr