From e6fa0ad13993282a891009ff4a757d5dcc46fd7c Mon Sep 17 00:00:00 2001 From: Triang3l <triang3l@yandex.ru> Date: Sat, 19 Dec 2020 16:14:54 +0300 Subject: [PATCH] [GPU] Dynamic r# count via shader modifications + refactoring --- .../gpu/d3d12/d3d12_command_processor.cc | 189 +- src/xenia/gpu/d3d12/d3d12_command_processor.h | 2 +- src/xenia/gpu/d3d12/d3d12_shader.cc | 2 +- src/xenia/gpu/d3d12/d3d12_shader.h | 4 +- src/xenia/gpu/d3d12/pipeline_cache.cc | 360 ++-- src/xenia/gpu/d3d12/pipeline_cache.h | 42 +- src/xenia/gpu/d3d12/render_target_cache.cc | 6 +- src/xenia/gpu/d3d12/render_target_cache.h | 2 +- src/xenia/gpu/d3d12/texture_cache.cc | 8 +- src/xenia/gpu/d3d12/texture_cache.h | 4 +- src/xenia/gpu/dxbc_shader.cc | 2 +- src/xenia/gpu/dxbc_shader.h | 26 +- src/xenia/gpu/dxbc_shader_translator.cc | 262 +-- src/xenia/gpu/dxbc_shader_translator.h | 37 +- .../gpu/dxbc_shader_translator_memexport.cc | 2 +- src/xenia/gpu/dxbc_shader_translator_om.cc | 72 +- src/xenia/gpu/registers.h | 1 + src/xenia/gpu/shader.cc | 8 +- src/xenia/gpu/shader.h | 234 ++- src/xenia/gpu/shader_compiler_main.cc | 19 +- src/xenia/gpu/shader_translator.cc | 1792 ++++++++--------- src/xenia/gpu/shader_translator.h | 217 +- src/xenia/gpu/spirv_shader_translator.cc | 26 +- src/xenia/gpu/spirv_shader_translator.h | 16 +- src/xenia/gpu/ucode.h | 11 +- src/xenia/gpu/vulkan/pipeline_cache.cc | 17 +- src/xenia/gpu/vulkan/pipeline_cache.h | 6 +- src/xenia/gpu/vulkan/vulkan_shader.cc | 2 +- src/xenia/gpu/vulkan/vulkan_shader.h | 4 +- src/xenia/gpu/xenos.h | 27 - 30 files changed, 1684 insertions(+), 1716 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index f6af89881..95744b49c 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -99,14 +99,11 @@ void D3D12CommandProcessor::RestoreEdramSnapshot(const void* snapshot) { } uint32_t 
D3D12CommandProcessor::GetCurrentColorMask( - const Shader* pixel_shader) const { - if (pixel_shader == nullptr) { - return 0; - } + uint32_t shader_writes_color_targets) const { auto& regs = *register_file_; uint32_t color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32 & 0xFFFF; for (uint32_t i = 0; i < 4; ++i) { - if (!pixel_shader->writes_color_target(i)) { + if (!(shader_writes_color_targets & (1 << i))) { color_mask &= ~(0xF << (i * 4)); } } @@ -167,14 +164,18 @@ ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature( tessellated ? D3D12_SHADER_VISIBILITY_DOMAIN : D3D12_SHADER_VISIBILITY_VERTEX; - uint32_t texture_count_vertex, sampler_count_vertex; - vertex_shader->GetTextureBindings(texture_count_vertex); - vertex_shader->GetSamplerBindings(sampler_count_vertex); - uint32_t texture_count_pixel = 0, sampler_count_pixel = 0; - if (pixel_shader != nullptr) { - pixel_shader->GetTextureBindings(texture_count_pixel); - pixel_shader->GetSamplerBindings(sampler_count_pixel); - } + uint32_t texture_count_vertex = + uint32_t(vertex_shader->GetTextureBindingsAfterTranslation().size()); + uint32_t sampler_count_vertex = + uint32_t(vertex_shader->GetSamplerBindingsAfterTranslation().size()); + uint32_t texture_count_pixel = + pixel_shader + ? uint32_t(pixel_shader->GetTextureBindingsAfterTranslation().size()) + : 0; + uint32_t sampler_count_pixel = + pixel_shader + ? uint32_t(pixel_shader->GetSamplerBindingsAfterTranslation().size()) + : 0; // Better put the pixel texture/sampler in the lower bits probably because it // changes often. 
@@ -383,33 +384,26 @@ ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature( uint32_t D3D12CommandProcessor::GetRootBindfulExtraParameterIndices( const DxbcShader* vertex_shader, const DxbcShader* pixel_shader, RootBindfulExtraParameterIndices& indices_out) { - uint32_t texture_count_pixel = 0, sampler_count_pixel = 0; - if (pixel_shader != nullptr) { - pixel_shader->GetTextureBindings(texture_count_pixel); - pixel_shader->GetSamplerBindings(sampler_count_pixel); - } - uint32_t texture_count_vertex, sampler_count_vertex; - vertex_shader->GetTextureBindings(texture_count_vertex); - vertex_shader->GetSamplerBindings(sampler_count_vertex); - uint32_t index = kRootParameter_Bindful_Count_Base; - if (texture_count_pixel != 0) { + if (pixel_shader && + !pixel_shader->GetTextureBindingsAfterTranslation().empty()) { indices_out.textures_pixel = index++; } else { indices_out.textures_pixel = RootBindfulExtraParameterIndices::kUnavailable; } - if (sampler_count_pixel != 0) { + if (pixel_shader && + !pixel_shader->GetSamplerBindingsAfterTranslation().empty()) { indices_out.samplers_pixel = index++; } else { indices_out.samplers_pixel = RootBindfulExtraParameterIndices::kUnavailable; } - if (texture_count_vertex != 0) { + if (!vertex_shader->GetTextureBindingsAfterTranslation().empty()) { indices_out.textures_vertex = index++; } else { indices_out.textures_vertex = RootBindfulExtraParameterIndices::kUnavailable; } - if (sampler_count_vertex != 0) { + if (!vertex_shader->GetSamplerBindingsAfterTranslation().empty()) { indices_out.samplers_vertex = index++; } else { indices_out.samplers_vertex = @@ -1839,10 +1833,14 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // Need a pixel shader in normal color mode. return false; } + // Gather shader ucode information to get the color mask, which is needed by + // the render target cache, and memexport configuration, and also get the + // current shader modification bits. 
DxbcShaderTranslator::Modification vertex_shader_modification; DxbcShaderTranslator::Modification pixel_shader_modification; - if (!pipeline_cache_->GetCurrentShaderModifications( - vertex_shader_modification, pixel_shader_modification)) { + if (!pipeline_cache_->AnalyzeShaderUcodeAndGetCurrentModifications( + vertex_shader, pixel_shader, vertex_shader_modification, + pixel_shader_modification)) { return false; } D3D12Shader::D3D12Translation* vertex_shader_translation = @@ -1854,13 +1852,6 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, pixel_shader->GetOrCreateTranslation( pixel_shader_modification.value)) : nullptr; - // Translate the shaders now to get memexport configuration and color mask, - // which is needed by the render target cache, and also to get used textures - // and samplers. - if (!pipeline_cache_->EnsureShadersTranslated(vertex_shader_translation, - pixel_shader_translation)) { - return false; - } bool tessellated = vertex_shader_modification.host_vertex_shader_type != Shader::HostVertexShaderType::kVertex; @@ -1889,7 +1880,10 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, BeginSubmission(true); // Set up the render targets - this may bind pipelines. - if (!render_target_cache_->UpdateRenderTargets(pixel_shader)) { + uint32_t pixel_shader_writes_color_targets = + pixel_shader ? pixel_shader->writes_color_targets() : 0; + if (!render_target_cache_->UpdateRenderTargets( + pixel_shader_writes_color_targets)) { return false; } const RenderTargetCache::PipelineRenderTarget* pipeline_render_targets = @@ -1958,13 +1952,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, line_loop_closing_index = 0; } - // Update the textures - this may bind pipelines. - uint32_t used_texture_mask = - vertex_shader->GetUsedTextureMask() | - (pixel_shader != nullptr ? 
pixel_shader->GetUsedTextureMask() : 0); - texture_cache_->RequestTextures(used_texture_mask); - - // Create the pipeline if needed and bind it. + // Translate the shaders and create the pipeline if needed. void* pipeline_handle; ID3D12RootSignature* root_signature; if (!pipeline_cache_->ConfigurePipeline( @@ -1974,6 +1962,17 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, pipeline_render_targets, &pipeline_handle, &root_signature)) { return false; } + + // Update the textures - this may bind pipelines. + uint32_t used_texture_mask = + vertex_shader->GetUsedTextureMaskAfterTranslation() | + (pixel_shader != nullptr + ? pixel_shader->GetUsedTextureMaskAfterTranslation() + : 0); + texture_cache_->RequestTextures(used_texture_mask); + + // Bind the pipeline after configuring it and doing everything that may bind + // other pipelines. if (current_cached_pipeline_ != pipeline_handle) { deferred_command_list_.SetPipelineStateHandle( reinterpret_cast<void*>(pipeline_handle)); @@ -2026,7 +2025,9 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, memexport_used, primitive_polygonal, line_loop_closing_index, indexed ? index_buffer_info->endianness : xenos::Endian::kNone, viewport_info, pixel_size_x, pixel_size_y, used_texture_mask, - GetCurrentColorMask(pixel_shader), pipeline_render_targets); + pixel_shader ? GetCurrentColorMask(pixel_shader->writes_color_targets()) + : 0, + pipeline_render_targets); // Update constant buffers, descriptors and root parameters. 
if (!UpdateBindings(vertex_shader, pixel_shader, root_signature)) { @@ -2089,9 +2090,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, MemExportRange memexport_ranges[512]; uint32_t memexport_range_count = 0; if (memexport_used_vertex) { - const std::vector<uint32_t>& memexport_stream_constants_vertex = - vertex_shader->memexport_stream_constants(); - for (uint32_t constant_index : memexport_stream_constants_vertex) { + for (uint32_t constant_index : + vertex_shader->memexport_stream_constants()) { const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>( XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4); if (memexport_stream.index_count == 0) { @@ -2132,9 +2132,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } } if (memexport_used_pixel) { - const std::vector<uint32_t>& memexport_stream_constants_pixel = - pixel_shader->memexport_stream_constants(); - for (uint32_t constant_index : memexport_stream_constants_pixel) { + for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) { const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>( XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4); if (memexport_stream.index_count == 0) { @@ -3588,20 +3586,21 @@ bool D3D12CommandProcessor::UpdateBindings( vertex_shader->GetTextureBindingLayoutUserUID(); size_t sampler_layout_uid_vertex = vertex_shader->GetSamplerBindingLayoutUserUID(); - uint32_t texture_count_vertex, sampler_count_vertex; - const D3D12Shader::TextureBinding* textures_vertex = - vertex_shader->GetTextureBindings(texture_count_vertex); - const D3D12Shader::SamplerBinding* samplers_vertex = - vertex_shader->GetSamplerBindings(sampler_count_vertex); + const std::vector<D3D12Shader::TextureBinding>& textures_vertex = + vertex_shader->GetTextureBindingsAfterTranslation(); + const std::vector<D3D12Shader::SamplerBinding>& samplers_vertex = + vertex_shader->GetSamplerBindingsAfterTranslation(); + 
size_t texture_count_vertex = textures_vertex.size(); + size_t sampler_count_vertex = samplers_vertex.size(); if (sampler_count_vertex) { if (current_sampler_layout_uid_vertex_ != sampler_layout_uid_vertex) { current_sampler_layout_uid_vertex_ = sampler_layout_uid_vertex; cbuffer_binding_descriptor_indices_vertex_.up_to_date = false; bindful_samplers_written_vertex_ = false; } - current_samplers_vertex_.resize(std::max(current_samplers_vertex_.size(), - size_t(sampler_count_vertex))); - for (uint32_t i = 0; i < sampler_count_vertex; ++i) { + current_samplers_vertex_.resize( + std::max(current_samplers_vertex_.size(), sampler_count_vertex)); + for (size_t i = 0; i < sampler_count_vertex; ++i) { TextureCache::SamplerParameters parameters = texture_cache_->GetSamplerParameters(samplers_vertex[i]); if (current_samplers_vertex_[i] != parameters) { @@ -3615,14 +3614,16 @@ bool D3D12CommandProcessor::UpdateBindings( // Get textures and samplers used by the pixel shader, check if the last used // samplers are compatible and update them. 
size_t texture_layout_uid_pixel, sampler_layout_uid_pixel; - uint32_t texture_count_pixel, sampler_count_pixel; - const D3D12Shader::TextureBinding* textures_pixel; - const D3D12Shader::SamplerBinding* samplers_pixel; + const std::vector<D3D12Shader::TextureBinding>* textures_pixel; + const std::vector<D3D12Shader::SamplerBinding>* samplers_pixel; + size_t texture_count_pixel, sampler_count_pixel; if (pixel_shader != nullptr) { texture_layout_uid_pixel = pixel_shader->GetTextureBindingLayoutUserUID(); sampler_layout_uid_pixel = pixel_shader->GetSamplerBindingLayoutUserUID(); - textures_pixel = pixel_shader->GetTextureBindings(texture_count_pixel); - samplers_pixel = pixel_shader->GetSamplerBindings(sampler_count_pixel); + textures_pixel = &pixel_shader->GetTextureBindingsAfterTranslation(); + texture_count_pixel = textures_pixel->size(); + samplers_pixel = &pixel_shader->GetSamplerBindingsAfterTranslation(); + sampler_count_pixel = samplers_pixel->size(); if (sampler_count_pixel) { if (current_sampler_layout_uid_pixel_ != sampler_layout_uid_pixel) { current_sampler_layout_uid_pixel_ = sampler_layout_uid_pixel; @@ -3633,7 +3634,7 @@ bool D3D12CommandProcessor::UpdateBindings( size_t(sampler_count_pixel))); for (uint32_t i = 0; i < sampler_count_pixel; ++i) { TextureCache::SamplerParameters parameters = - texture_cache_->GetSamplerParameters(samplers_pixel[i]); + texture_cache_->GetSamplerParameters((*samplers_pixel)[i]); if (current_samplers_pixel_[i] != parameters) { current_samplers_pixel_[i] = parameters; cbuffer_binding_descriptor_indices_pixel_.up_to_date = false; @@ -3663,7 +3664,7 @@ bool D3D12CommandProcessor::UpdateBindings( cbuffer_binding_descriptor_indices_vertex_.up_to_date && (current_texture_layout_uid_vertex_ != texture_layout_uid_vertex || !texture_cache_->AreActiveTextureSRVKeysUpToDate( - current_texture_srv_keys_vertex_.data(), textures_vertex, + current_texture_srv_keys_vertex_.data(), textures_vertex.data(), texture_count_vertex))) { 
cbuffer_binding_descriptor_indices_vertex_.up_to_date = false; } @@ -3671,7 +3672,7 @@ bool D3D12CommandProcessor::UpdateBindings( cbuffer_binding_descriptor_indices_pixel_.up_to_date && (current_texture_layout_uid_pixel_ != texture_layout_uid_pixel || !texture_cache_->AreActiveTextureSRVKeysUpToDate( - current_texture_srv_keys_pixel_.data(), textures_pixel, + current_texture_srv_keys_pixel_.data(), textures_pixel->data(), texture_count_pixel))) { cbuffer_binding_descriptor_indices_pixel_.up_to_date = false; } @@ -3804,15 +3805,14 @@ bool D3D12CommandProcessor::UpdateBindings( uint32_t* descriptor_indices = reinterpret_cast<uint32_t*>(constant_buffer_pool_->Request( frame_current_, - std::max(texture_count_vertex + sampler_count_vertex, - uint32_t(1)) * + std::max(texture_count_vertex + sampler_count_vertex, size_t(1)) * sizeof(uint32_t), D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr, &cbuffer_binding_descriptor_indices_vertex_.address)); if (!descriptor_indices) { return false; } - for (uint32_t i = 0; i < texture_count_vertex; ++i) { + for (size_t i = 0; i < texture_count_vertex; ++i) { const D3D12Shader::TextureBinding& texture = textures_vertex[i]; descriptor_indices[texture.bindless_descriptor_index] = texture_cache_->GetActiveTextureBindlessSRVIndex(texture) - @@ -3824,11 +3824,11 @@ bool D3D12CommandProcessor::UpdateBindings( std::max(current_texture_srv_keys_vertex_.size(), size_t(texture_count_vertex))); texture_cache_->WriteActiveTextureSRVKeys( - current_texture_srv_keys_vertex_.data(), textures_vertex, + current_texture_srv_keys_vertex_.data(), textures_vertex.data(), texture_count_vertex); } // Current samplers have already been updated. 
- for (uint32_t i = 0; i < sampler_count_vertex; ++i) { + for (size_t i = 0; i < sampler_count_vertex; ++i) { descriptor_indices[samplers_vertex[i].bindless_descriptor_index] = current_sampler_bindless_indices_vertex_[i]; } @@ -3841,15 +3841,15 @@ bool D3D12CommandProcessor::UpdateBindings( uint32_t* descriptor_indices = reinterpret_cast<uint32_t*>(constant_buffer_pool_->Request( frame_current_, - std::max(texture_count_pixel + sampler_count_pixel, uint32_t(1)) * + std::max(texture_count_pixel + sampler_count_pixel, size_t(1)) * sizeof(uint32_t), D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr, &cbuffer_binding_descriptor_indices_pixel_.address)); if (!descriptor_indices) { return false; } - for (uint32_t i = 0; i < texture_count_pixel; ++i) { - const D3D12Shader::TextureBinding& texture = textures_pixel[i]; + for (size_t i = 0; i < texture_count_pixel; ++i) { + const D3D12Shader::TextureBinding& texture = (*textures_pixel)[i]; descriptor_indices[texture.bindless_descriptor_index] = texture_cache_->GetActiveTextureBindlessSRVIndex(texture) - uint32_t(SystemBindlessView::kUnboundedSRVsStart); @@ -3860,12 +3860,12 @@ bool D3D12CommandProcessor::UpdateBindings( std::max(current_texture_srv_keys_pixel_.size(), size_t(texture_count_pixel))); texture_cache_->WriteActiveTextureSRVKeys( - current_texture_srv_keys_pixel_.data(), textures_pixel, + current_texture_srv_keys_pixel_.data(), textures_pixel->data(), texture_count_pixel); } // Current samplers have already been updated. 
- for (uint32_t i = 0; i < sampler_count_pixel; ++i) { - descriptor_indices[samplers_pixel[i].bindless_descriptor_index] = + for (size_t i = 0; i < sampler_count_pixel; ++i) { + descriptor_indices[(*samplers_pixel)[i].bindless_descriptor_index] = current_sampler_bindless_indices_pixel_[i]; } cbuffer_binding_descriptor_indices_pixel_.up_to_date = true; @@ -3884,14 +3884,14 @@ bool D3D12CommandProcessor::UpdateBindings( (!bindful_textures_written_vertex_ || current_texture_layout_uid_vertex_ != texture_layout_uid_vertex || !texture_cache_->AreActiveTextureSRVKeysUpToDate( - current_texture_srv_keys_vertex_.data(), textures_vertex, + current_texture_srv_keys_vertex_.data(), textures_vertex.data(), texture_count_vertex)); bool write_textures_pixel = texture_count_pixel && (!bindful_textures_written_pixel_ || current_texture_layout_uid_pixel_ != texture_layout_uid_pixel || !texture_cache_->AreActiveTextureSRVKeysUpToDate( - current_texture_srv_keys_pixel_.data(), textures_pixel, + current_texture_srv_keys_pixel_.data(), textures_pixel->data(), texture_count_pixel)); bool write_samplers_vertex = sampler_count_vertex && !bindful_samplers_written_vertex_; @@ -3899,7 +3899,7 @@ bool D3D12CommandProcessor::UpdateBindings( sampler_count_pixel && !bindful_samplers_written_pixel_; // Allocate the descriptors. - uint32_t view_count_partial_update = 0; + size_t view_count_partial_update = 0; if (write_textures_vertex) { view_count_partial_update += texture_count_vertex; } @@ -3907,7 +3907,7 @@ bool D3D12CommandProcessor::UpdateBindings( view_count_partial_update += texture_count_pixel; } // All the constants + shared memory SRV and UAV + textures. - uint32_t view_count_full_update = + size_t view_count_full_update = 2 + texture_count_vertex + texture_count_pixel; if (edram_rov_used_) { // + EDRAM UAV. 
@@ -3917,14 +3917,14 @@ bool D3D12CommandProcessor::UpdateBindings( D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle; uint32_t descriptor_size_view = provider.GetViewDescriptorSize(); uint64_t view_heap_index = RequestViewBindfulDescriptors( - draw_view_bindful_heap_index_, view_count_partial_update, - view_count_full_update, view_cpu_handle, view_gpu_handle); + draw_view_bindful_heap_index_, uint32_t(view_count_partial_update), + uint32_t(view_count_full_update), view_cpu_handle, view_gpu_handle); if (view_heap_index == ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) { XELOGE("Failed to allocate view descriptors"); return false; } - uint32_t sampler_count_partial_update = 0; + size_t sampler_count_partial_update = 0; if (write_samplers_vertex) { sampler_count_partial_update += sampler_count_vertex; } @@ -3938,9 +3938,10 @@ bool D3D12CommandProcessor::UpdateBindings( ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid; if (sampler_count_vertex != 0 || sampler_count_pixel != 0) { sampler_heap_index = RequestSamplerBindfulDescriptors( - draw_sampler_bindful_heap_index_, sampler_count_partial_update, - sampler_count_vertex + sampler_count_pixel, sampler_cpu_handle, - sampler_gpu_handle); + draw_sampler_bindful_heap_index_, + uint32_t(sampler_count_partial_update), + uint32_t(sampler_count_vertex + sampler_count_pixel), + sampler_cpu_handle, sampler_gpu_handle); if (sampler_heap_index == ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) { XELOGE("Failed to allocate sampler descriptors"); @@ -3985,7 +3986,7 @@ bool D3D12CommandProcessor::UpdateBindings( assert_true(current_graphics_root_bindful_extras_.textures_vertex != RootBindfulExtraParameterIndices::kUnavailable); gpu_handle_textures_vertex_ = view_gpu_handle; - for (uint32_t i = 0; i < texture_count_vertex; ++i) { + for (size_t i = 0; i < texture_count_vertex; ++i) { texture_cache_->WriteActiveTextureBindfulSRV(textures_vertex[i], view_cpu_handle); view_cpu_handle.ptr += descriptor_size_view; @@ -3996,7 
+3997,7 @@ bool D3D12CommandProcessor::UpdateBindings( std::max(current_texture_srv_keys_vertex_.size(), size_t(texture_count_vertex))); texture_cache_->WriteActiveTextureSRVKeys( - current_texture_srv_keys_vertex_.data(), textures_vertex, + current_texture_srv_keys_vertex_.data(), textures_vertex.data(), texture_count_vertex); bindful_textures_written_vertex_ = true; current_graphics_root_up_to_date_ &= @@ -4006,8 +4007,8 @@ bool D3D12CommandProcessor::UpdateBindings( assert_true(current_graphics_root_bindful_extras_.textures_pixel != RootBindfulExtraParameterIndices::kUnavailable); gpu_handle_textures_pixel_ = view_gpu_handle; - for (uint32_t i = 0; i < texture_count_pixel; ++i) { - texture_cache_->WriteActiveTextureBindfulSRV(textures_pixel[i], + for (size_t i = 0; i < texture_count_pixel; ++i) { + texture_cache_->WriteActiveTextureBindfulSRV((*textures_pixel)[i], view_cpu_handle); view_cpu_handle.ptr += descriptor_size_view; view_gpu_handle.ptr += descriptor_size_view; @@ -4016,7 +4017,7 @@ bool D3D12CommandProcessor::UpdateBindings( current_texture_srv_keys_pixel_.resize(std::max( current_texture_srv_keys_pixel_.size(), size_t(texture_count_pixel))); texture_cache_->WriteActiveTextureSRVKeys( - current_texture_srv_keys_pixel_.data(), textures_pixel, + current_texture_srv_keys_pixel_.data(), textures_pixel->data(), texture_count_pixel); bindful_textures_written_pixel_ = true; current_graphics_root_up_to_date_ &= @@ -4026,7 +4027,7 @@ bool D3D12CommandProcessor::UpdateBindings( assert_true(current_graphics_root_bindful_extras_.samplers_vertex != RootBindfulExtraParameterIndices::kUnavailable); gpu_handle_samplers_vertex_ = sampler_gpu_handle; - for (uint32_t i = 0; i < sampler_count_vertex; ++i) { + for (size_t i = 0; i < sampler_count_vertex; ++i) { texture_cache_->WriteSampler(current_samplers_vertex_[i], sampler_cpu_handle); sampler_cpu_handle.ptr += descriptor_size_sampler; @@ -4041,7 +4042,7 @@ bool D3D12CommandProcessor::UpdateBindings( 
assert_true(current_graphics_root_bindful_extras_.samplers_pixel != RootBindfulExtraParameterIndices::kUnavailable); gpu_handle_samplers_pixel_ = sampler_gpu_handle; - for (uint32_t i = 0; i < sampler_count_pixel; ++i) { + for (size_t i = 0; i < sampler_count_pixel; ++i) { texture_cache_->WriteSampler(current_samplers_pixel_[i], sampler_cpu_handle); sampler_cpu_handle.ptr += descriptor_size_sampler; diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index a9181f1c3..fc72433fc 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -89,7 +89,7 @@ class D3D12CommandProcessor : public CommandProcessor { // there are 4 render targets bound with the same EDRAM base (clearly not // correct usage), but the shader only clears 1, and then EDRAM buffer stores // conflict with each other. - uint32_t GetCurrentColorMask(const Shader* pixel_shader) const; + uint32_t GetCurrentColorMask(uint32_t shader_writes_color_targets) const; void PushTransitionBarrier( ID3D12Resource* resource, D3D12_RESOURCE_STATES old_state, diff --git a/src/xenia/gpu/d3d12/d3d12_shader.cc b/src/xenia/gpu/d3d12/d3d12_shader.cc index 672f1e37d..eef4ca7de 100644 --- a/src/xenia/gpu/d3d12/d3d12_shader.cc +++ b/src/xenia/gpu/d3d12/d3d12_shader.cc @@ -99,7 +99,7 @@ void D3D12Shader::D3D12Translation::DisassembleDxbcAndDxil( } Shader::Translation* D3D12Shader::CreateTranslationInstance( - uint32_t modification) { + uint64_t modification) { return new D3D12Translation(*this, modification); } diff --git a/src/xenia/gpu/d3d12/d3d12_shader.h b/src/xenia/gpu/d3d12/d3d12_shader.h index 384e48a8a..b64681dc7 100644 --- a/src/xenia/gpu/d3d12/d3d12_shader.h +++ b/src/xenia/gpu/d3d12/d3d12_shader.h @@ -23,7 +23,7 @@ class D3D12Shader : public DxbcShader { public: class D3D12Translation : public DxbcTranslation { public: - D3D12Translation(D3D12Shader& shader, uint32_t modification) + 
D3D12Translation(D3D12Shader& shader, uint64_t modification) : DxbcTranslation(shader, modification) {} void DisassembleDxbcAndDxil(const ui::d3d12::D3D12Provider& provider, @@ -60,7 +60,7 @@ class D3D12Shader : public DxbcShader { } protected: - Translation* CreateTranslationInstance(uint32_t modification) override; + Translation* CreateTranslationInstance(uint64_t modification) override; private: std::atomic_flag binding_layout_user_uids_set_up_ = ATOMIC_FLAG_INIT; diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index cc9f5c9be..c29dd4c0d 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -18,6 +18,7 @@ #include <mutex> #include <set> #include <utility> +#include <vector> #include "third_party/fmt/include/fmt/format.h" #include "xenia/base/assert.h" @@ -29,6 +30,7 @@ #include "xenia/base/math.h" #include "xenia/base/profiling.h" #include "xenia/base/string.h" +#include "xenia/base/string_buffer.h" #include "xenia/base/xxhash.h" #include "xenia/gpu/d3d12/d3d12_command_processor.h" #include "xenia/gpu/gpu_flags.h" @@ -265,7 +267,7 @@ void PipelineCache::InitializeShaderStorage( // collect used shader modifications to translate. std::vector<PipelineStoredDescription> pipeline_stored_descriptions; // <Shader hash, modification bits>. 
- std::set<std::pair<uint64_t, uint32_t>> shader_translations_needed; + std::set<std::pair<uint64_t, uint64_t>> shader_translations_needed; auto pipeline_storage_file_path = shader_storage_shareable_root / fmt::format("{:08X}.{}.d3d12.xpso", title_id, @@ -292,7 +294,6 @@ void PipelineCache::InitializeShaderStorage( uint32_t magic; uint32_t magic_api; uint32_t version_swapped; - uint32_t device_features; } pipeline_storage_file_header; if (fread(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header), 1, pipeline_storage_file_) && @@ -331,6 +332,9 @@ void PipelineCache::InitializeShaderStorage( pipeline_stored_descriptions.resize(i); break; } + // TODO(Triang3l): On Vulkan, skip pipelines requiring unsupported + // device features (to keep the cache files mostly shareable across + // devices). // Mark the shader modifications as needed for translation. shader_translations_needed.emplace( pipeline_stored_description.description.vertex_shader_hash, @@ -391,14 +395,14 @@ void PipelineCache::InitializeShaderStorage( // Threads overlapping file reading. 
std::mutex shaders_translation_thread_mutex; std::condition_variable shaders_translation_thread_cond; - std::deque<std::pair<ShaderStoredHeader, D3D12Shader::D3D12Translation*>> - shaders_to_translate; + std::deque<D3D12Shader*> shaders_to_translate; size_t shader_translation_threads_busy = 0; bool shader_translation_threads_shutdown = false; std::mutex shaders_failed_to_translate_mutex; std::vector<D3D12Shader::D3D12Translation*> shaders_failed_to_translate; auto shader_translation_thread_function = [&]() { auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + StringBuffer ucode_disasm_buffer; DxbcShaderTranslator translator( provider.GetAdapterVendorID(), bindless_resources_used_, edram_rov_used_, provider.GetGraphicsAnalysis() != nullptr); @@ -416,8 +420,7 @@ void PipelineCache::InitializeShaderStorage( IID_PPV_ARGS(&dxc_compiler)); } for (;;) { - std::pair<ShaderStoredHeader, D3D12Shader::D3D12Translation*> - shader_to_translate; + D3D12Shader* shader_to_translate; for (;;) { std::unique_lock<std::mutex> lock(shaders_translation_thread_mutex); if (shaders_to_translate.empty()) { @@ -432,12 +435,29 @@ void PipelineCache::InitializeShaderStorage( ++shader_translation_threads_busy; break; } - assert_not_null(shader_to_translate.second); - if (!TranslateShader(translator, *shader_to_translate.second, - shader_to_translate.first.sq_program_cntl, - dxbc_converter, dxc_utils, dxc_compiler)) { - std::lock_guard<std::mutex> lock(shaders_failed_to_translate_mutex); - shaders_failed_to_translate.push_back(shader_to_translate.second); + shader_to_translate->AnalyzeUcode(ucode_disasm_buffer); + // Translate each needed modification on this thread after performing + // modification-independent analysis of the whole shader. 
+ uint64_t ucode_data_hash = shader_to_translate->ucode_data_hash(); + for (auto modification_it = shader_translations_needed.lower_bound( + std::make_pair(ucode_data_hash, uint64_t(0))); + modification_it != shader_translations_needed.end() && + modification_it->first == ucode_data_hash; + ++modification_it) { + D3D12Shader::D3D12Translation* translation = + static_cast<D3D12Shader::D3D12Translation*>( + shader_to_translate->GetOrCreateTranslation( + modification_it->second)); + // Only try (and delete in case of failure) if it's a new translation. + // If it's a shader previously encountered in the game, translation of + // which has failed, and the shader storage is loaded later, keep it + // this way not to try to translate it again. + if (!translation->is_translated() && + !TranslateAnalyzedShader(translator, *translation, dxbc_converter, + dxc_utils, dxc_compiler)) { + std::lock_guard<std::mutex> lock(shaders_failed_to_translate_mutex); + shaders_failed_to_translate.push_back(translation); + } } { std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex); @@ -477,59 +497,41 @@ void PipelineCache::InitializeShaderStorage( break; } shader_storage_valid_bytes += sizeof(shader_header) + ucode_byte_count; - // Only add the shader if needed. - auto modification_it = shader_translations_needed.lower_bound( - std::make_pair(ucode_data_hash, uint32_t(0))); - if (modification_it == shader_translations_needed.end() || - modification_it->first != ucode_data_hash) { - continue; - } D3D12Shader* shader = LoadShader(shader_header.type, ucode_dwords.data(), shader_header.ucode_dword_count, ucode_data_hash); + if (shader->ucode_storage_index() == shader_storage_index_) { + // Appeared twice in this file for some reason - skip, otherwise race + // condition will be caused by translating twice in parallel. + continue; + } // Loaded from the current storage - don't write again. 
shader->set_ucode_storage_index(shader_storage_index_); - // Translate all the needed modifications. - for (; modification_it != shader_translations_needed.end() && - modification_it->first == ucode_data_hash; - ++modification_it) { - bool translation_is_new; - D3D12Shader::D3D12Translation* translation = - static_cast<D3D12Shader::D3D12Translation*>( - shader->GetOrCreateTranslation(modification_it->second, - &translation_is_new)); - if (!translation_is_new) { - // Already added - usually shaders aren't added without the intention - // of translating them imminently, so don't do additional checks to - // actually ensure that translation happens right now (they would - // cause a race condition with shaders currently queued for - // translation). - continue; - } - // Create new threads if the currently existing threads can't keep up - // with file reading, but not more than the number of logical processors - // minus one. - size_t shader_translation_threads_needed; - { - std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex); - shader_translation_threads_needed = - std::min(shader_translation_threads_busy + - shaders_to_translate.size() + size_t(1), - logical_processor_count - size_t(1)); - } - while (shader_translation_threads.size() < - shader_translation_threads_needed) { - shader_translation_threads.push_back(xe::threading::Thread::Create( - {}, shader_translation_thread_function)); - shader_translation_threads.back()->set_name("Shader Translation"); - } - { - std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex); - shaders_to_translate.emplace_back(shader_header, translation); - } - shaders_translation_thread_cond.notify_one(); - ++shaders_translated; + // Create new threads if the currently existing threads can't keep up + // with file reading, but not more than the number of logical processors + // minus one. 
+ size_t shader_translation_threads_needed; + { + std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex); + shader_translation_threads_needed = + std::min(shader_translation_threads_busy + + shaders_to_translate.size() + size_t(1), + logical_processor_count - size_t(1)); } + while (shader_translation_threads.size() < + shader_translation_threads_needed) { + shader_translation_threads.push_back(xe::threading::Thread::Create( + {}, shader_translation_thread_function)); + shader_translation_threads.back()->set_name("Shader Translation"); + } + // Request ucode information gathering and translation of all the needed + // shaders. + { + std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex); + shaders_to_translate.push_back(shader); + } + shaders_translation_thread_cond.notify_one(); + ++shaders_translated; } if (!shader_translation_threads.empty()) { { @@ -593,6 +595,8 @@ void PipelineCache::InitializeShaderStorage( pipeline_stored_descriptions) { const PipelineDescription& pipeline_description = pipeline_stored_description.description; + // TODO(Triang3l): On Vulkan, skip pipelines requiring unsupported device + // features (to keep the cache files mostly shareable across devices). // Skip already known pipelines - those have already been enqueued. 
auto found_range = pipelines_.equal_range(pipeline_stored_description.description_hash); @@ -621,6 +625,7 @@ void PipelineCache::InitializeShaderStorage( vertex_shader->GetTranslation( pipeline_description.vertex_shader_modification)); if (!pipeline_runtime_description.vertex_shader || + !pipeline_runtime_description.vertex_shader->is_translated() || !pipeline_runtime_description.vertex_shader->is_valid()) { continue; } @@ -637,6 +642,7 @@ void PipelineCache::InitializeShaderStorage( pixel_shader->GetTranslation( pipeline_description.pixel_shader_modification)); if (!pipeline_runtime_description.pixel_shader || + !pipeline_runtime_description.pixel_shader->is_translated() || !pipeline_runtime_description.pixel_shader->is_valid()) { continue; } @@ -730,9 +736,6 @@ void PipelineCache::InitializeShaderStorage( pipeline_storage_file_header.magic_api = pipeline_storage_magic_api; pipeline_storage_file_header.version_swapped = pipeline_storage_version_swapped; - // Reserved for future (for Vulkan) - host device features affecting legal - // pipeline descriptions. 
- pipeline_storage_file_header.device_features = 0; fwrite(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header), 1, pipeline_storage_file_); } @@ -854,52 +857,68 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, return shader; } -bool PipelineCache::GetCurrentShaderModifications( +bool PipelineCache::AnalyzeShaderUcodeAndGetCurrentModifications( + D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, DxbcShaderTranslator::Modification& vertex_shader_modification_out, - DxbcShaderTranslator::Modification& pixel_shader_modification_out) const { + DxbcShaderTranslator::Modification& pixel_shader_modification_out) { Shader::HostVertexShaderType host_vertex_shader_type = GetCurrentHostVertexShaderTypeIfValid(); if (host_vertex_shader_type == Shader::HostVertexShaderType(-1)) { return false; } + const auto& regs = register_file_; + auto sq_program_cntl = regs.Get<reg::SQ_PROGRAM_CNTL>(); + + vertex_shader->AnalyzeUcode(ucode_disasm_buffer_); vertex_shader_modification_out = DxbcShaderTranslator::Modification( - shader_translator_->GetDefaultModification(xenos::ShaderType::kVertex, - host_vertex_shader_type)); - DxbcShaderTranslator::Modification pixel_shader_modification( - shader_translator_->GetDefaultModification(xenos::ShaderType::kPixel)); - if (!edram_rov_used_) { - const auto& regs = register_file_; - using DepthStencilMode = - DxbcShaderTranslator::Modification::DepthStencilMode; - if ((depth_float24_conversion_ == - flags::DepthFloat24Conversion::kOnOutputTruncating || - depth_float24_conversion_ == - flags::DepthFloat24Conversion::kOnOutputRounding) && - regs.Get<reg::RB_DEPTHCONTROL>().z_enable && - regs.Get<reg::RB_DEPTH_INFO>().depth_format == - xenos::DepthRenderTargetFormat::kD24FS8) { - pixel_shader_modification.depth_stencil_mode = - depth_float24_conversion_ == - flags::DepthFloat24Conversion::kOnOutputTruncating - ? 
DepthStencilMode::kFloat24Truncating - : DepthStencilMode::kFloat24Rounding; - } else { - // Hint to enable early depth/stencil writing if possible - whether it - // will actually take effect depends on the shader itself, it's not known - // before translation. - auto rb_colorcontrol = regs.Get<reg::RB_COLORCONTROL>(); - if ((!rb_colorcontrol.alpha_test_enable || - rb_colorcontrol.alpha_func == xenos::CompareFunction::kAlways) && - !rb_colorcontrol.alpha_to_mask_enable) { + shader_translator_->GetDefaultModification( + xenos::ShaderType::kVertex, + vertex_shader->GetDynamicAddressableRegisterCount( + sq_program_cntl.vs_num_reg), + host_vertex_shader_type)); + + if (pixel_shader) { + pixel_shader->AnalyzeUcode(ucode_disasm_buffer_); + DxbcShaderTranslator::Modification pixel_shader_modification( + shader_translator_->GetDefaultModification( + xenos::ShaderType::kPixel, + pixel_shader->GetDynamicAddressableRegisterCount( + sq_program_cntl.ps_num_reg))); + if (!edram_rov_used_) { + using DepthStencilMode = + DxbcShaderTranslator::Modification::DepthStencilMode; + if ((depth_float24_conversion_ == + flags::DepthFloat24Conversion::kOnOutputTruncating || + depth_float24_conversion_ == + flags::DepthFloat24Conversion::kOnOutputRounding) && + regs.Get<reg::RB_DEPTHCONTROL>().z_enable && + regs.Get<reg::RB_DEPTH_INFO>().depth_format == + xenos::DepthRenderTargetFormat::kD24FS8) { pixel_shader_modification.depth_stencil_mode = - DepthStencilMode::kEarlyHint; + depth_float24_conversion_ == + flags::DepthFloat24Conversion::kOnOutputTruncating + ? 
DepthStencilMode::kFloat24Truncating + : DepthStencilMode::kFloat24Rounding; } else { - pixel_shader_modification.depth_stencil_mode = - DepthStencilMode::kNoModifiers; + auto rb_colorcontrol = regs.Get<reg::RB_COLORCONTROL>(); + if (pixel_shader->implicit_early_z_write_allowed() && + (!rb_colorcontrol.alpha_test_enable || + rb_colorcontrol.alpha_func == xenos::CompareFunction::kAlways) && + !rb_colorcontrol.alpha_to_mask_enable) { + pixel_shader_modification.depth_stencil_mode = + DepthStencilMode::kEarlyHint; + } else { + pixel_shader_modification.depth_stencil_mode = + DepthStencilMode::kNoModifiers; + } } } + pixel_shader_modification_out = pixel_shader_modification; + } else { + pixel_shader_modification_out = DxbcShaderTranslator::Modification( + shader_translator_->GetDefaultModification(xenos::ShaderType::kPixel, + 0)); } - pixel_shader_modification_out = pixel_shader_modification; return true; } @@ -979,62 +998,6 @@ PipelineCache::GetCurrentHostVertexShaderTypeIfValid() const { return Shader::HostVertexShaderType(-1); } -bool PipelineCache::EnsureShadersTranslated( - D3D12Shader::D3D12Translation* vertex_shader, - D3D12Shader::D3D12Translation* pixel_shader) { - const auto& regs = register_file_; - auto sq_program_cntl = regs.Get<reg::SQ_PROGRAM_CNTL>(); - - // Edge flags are not supported yet (because polygon primitives are not). 
- assert_true(sq_program_cntl.vs_export_mode != - xenos::VertexShaderExportMode::kPosition2VectorsEdge && - sq_program_cntl.vs_export_mode != - xenos::VertexShaderExportMode::kPosition2VectorsEdgeKill); - assert_false(sq_program_cntl.gen_index_vtx); - - if (!vertex_shader->is_translated()) { - if (!TranslateShader(*shader_translator_, *vertex_shader, sq_program_cntl, - dxbc_converter_, dxc_utils_, dxc_compiler_)) { - XELOGE("Failed to translate the vertex shader!"); - return false; - } - if (shader_storage_file_ && vertex_shader->shader().ucode_storage_index() != - shader_storage_index_) { - vertex_shader->shader().set_ucode_storage_index(shader_storage_index_); - assert_not_null(storage_write_thread_); - shader_storage_file_flush_needed_ = true; - { - std::lock_guard<std::mutex> lock(storage_write_request_lock_); - storage_write_shader_queue_.push_back( - std::make_pair(&vertex_shader->shader(), sq_program_cntl)); - } - storage_write_request_cond_.notify_all(); - } - } - - if (pixel_shader != nullptr && !pixel_shader->is_translated()) { - if (!TranslateShader(*shader_translator_, *pixel_shader, sq_program_cntl, - dxbc_converter_, dxc_utils_, dxc_compiler_)) { - XELOGE("Failed to translate the pixel shader!"); - return false; - } - if (shader_storage_file_ && - pixel_shader->shader().ucode_storage_index() != shader_storage_index_) { - pixel_shader->shader().set_ucode_storage_index(shader_storage_index_); - assert_not_null(storage_write_thread_); - shader_storage_file_flush_needed_ = true; - { - std::lock_guard<std::mutex> lock(storage_write_request_lock_); - storage_write_shader_queue_.push_back( - std::make_pair(&pixel_shader->shader(), sq_program_cntl)); - } - storage_write_request_cond_.notify_all(); - } - } - - return true; -} - bool PipelineCache::ConfigurePipeline( D3D12Shader::D3D12Translation* vertex_shader, D3D12Shader::D3D12Translation* pixel_shader, @@ -1078,8 +1041,50 @@ bool PipelineCache::ConfigurePipeline( } } - if 
(!EnsureShadersTranslated(vertex_shader, pixel_shader)) { - return false; + // Ensure shaders are translated. + // Edge flags are not supported yet (because polygon primitives are not). + assert_true(register_file_.Get<reg::SQ_PROGRAM_CNTL>().vs_export_mode != + xenos::VertexShaderExportMode::kPosition2VectorsEdge && + register_file_.Get<reg::SQ_PROGRAM_CNTL>().vs_export_mode != + xenos::VertexShaderExportMode::kPosition2VectorsEdgeKill); + assert_false(register_file_.Get<reg::SQ_PROGRAM_CNTL>().gen_index_vtx); + if (!vertex_shader->is_translated()) { + vertex_shader->shader().AnalyzeUcode(ucode_disasm_buffer_); + if (!TranslateAnalyzedShader(*shader_translator_, *vertex_shader, + dxbc_converter_, dxc_utils_, dxc_compiler_)) { + XELOGE("Failed to translate the vertex shader!"); + return false; + } + if (shader_storage_file_ && vertex_shader->shader().ucode_storage_index() != + shader_storage_index_) { + vertex_shader->shader().set_ucode_storage_index(shader_storage_index_); + assert_not_null(storage_write_thread_); + shader_storage_file_flush_needed_ = true; + { + std::lock_guard<std::mutex> lock(storage_write_request_lock_); + storage_write_shader_queue_.push_back(&vertex_shader->shader()); + } + storage_write_request_cond_.notify_all(); + } + } + if (pixel_shader != nullptr && !pixel_shader->is_translated()) { + pixel_shader->shader().AnalyzeUcode(ucode_disasm_buffer_); + if (!TranslateAnalyzedShader(*shader_translator_, *pixel_shader, + dxbc_converter_, dxc_utils_, dxc_compiler_)) { + XELOGE("Failed to translate the pixel shader!"); + return false; + } + if (shader_storage_file_ && + pixel_shader->shader().ucode_storage_index() != shader_storage_index_) { + pixel_shader->shader().set_ucode_storage_index(shader_storage_index_); + assert_not_null(storage_write_thread_); + shader_storage_file_flush_needed_ = true; + { + std::lock_guard<std::mutex> lock(storage_write_request_lock_); + storage_write_shader_queue_.push_back(&pixel_shader->shader()); + } + 
storage_write_request_cond_.notify_all(); + } } Pipeline* new_pipeline = new Pipeline; @@ -1121,17 +1126,15 @@ bool PipelineCache::ConfigurePipeline( return true; } -bool PipelineCache::TranslateShader(DxbcShaderTranslator& translator, - D3D12Shader::D3D12Translation& translation, - reg::SQ_PROGRAM_CNTL cntl, - IDxbcConverter* dxbc_converter, - IDxcUtils* dxc_utils, - IDxcCompiler* dxc_compiler) { +bool PipelineCache::TranslateAnalyzedShader( + DxbcShaderTranslator& translator, + D3D12Shader::D3D12Translation& translation, IDxbcConverter* dxbc_converter, + IDxcUtils* dxc_utils, IDxcCompiler* dxc_compiler) { D3D12Shader& shader = static_cast<D3D12Shader&>(translation.shader()); // Perform translation. // If this fails the shader will be marked as invalid and ignored later. - if (!translator.Translate(translation, cntl)) { + if (!translator.TranslateAnalyzedShader(translation)) { XELOGE("Shader {:016X} translation failed; marking as ignored", shader.ucode_data_hash()); return false; @@ -1171,21 +1174,21 @@ bool PipelineCache::TranslateShader(DxbcShaderTranslator& translator, // Set up texture and sampler binding layouts. 
if (shader.EnterBindingLayoutUserUIDSetup()) { - uint32_t texture_binding_count; - const D3D12Shader::TextureBinding* texture_bindings = - shader.GetTextureBindings(texture_binding_count); - uint32_t sampler_binding_count; - const D3D12Shader::SamplerBinding* sampler_bindings = - shader.GetSamplerBindings(sampler_binding_count); + const std::vector<D3D12Shader::TextureBinding>& texture_bindings = + shader.GetTextureBindingsAfterTranslation(); + uint32_t texture_binding_count = uint32_t(texture_bindings.size()); + const std::vector<D3D12Shader::SamplerBinding>& sampler_bindings = + shader.GetSamplerBindingsAfterTranslation(); + uint32_t sampler_binding_count = uint32_t(sampler_bindings.size()); assert_false(bindless_resources_used_ && texture_binding_count + sampler_binding_count > D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * 4); size_t texture_binding_layout_bytes = - texture_binding_count * sizeof(*texture_bindings); + texture_binding_count * sizeof(*texture_bindings.data()); uint64_t texture_binding_layout_hash = 0; if (texture_binding_count) { texture_binding_layout_hash = - XXH3_64bits(texture_bindings, texture_binding_layout_bytes); + XXH3_64bits(texture_bindings.data(), texture_binding_layout_bytes); } uint32_t bindless_sampler_count = bindless_resources_used_ ? 
sampler_binding_count : 0; @@ -1223,7 +1226,8 @@ bool PipelineCache::TranslateShader(DxbcShaderTranslator& translator, if (it->second.vector_span_length == texture_binding_count && !std::memcmp(texture_binding_layouts_.data() + it->second.vector_span_offset, - texture_bindings, texture_binding_layout_bytes)) { + texture_bindings.data(), + texture_binding_layout_bytes)) { texture_binding_layout_uid = it->second.uid; break; } @@ -1242,7 +1246,7 @@ bool PipelineCache::TranslateShader(DxbcShaderTranslator& translator, texture_binding_count); std::memcpy( texture_binding_layouts_.data() + new_uid.vector_span_offset, - texture_bindings, texture_binding_layout_bytes); + texture_bindings.data(), texture_binding_layout_bytes); texture_binding_layout_map_.emplace(texture_binding_layout_hash, new_uid); } @@ -1576,8 +1580,10 @@ bool PipelineCache::GetCurrentStateDescription( // Render targets and blending state. 32 because of 0x1F mask, for safety // (all unknown to zero). - uint32_t color_mask = command_processor_.GetCurrentColorMask( - pixel_shader ? &pixel_shader->shader() : nullptr); + uint32_t color_mask = + pixel_shader ? 
command_processor_.GetCurrentColorMask( + pixel_shader->shader().writes_color_targets()) + : 0; static const PipelineBlendFactor kBlendFactorMap[32] = { /* 0 */ PipelineBlendFactor::kZero, /* 1 */ PipelineBlendFactor::kOne, @@ -2038,7 +2044,7 @@ void PipelineCache::StorageWriteThread() { fflush(pipeline_storage_file_); } - std::pair<const Shader*, reg::SQ_PROGRAM_CNTL> shader_pair = {}; + const Shader* shader = nullptr; PipelineStoredDescription pipeline_description; bool write_pipeline = false; { @@ -2047,7 +2053,7 @@ void PipelineCache::StorageWriteThread() { return; } if (!storage_write_shader_queue_.empty()) { - shader_pair = storage_write_shader_queue_.front(); + shader = storage_write_shader_queue_.front(); storage_write_shader_queue_.pop_front(); } else if (storage_write_flush_shaders_) { storage_write_flush_shaders_ = false; @@ -2063,18 +2069,16 @@ void PipelineCache::StorageWriteThread() { storage_write_flush_pipelines_ = false; flush_pipelines = true; } - if (!shader_pair.first && !write_pipeline) { + if (!shader && !write_pipeline) { storage_write_request_cond_.wait(lock); continue; } } - const Shader* shader = shader_pair.first; if (shader) { shader_header.ucode_data_hash = shader->ucode_data_hash(); shader_header.ucode_dword_count = shader->ucode_dword_count(); shader_header.type = shader->type(); - shader_header.sq_program_cntl = shader_pair.second; assert_not_null(shader_storage_file_); fwrite(&shader_header, sizeof(shader_header), 1, shader_storage_file_); if (shader_header.ucode_dword_count) { diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index fe867c82a..9a733e40a 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -23,6 +23,7 @@ #include "xenia/base/hash.h" #include "xenia/base/platform.h" +#include "xenia/base/string_buffer.h" #include "xenia/base/threading.h" #include "xenia/gpu/d3d12/d3d12_shader.h" #include "xenia/gpu/d3d12/render_target_cache.h" @@ -63,15 
+64,12 @@ class PipelineCache { D3D12Shader* LoadShader(xenos::ShaderType shader_type, const uint32_t* host_address, uint32_t dword_count); - // Retrieves the shader modifications for the current state, and returns - // whether they are valid. - bool GetCurrentShaderModifications( + // Ensures microcode is analyzed, retrieves the shader modifications for the + // current state, and returns whether they are valid. + bool AnalyzeShaderUcodeAndGetCurrentModifications( + D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, DxbcShaderTranslator::Modification& vertex_shader_modification_out, - DxbcShaderTranslator::Modification& pixel_shader_modification_out) const; - - // Translates shaders if needed, also making shader info up to date. - bool EnsureShadersTranslated(D3D12Shader::D3D12Translation* vertex_shader, - D3D12Shader::D3D12Translation* pixel_shader); + DxbcShaderTranslator::Modification& pixel_shader_modification_out); bool ConfigurePipeline( D3D12Shader::D3D12Translation* vertex_shader, @@ -93,9 +91,7 @@ class PipelineCache { uint32_t ucode_dword_count : 31; xenos::ShaderType type : 1; - reg::SQ_PROGRAM_CNTL sq_program_cntl; - - static constexpr uint32_t kVersion = 0x20201207; + static constexpr uint32_t kVersion = 0x20201219; }); // Update PipelineDescription::kVersion if any of the Pipeline* enums are @@ -171,10 +167,10 @@ class PipelineCache { XEPACKEDSTRUCT(PipelineDescription, { uint64_t vertex_shader_hash; + uint64_t vertex_shader_modification; // 0 if drawing without a pixel shader. 
uint64_t pixel_shader_hash; - uint32_t vertex_shader_modification; - uint32_t pixel_shader_modification; + uint64_t pixel_shader_modification; int32_t depth_bias; float depth_bias_slope_scaled; @@ -208,7 +204,7 @@ class PipelineCache { PipelineRenderTarget render_targets[4]; - static constexpr uint32_t kVersion = 0x20201207; + static constexpr uint32_t kVersion = 0x20201219; }); XEPACKEDSTRUCT(PipelineStoredDescription, { @@ -232,12 +228,11 @@ class PipelineCache { uint64_t data_hash); // Can be called from multiple threads. - bool TranslateShader(DxbcShaderTranslator& translator, - D3D12Shader::D3D12Translation& translation, - reg::SQ_PROGRAM_CNTL cntl, - IDxbcConverter* dxbc_converter = nullptr, - IDxcUtils* dxc_utils = nullptr, - IDxcCompiler* dxc_compiler = nullptr); + bool TranslateAnalyzedShader(DxbcShaderTranslator& translator, + D3D12Shader::D3D12Translation& translation, + IDxbcConverter* dxbc_converter = nullptr, + IDxcUtils* dxc_utils = nullptr, + IDxcCompiler* dxc_compiler = nullptr); bool GetCurrentStateDescription( D3D12Shader::D3D12Translation* vertex_shader, @@ -257,7 +252,9 @@ class PipelineCache { flags::DepthFloat24Conversion depth_float24_conversion_; uint32_t resolution_scale_; - // Reusable shader translator. + // Temporary storage for AnalyzeUcode calls on the processor thread. + StringBuffer ucode_disasm_buffer_; + // Reusable shader translator for the processor thread. std::unique_ptr<DxbcShaderTranslator> shader_translator_; // Command processor thread DXIL conversion/disassembly interfaces, if DXIL @@ -332,8 +329,7 @@ class PipelineCache { std::condition_variable storage_write_request_cond_; // Storage thread input is protected with storage_write_request_lock_, and the // thread is notified about its change via storage_write_request_cond_. 
- std::deque<std::pair<const Shader*, reg::SQ_PROGRAM_CNTL>> - storage_write_shader_queue_; + std::deque<const Shader*> storage_write_shader_queue_; std::deque<PipelineStoredDescription> storage_write_pipeline_queue_; bool storage_write_flush_shaders_ = false; bool storage_write_flush_pipelines_ = false; diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index 8669d58a3..f5a4e0c6b 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -535,7 +535,8 @@ void RenderTargetCache::EndFrame() { FlushAndUnbindRenderTargets(); } -bool RenderTargetCache::UpdateRenderTargets(const D3D12Shader* pixel_shader) { +bool RenderTargetCache::UpdateRenderTargets( + uint32_t shader_writes_color_targets) { // There are two kinds of render target binding updates in this implementation // in case something has been changed - full and partial. // @@ -635,7 +636,8 @@ bool RenderTargetCache::UpdateRenderTargets(const D3D12Shader* pixel_shader) { uint32_t edram_bases[5]; uint32_t formats[5]; bool formats_are_64bpp[5]; - uint32_t color_mask = command_processor_.GetCurrentColorMask(pixel_shader); + uint32_t color_mask = + command_processor_.GetCurrentColorMask(shader_writes_color_targets); for (uint32_t i = 0; i < 4; ++i) { enabled[i] = (color_mask & (0xF << (i * 4))) != 0; auto color_info = regs.Get<reg::RB_COLOR_INFO>( diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index 2f71c13c8..3bb0af399 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -269,7 +269,7 @@ class RenderTargetCache { void EndFrame(); // Called in the beginning of a draw call - may bind pipelines and change the // view descriptor heap. 
- bool UpdateRenderTargets(const D3D12Shader* pixel_shader); + bool UpdateRenderTargets(uint32_t shader_writes_color_targets); // Returns the host-to-guest mappings and host formats of currently bound // render targets for pipeline creation and remapping in shaders. They are // consecutive, and format DXGI_FORMAT_UNKNOWN terminates the list. Depth diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index e1f9bdcc4..99909f83f 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -1334,8 +1334,8 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) { bool TextureCache::AreActiveTextureSRVKeysUpToDate( const TextureSRVKey* keys, const D3D12Shader::TextureBinding* host_shader_bindings, - uint32_t host_shader_binding_count) const { - for (uint32_t i = 0; i < host_shader_binding_count; ++i) { + size_t host_shader_binding_count) const { + for (size_t i = 0; i < host_shader_binding_count; ++i) { const TextureSRVKey& key = keys[i]; const TextureBinding& binding = texture_bindings_[host_shader_bindings[i].fetch_constant]; @@ -1350,8 +1350,8 @@ bool TextureCache::AreActiveTextureSRVKeysUpToDate( void TextureCache::WriteActiveTextureSRVKeys( TextureSRVKey* keys, const D3D12Shader::TextureBinding* host_shader_bindings, - uint32_t host_shader_binding_count) const { - for (uint32_t i = 0; i < host_shader_binding_count; ++i) { + size_t host_shader_binding_count) const { + for (size_t i = 0; i < host_shader_binding_count; ++i) { TextureSRVKey& key = keys[i]; const TextureBinding& binding = texture_bindings_[host_shader_bindings[i].fetch_constant]; diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h index 85131f25d..465824755 100644 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ b/src/xenia/gpu/d3d12/texture_cache.h @@ -196,14 +196,14 @@ class TextureCache { bool AreActiveTextureSRVKeysUpToDate( const TextureSRVKey* keys, const D3D12Shader::TextureBinding* 
host_shader_bindings, - uint32_t host_shader_binding_count) const; + size_t host_shader_binding_count) const; // Exports the current binding data to texture SRV keys so they can be stored // for checking whether subsequent draw calls can keep using the same // bindings. Write host_shader_binding_count keys. void WriteActiveTextureSRVKeys( TextureSRVKey* keys, const D3D12Shader::TextureBinding* host_shader_bindings, - uint32_t host_shader_binding_count) const; + size_t host_shader_binding_count) const; // Returns the post-swizzle signedness of a currently bound texture (must be // called after RequestTextures). uint8_t GetActiveTextureSwizzledSigns(uint32_t index) const { diff --git a/src/xenia/gpu/dxbc_shader.cc b/src/xenia/gpu/dxbc_shader.cc index 144308d57..9b0243fca 100644 --- a/src/xenia/gpu/dxbc_shader.cc +++ b/src/xenia/gpu/dxbc_shader.cc @@ -19,7 +19,7 @@ DxbcShader::DxbcShader(xenos::ShaderType shader_type, uint64_t data_hash, : Shader(shader_type, data_hash, dword_ptr, dword_count) {} Shader::Translation* DxbcShader::CreateTranslationInstance( - uint32_t modification) { + uint64_t modification) { return new DxbcTranslation(*this, modification); } diff --git a/src/xenia/gpu/dxbc_shader.h b/src/xenia/gpu/dxbc_shader.h index 49439a2a6..477dfdc5d 100644 --- a/src/xenia/gpu/dxbc_shader.h +++ b/src/xenia/gpu/dxbc_shader.h @@ -10,6 +10,7 @@ #ifndef XENIA_GPU_DXBC_SHADER_H_ #define XENIA_GPU_DXBC_SHADER_H_ +#include <atomic> #include <vector> #include "xenia/gpu/dxbc_shader_translator.h" @@ -23,13 +24,17 @@ class DxbcShader : public Shader { public: class DxbcTranslation : public Translation { public: - DxbcTranslation(DxbcShader& shader, uint32_t modification) + DxbcTranslation(DxbcShader& shader, uint64_t modification) : Translation(shader, modification) {} }; DxbcShader(xenos::ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr, uint32_t dword_count); + // Resource bindings are gathered after the successful translation of any + // modification 
for simplicity of translation (and they don't depend on + // modification bits). + static constexpr uint32_t kMaxTextureBindingIndexBits = DxbcShaderTranslator::kMaxTextureBindingIndexBits; static constexpr uint32_t kMaxTextureBindings = @@ -43,11 +48,13 @@ class DxbcShader : public Shader { bool is_signed; }; // Safe to hash and compare with memcmp for layout hashing. - const TextureBinding* GetTextureBindings(uint32_t& count_out) const { - count_out = uint32_t(texture_bindings_.size()); - return texture_bindings_.data(); + const std::vector<TextureBinding>& GetTextureBindingsAfterTranslation() + const { + return texture_bindings_; + } + const uint32_t GetUsedTextureMaskAfterTranslation() const { + return used_texture_mask_; } - const uint32_t GetUsedTextureMask() const { return used_texture_mask_; } static constexpr uint32_t kMaxSamplerBindingIndexBits = DxbcShaderTranslator::kMaxSamplerBindingIndexBits; @@ -61,17 +68,18 @@ class DxbcShader : public Shader { xenos::TextureFilter mip_filter; xenos::AnisoFilter aniso_filter; }; - const SamplerBinding* GetSamplerBindings(uint32_t& count_out) const { - count_out = uint32_t(sampler_bindings_.size()); - return sampler_bindings_.data(); + const std::vector<SamplerBinding>& GetSamplerBindingsAfterTranslation() + const { + return sampler_bindings_; } protected: - Translation* CreateTranslationInstance(uint32_t modification) override; + Translation* CreateTranslationInstance(uint64_t modification) override; private: friend class DxbcShaderTranslator; + std::atomic_flag bindings_setup_entered_ = ATOMIC_FLAG_INIT; std::vector<TextureBinding> texture_bindings_; std::vector<SamplerBinding> sampler_bindings_; uint32_t used_texture_mask_ = 0; diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 865fbd77e..534355ce3 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -10,6 +10,7 @@ #include "xenia/gpu/dxbc_shader_translator.h" 
#include <algorithm> +#include <atomic> #include <cstring> #include <memory> @@ -78,16 +79,23 @@ DxbcShaderTranslator::DxbcShaderTranslator(uint32_t vendor_id, DxbcShaderTranslator::~DxbcShaderTranslator() = default; std::vector<uint8_t> DxbcShaderTranslator::CreateDepthOnlyPixelShader() { - Reset(xenos::ShaderType::kPixel); is_depth_only_pixel_shader_ = true; - StartTranslation(); - return std::move(CompleteTranslation()); + // TODO(Triang3l): Handle in a nicer way (is_depth_only_pixel_shader_ is a + // leftover from when a Shader object wasn't used during translation). + Shader shader(xenos::ShaderType::kPixel, 0, nullptr, 0); + shader.AnalyzeUcode(instruction_disassembly_buffer_); + Shader::Translation& translation = *shader.GetOrCreateTranslation(0); + TranslateAnalyzedShader(translation); + is_depth_only_pixel_shader_ = false; + return translation.translated_binary(); } -uint32_t DxbcShaderTranslator::GetDefaultModification( - xenos::ShaderType shader_type, +uint64_t DxbcShaderTranslator::GetDefaultModification( + xenos::ShaderType shader_type, uint32_t dynamic_addressable_register_count, Shader::HostVertexShaderType host_vertex_shader_type) const { Modification shader_modification; + shader_modification.dynamic_addressable_register_count = + dynamic_addressable_register_count; switch (shader_type) { case xenos::ShaderType::kVertex: shader_modification.host_vertex_shader_type = host_vertex_shader_type; @@ -100,13 +108,11 @@ uint32_t DxbcShaderTranslator::GetDefaultModification( return shader_modification.value; } -void DxbcShaderTranslator::Reset(xenos::ShaderType shader_type) { - ShaderTranslator::Reset(shader_type); +void DxbcShaderTranslator::Reset() { + ShaderTranslator::Reset(); shader_code_.clear(); - is_depth_only_pixel_shader_ = false; - cbuffer_count_ = 0; // System constants always used in prologues/epilogues. 
cbuffer_index_system_constants_ = cbuffer_count_++; @@ -231,6 +237,10 @@ void DxbcShaderTranslator::DxbcSrc::Write(std::vector<uint32_t>& code, } } +uint32_t DxbcShaderTranslator::GetModificationRegisterCount() const { + return GetDxbcShaderModification().dynamic_addressable_register_count; +} + bool DxbcShaderTranslator::UseSwitchForControlFlow() const { // Xenia crashes on Intel HD Graphics 4000 with switch. return cvars::dxbc_switch && vendor_id_ != 0x8086; @@ -239,7 +249,8 @@ bool DxbcShaderTranslator::UseSwitchForControlFlow() const { uint32_t DxbcShaderTranslator::PushSystemTemp(uint32_t zero_mask, uint32_t count) { uint32_t register_index = system_temp_count_current_; - if (!uses_register_dynamic_addressing() && !is_depth_only_pixel_shader_) { + if (!is_depth_only_pixel_shader_ && + !current_shader().uses_register_dynamic_addressing()) { // Guest shader registers first if they're not in x0. Depth-only pixel // shader is a special case of the DXBC translator usage, where there are no // GPRs because there's no shader to translate, and a guest shader is not @@ -327,10 +338,13 @@ void DxbcShaderTranslator::StartVertexShader_LoadVertexIndex() { return; } + bool uses_register_dynamic_addressing = + current_shader().uses_register_dynamic_addressing(); + // Writing the index to X of GPR 0 - either directly if not using indexable // registers, or via a system temporary register. uint32_t reg; - if (uses_register_dynamic_addressing()) { + if (uses_register_dynamic_addressing) { reg = PushSystemTemp(); } else { reg = 0; @@ -392,7 +406,7 @@ void DxbcShaderTranslator::StartVertexShader_LoadVertexIndex() { DxbcOpBreak(); DxbcOpEndSwitch(); - if (!uses_register_dynamic_addressing()) { + if (!uses_register_dynamic_addressing) { // Break register dependency. DxbcOpMov(swap_temp_dest, DxbcSrc::LF(0.0f)); } @@ -409,7 +423,7 @@ void DxbcShaderTranslator::StartVertexShader_LoadVertexIndex() { // Convert to float. 
DxbcOpIToF(index_dest, index_src); - if (uses_register_dynamic_addressing()) { + if (uses_register_dynamic_addressing) { // Store to indexed GPR 0 in x0[0]. DxbcOpMov(DxbcDest::X(0, 0, 0b0001), index_src); PopSystemTemp(); @@ -417,6 +431,9 @@ void DxbcShaderTranslator::StartVertexShader_LoadVertexIndex() { } void DxbcShaderTranslator::StartVertexOrDomainShader() { + bool uses_register_dynamic_addressing = + current_shader().uses_register_dynamic_addressing(); + // Zero the interpolators. for (uint32_t i = 0; i < xenos::kMaxInterpolators; ++i) { DxbcOpMov(DxbcDest::O(uint32_t(InOutRegister::kVSDSOutInterpolators) + i), @@ -438,13 +455,13 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() { // Copy the domain location to r0.xyz. // ZYX swizzle according to Call of Duty 3 and Viva Pinata. in_domain_location_used_ |= 0b0111; - DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, 0, 0b0111) - : DxbcDest::R(0, 0b0111), + DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, 0, 0b0111) + : DxbcDest::R(0, 0b0111), DxbcSrc::VDomain(0b000110)); if (register_count() >= 2) { // Copy the control point indices (already swapped and converted to // float by the host vertex and hull shaders) to r1.xyz. - DxbcDest control_point_index_dest(uses_register_dynamic_addressing() + DxbcDest control_point_index_dest(uses_register_dynamic_addressing ? DxbcDest::X(0, 1) : DxbcDest::R(1)); in_control_point_index_used_ = true; @@ -465,16 +482,16 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() { // ZYX swizzle with r1.y == 0, according to the water shader in // Banjo-Kazooie: Nuts & Bolts. in_domain_location_used_ |= 0b0111; - DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, 0, 0b0111) - : DxbcDest::R(0, 0b0111), + DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, 0, 0b0111) + : DxbcDest::R(0, 0b0111), DxbcSrc::VDomain(0b000110)); if (register_count() >= 2) { // Copy the primitive index to r1.x as a float. 
uint32_t primitive_id_temp = - uses_register_dynamic_addressing() ? PushSystemTemp() : 1; + uses_register_dynamic_addressing ? PushSystemTemp() : 1; in_primitive_id_used_ = true; DxbcOpUToF(DxbcDest::R(primitive_id_temp, 0b0001), DxbcSrc::VPrim()); - if (uses_register_dynamic_addressing()) { + if (uses_register_dynamic_addressing) { DxbcOpMov(DxbcDest::X(0, 1, 0b0001), DxbcSrc::R(primitive_id_temp, DxbcSrc::kXXXX)); // Release primitive_id_temp. @@ -499,9 +516,8 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() { // // Direct3D 12 passes the coordinates in a consistent order, so can // just use the identity swizzle. - DxbcOpMov(uses_register_dynamic_addressing() - ? DxbcDest::X(0, 1, 0b0010) - : DxbcDest::R(1, 0b0010), + DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, 1, 0b0010) + : DxbcDest::R(1, 0b0010), DxbcSrc::LF(0.0f)); } } @@ -512,8 +528,8 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() { if (register_count() >= 1) { // Copy the domain location to r0.xy. in_domain_location_used_ |= 0b0011; - DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, 0, 0b0011) - : DxbcDest::R(0, 0b0011), + DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, 0, 0b0011) + : DxbcDest::R(0, 0b0011), DxbcSrc::VDomain()); // Control point indices according to the shader from the main menu of // Defender, which starts from `cndeq r2, c255.xxxy, r1.xyzz, r0.zzzz`, @@ -524,14 +540,13 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() { // r1.z for (1 - r0.x) * r0.y in_control_point_index_used_ = true; DxbcOpMov( - uses_register_dynamic_addressing() ? DxbcDest::X(0, 0, 0b0100) - : DxbcDest::R(0, 0b0100), + uses_register_dynamic_addressing ? DxbcDest::X(0, 0, 0b0100) + : DxbcDest::R(0, 0b0100), DxbcSrc::VICP(0, uint32_t(InOutRegister::kDSInControlPointIndex), DxbcSrc::kXXXX)); if (register_count() >= 2) { - DxbcDest r1_dest(uses_register_dynamic_addressing() - ? 
DxbcDest::X(0, 1) - : DxbcDest::R(1)); + DxbcDest r1_dest(uses_register_dynamic_addressing ? DxbcDest::X(0, 1) + : DxbcDest::R(1)); for (uint32_t i = 0; i < 3; ++i) { DxbcOpMov( r1_dest.Mask(1 << i), @@ -549,15 +564,15 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() { // Copy the domain location to r0.yz. // XY swizzle according to the ground shader in Viva Pinata. in_domain_location_used_ |= 0b0011; - DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, 0, 0b0110) - : DxbcDest::R(0, 0b0110), + DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, 0, 0b0110) + : DxbcDest::R(0, 0b0110), DxbcSrc::VDomain(0b010000)); // Copy the primitive index to r0.x as a float. uint32_t primitive_id_temp = - uses_register_dynamic_addressing() ? PushSystemTemp() : 0; + uses_register_dynamic_addressing ? PushSystemTemp() : 0; in_primitive_id_used_ = true; DxbcOpUToF(DxbcDest::R(primitive_id_temp, 0b0001), DxbcSrc::VPrim()); - if (uses_register_dynamic_addressing()) { + if (uses_register_dynamic_addressing) { DxbcOpMov(DxbcDest::X(0, 0, 0b0001), DxbcSrc::R(primitive_id_temp, DxbcSrc::kXXXX)); // Release primitive_id_temp. @@ -578,9 +593,8 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() { // // Direct3D 12 passes the coordinates in a consistent order, so can // just use the identity swizzle. - DxbcOpMov(uses_register_dynamic_addressing() - ? DxbcDest::X(0, 1, 0b0001) - : DxbcDest::R(1, 0b0001), + DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, 1, 0b0001) + : DxbcDest::R(1, 0b0001), DxbcSrc::LF(0.0f)); } } @@ -611,7 +625,10 @@ void DxbcShaderTranslator::StartPixelShader() { return; } - if (!edram_rov_used_ && writes_depth()) { + bool uses_register_dynamic_addressing = + current_shader().uses_register_dynamic_addressing(); + + if (!edram_rov_used_ && current_shader().writes_depth()) { // Initialize the depth output if used, which must be written to regardless // of the taken execution path. 
DxbcOpMov(DxbcDest::ODepth(), DxbcSrc::LF(0.0f)); @@ -623,7 +640,7 @@ void DxbcShaderTranslator::StartPixelShader() { // Copy interpolants to GPRs. if (edram_rov_used_) { uint32_t centroid_temp = - uses_register_dynamic_addressing() ? PushSystemTemp() : UINT32_MAX; + uses_register_dynamic_addressing ? PushSystemTemp() : UINT32_MAX; system_constants_used_ |= 1ull << kSysConst_InterpolatorSamplingPattern_Index; DxbcSrc sampling_pattern_src( @@ -635,7 +652,7 @@ void DxbcShaderTranslator::StartPixelShader() { // With GPR dynamic addressing, first evaluate to centroid_temp r#, then // store to the x#. uint32_t centroid_register = - uses_register_dynamic_addressing() ? centroid_temp : i; + uses_register_dynamic_addressing ? centroid_temp : i; // Check if the input needs to be interpolated at center (if the bit is // set). DxbcOpAnd(DxbcDest::R(centroid_register, 0b0001), sampling_pattern_src, @@ -643,8 +660,8 @@ void DxbcShaderTranslator::StartPixelShader() { DxbcOpIf(bool(xenos::SampleLocation::kCenter), DxbcSrc::R(centroid_register, DxbcSrc::kXXXX)); // At center. - DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, i) - : DxbcDest::R(i), + DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, i) + : DxbcDest::R(i), DxbcSrc::V(uint32_t(InOutRegister::kPSInInterpolators) + i)); DxbcOpElse(); // At centroid. Not really important that 2x MSAA is emulated using @@ -653,7 +670,7 @@ void DxbcShaderTranslator::StartPixelShader() { DxbcOpEvalCentroid( DxbcDest::R(centroid_register), DxbcSrc::V(uint32_t(InOutRegister::kPSInInterpolators) + i)); - if (uses_register_dynamic_addressing()) { + if (uses_register_dynamic_addressing) { DxbcOpMov(DxbcDest::X(0, i), DxbcSrc::R(centroid_register)); } DxbcOpEndIf(); @@ -665,8 +682,8 @@ void DxbcShaderTranslator::StartPixelShader() { // SSAA instead of MSAA without ROV - everything is interpolated at // samples, can't extrapolate. 
for (uint32_t i = 0; i < interpolator_count; ++i) { - DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, i) - : DxbcDest::R(i), + DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, i) + : DxbcDest::R(i), DxbcSrc::V(uint32_t(InOutRegister::kPSInInterpolators) + i)); } } @@ -781,7 +798,7 @@ void DxbcShaderTranslator::StartPixelShader() { } // Write ps_param_gen to the specified GPR. DxbcSrc param_gen_src(DxbcSrc::R(param_gen_temp)); - if (uses_register_dynamic_addressing()) { + if (uses_register_dynamic_addressing) { // Copy the GPR number to r# for relative addressing. uint32_t param_gen_copy_temp = PushSystemTemp(); DxbcOpMov(DxbcDest::R(param_gen_copy_temp, 0b0001), @@ -863,10 +880,12 @@ void DxbcShaderTranslator::StartTranslation() { // by the guest code, so initialize because assumptions can't be made // about the integrity of the guest code. system_temp_depth_stencil_ = - PushSystemTemp(writes_depth() ? 0b0001 : 0b1111); + PushSystemTemp(current_shader().writes_depth() ? 0b0001 : 0b1111); } + uint32_t shader_writes_color_targets = + current_shader().writes_color_targets(); for (uint32_t i = 0; i < 4; ++i) { - if (writes_color_target(i)) { + if (shader_writes_color_targets & (1 << i)) { system_temps_color_[i] = PushSystemTemp(0b1111); } } @@ -879,8 +898,8 @@ void DxbcShaderTranslator::StartTranslation() { std::memset(system_temps_memexport_data_, 0xFF, sizeof(system_temps_memexport_data_)); system_temp_memexport_written_ = UINT32_MAX; - const uint8_t* memexports_written = memexport_eM_written(); - for (uint32_t i = 0; i < kMaxMemExports; ++i) { + const uint8_t* memexports_written = current_shader().memexport_eM_written(); + for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) { uint32_t memexport_alloc_written = memexports_written[i]; if (memexport_alloc_written == 0) { continue; @@ -915,8 +934,9 @@ void DxbcShaderTranslator::StartTranslation() { // references them after only initializing them conditionally. 
for (uint32_t i = is_pixel_shader() ? xenos::kMaxInterpolators : 0; i < register_count(); ++i) { - DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, i) - : DxbcDest::R(i), + DxbcOpMov(current_shader().uses_register_dynamic_addressing() + ? DxbcDest::X(0, i) + : DxbcDest::R(i), DxbcSrc::LF(0.0f)); } } @@ -1120,7 +1140,7 @@ void DxbcShaderTranslator::CompleteShaderCode() { ExportToMemory(); // Release memexport temporary registers. - for (int i = kMaxMemExports - 1; i >= 0; --i) { + for (int i = Shader::kMaxMemExports - 1; i >= 0; --i) { if (system_temps_memexport_address_[i] == UINT32_MAX) { continue; } @@ -1154,8 +1174,10 @@ void DxbcShaderTranslator::CompleteShaderCode() { PopSystemTemp(2); } else if (is_pixel_shader()) { // Release system_temps_color_. + uint32_t shader_writes_color_targets = + current_shader().writes_color_targets(); for (int32_t i = 3; i >= 0; --i) { - if (writes_color_target(i)) { + if (shader_writes_color_targets & (1 << i)) { PopSystemTemp(); } } @@ -1274,40 +1296,42 @@ std::vector<uint8_t> DxbcShaderTranslator::CompleteTranslation() { return shader_object_bytes; } -void DxbcShaderTranslator::PostTranslation( - Shader::Translation& translation, bool setup_shader_post_translation_info) { - if (setup_shader_post_translation_info) { - DxbcShader* dxbc_shader = dynamic_cast<DxbcShader*>(&translation.shader()); - if (dxbc_shader) { - dxbc_shader->texture_bindings_.clear(); - dxbc_shader->texture_bindings_.reserve(texture_bindings_.size()); - dxbc_shader->used_texture_mask_ = 0; - for (const TextureBinding& translator_binding : texture_bindings_) { - DxbcShader::TextureBinding& shader_binding = - dxbc_shader->texture_bindings_.emplace_back(); - // For a stable hash. 
- std::memset(&shader_binding, 0, sizeof(shader_binding)); - shader_binding.bindless_descriptor_index = - translator_binding.bindless_descriptor_index; - shader_binding.fetch_constant = translator_binding.fetch_constant; - shader_binding.dimension = translator_binding.dimension; - shader_binding.is_signed = translator_binding.is_signed; - dxbc_shader->used_texture_mask_ |= 1u - << translator_binding.fetch_constant; - } - dxbc_shader->sampler_bindings_.clear(); - dxbc_shader->sampler_bindings_.reserve(sampler_bindings_.size()); - for (const SamplerBinding& translator_binding : sampler_bindings_) { - DxbcShader::SamplerBinding& shader_binding = - dxbc_shader->sampler_bindings_.emplace_back(); - shader_binding.bindless_descriptor_index = - translator_binding.bindless_descriptor_index; - shader_binding.fetch_constant = translator_binding.fetch_constant; - shader_binding.mag_filter = translator_binding.mag_filter; - shader_binding.min_filter = translator_binding.min_filter; - shader_binding.mip_filter = translator_binding.mip_filter; - shader_binding.aniso_filter = translator_binding.aniso_filter; - } +void DxbcShaderTranslator::PostTranslation() { + Shader::Translation& translation = current_translation(); + if (!translation.is_valid()) { + return; + } + DxbcShader* dxbc_shader = dynamic_cast<DxbcShader*>(&translation.shader()); + if (dxbc_shader && !dxbc_shader->bindings_setup_entered_.test_and_set( + std::memory_order_relaxed)) { + dxbc_shader->texture_bindings_.clear(); + dxbc_shader->texture_bindings_.reserve(texture_bindings_.size()); + dxbc_shader->used_texture_mask_ = 0; + for (const TextureBinding& translator_binding : texture_bindings_) { + DxbcShader::TextureBinding& shader_binding = + dxbc_shader->texture_bindings_.emplace_back(); + // For a stable hash. 
+ std::memset(&shader_binding, 0, sizeof(shader_binding)); + shader_binding.bindless_descriptor_index = + translator_binding.bindless_descriptor_index; + shader_binding.fetch_constant = translator_binding.fetch_constant; + shader_binding.dimension = translator_binding.dimension; + shader_binding.is_signed = translator_binding.is_signed; + dxbc_shader->used_texture_mask_ |= 1u + << translator_binding.fetch_constant; + } + dxbc_shader->sampler_bindings_.clear(); + dxbc_shader->sampler_bindings_.reserve(sampler_bindings_.size()); + for (const SamplerBinding& translator_binding : sampler_bindings_) { + DxbcShader::SamplerBinding& shader_binding = + dxbc_shader->sampler_bindings_.emplace_back(); + shader_binding.bindless_descriptor_index = + translator_binding.bindless_descriptor_index; + shader_binding.fetch_constant = translator_binding.fetch_constant; + shader_binding.mag_filter = translator_binding.mag_filter; + shader_binding.min_filter = translator_binding.min_filter; + shader_binding.mip_filter = translator_binding.mip_filter; + shader_binding.aniso_filter = translator_binding.aniso_filter; } } } @@ -1373,7 +1397,7 @@ DxbcShaderTranslator::DxbcSrc DxbcShaderTranslator::LoadOperand( DxbcSrc src(DxbcSrc::LF(0.0f)); switch (operand.storage_source) { case InstructionStorageSource::kRegister: { - if (uses_register_dynamic_addressing()) { + if (current_shader().uses_register_dynamic_addressing()) { // Load x#[#] to r# because x#[#] can be used only with mov. 
uint32_t temp = PushSystemTemp(); temp_pushed_out = true; @@ -1402,10 +1426,12 @@ DxbcShaderTranslator::DxbcSrc DxbcShaderTranslator::LoadOperand( if (cbuffer_index_float_constants_ == kBindingIndexUnallocated) { cbuffer_index_float_constants_ = cbuffer_count_++; } + const Shader::ConstantRegisterMap& constant_register_map = + current_shader().constant_register_map(); if (operand.storage_addressing_mode == InstructionStorageAddressingMode::kStatic) { uint32_t float_constant_index = - constant_register_map().GetPackedFloatConstantIndex( + constant_register_map.GetPackedFloatConstantIndex( operand.storage_index); assert_true(float_constant_index != UINT32_MAX); if (float_constant_index == UINT32_MAX) { @@ -1413,7 +1439,7 @@ DxbcShaderTranslator::DxbcSrc DxbcShaderTranslator::LoadOperand( } index.index_ = float_constant_index; } else { - assert_true(constant_register_map().float_dynamic_addressing); + assert_true(constant_register_map.float_dynamic_addressing); } src = DxbcSrc::CB(cbuffer_index_float_constants_, uint32_t(CbufferRegister::kFloatConstants), index); @@ -1453,7 +1479,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, case InstructionStorageTarget::kNone: return; case InstructionStorageTarget::kRegister: - if (uses_register_dynamic_addressing()) { + if (current_shader().uses_register_dynamic_addressing()) { DxbcIndex register_index(result.storage_index); switch (result.storage_addressing_mode) { case InstructionStorageAddressingMode::kStatic: @@ -1488,7 +1514,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, case InstructionStorageTarget::kExportAddress: // Validate memexport writes (Halo 3 has some weird invalid ones). 
if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 || - memexport_alloc_current_count_ > kMaxMemExports || + memexport_alloc_current_count_ > Shader::kMaxMemExports || system_temps_memexport_address_[memexport_alloc_current_count_ - 1] == UINT32_MAX) { return; @@ -1499,7 +1525,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, case InstructionStorageTarget::kExportData: { // Validate memexport writes (Halo 3 has some weird invalid ones). if (memexport_alloc_current_count_ == 0 || - memexport_alloc_current_count_ > kMaxMemExports || + memexport_alloc_current_count_ > Shader::kMaxMemExports || system_temps_memexport_data_[memexport_alloc_current_count_ - 1] [result.storage_index] == UINT32_MAX) { return; @@ -1519,7 +1545,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, } break; case InstructionStorageTarget::kColor: assert_not_zero(used_write_mask); - assert_true(writes_color_target(result.storage_index)); + assert_true(current_shader().writes_color_target(result.storage_index)); dest = DxbcDest::R(system_temps_color_[result.storage_index]); if (edram_rov_used_) { // For ROV output, mark that the color has been written to. @@ -1539,7 +1565,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, // Writes X to scalar oDepth or to X of system_temp_depth_stencil_, no // additional swizzling needed. 
assert_true(used_write_mask == 0b0001); - assert_true(writes_depth()); + assert_true(current_shader().writes_depth()); if (IsDepthStencilSystemTempUsed()) { dest = DxbcDest::R(system_temp_depth_stencil_); } else { @@ -2077,6 +2103,9 @@ void DxbcShaderTranslator::WriteResourceDefinitions() { uint32_t chunk_position_dwords = uint32_t(shader_object_.size()); uint32_t new_offset; + const Shader::ConstantRegisterMap& constant_register_map = + current_shader().constant_register_map(); + // *************************************************************************** // Header // *************************************************************************** @@ -2162,7 +2191,7 @@ void DxbcShaderTranslator::WriteResourceDefinitions() { // Declaring a 0-sized array may not be safe, so write something valid // even if they aren't used. shader_object_.push_back( - std::max(constant_register_map().float_count, uint32_t(1))); + std::max(constant_register_map.float_count, uint32_t(1))); break; case RdefTypeIndex::kUint4DescriptorIndexArray: shader_object_.push_back(std::max( @@ -2278,10 +2307,10 @@ void DxbcShaderTranslator::WriteResourceDefinitions() { // Float constants. uint32_t constant_offset_float = new_offset; if (cbuffer_index_float_constants_ != kBindingIndexUnallocated) { - assert_not_zero(constant_register_map().float_count); + assert_not_zero(constant_register_map.float_count); shader_object_.push_back(constant_name_offset_float); shader_object_.push_back(0); - shader_object_.push_back(constant_register_map().float_count * 4 * + shader_object_.push_back(constant_register_map.float_count * 4 * sizeof(float)); shader_object_.push_back(kDxbcRdefVariableFlagUsed); shader_object_.push_back(types_offset + @@ -2405,11 +2434,11 @@ void DxbcShaderTranslator::WriteResourceDefinitions() { // No D3D_SHADER_CBUFFER_FLAGS. 
shader_object_.push_back(0); } else if (i == cbuffer_index_float_constants_) { - assert_not_zero(constant_register_map().float_count); + assert_not_zero(constant_register_map.float_count); shader_object_.push_back(cbuffer_name_offset_float); shader_object_.push_back(1); shader_object_.push_back(constant_offset_float); - shader_object_.push_back(constant_register_map().float_count * 4 * + shader_object_.push_back(constant_register_map.float_count * 4 * sizeof(float)); shader_object_.push_back(uint32_t(DxbcRdefCbufferType::kCbuffer)); shader_object_.push_back(0); @@ -3211,7 +3240,7 @@ void DxbcShaderTranslator::WriteOutputSignature() { if (!edram_rov_used_) { // Color render targets (SV_Target#). size_t target_position = SIZE_MAX; - if (writes_any_color_target()) { + if (current_shader().writes_color_targets()) { target_position = shader_object_.size(); shader_object_.resize(shader_object_.size() + 4 * kParameterDwords); parameter_count += 4; @@ -3233,7 +3262,7 @@ void DxbcShaderTranslator::WriteOutputSignature() { Modification::DepthStencilMode depth_stencil_mode = GetDxbcShaderModification().depth_stencil_mode; size_t depth_position = SIZE_MAX; - if (writes_depth() || DSV_IsWritingFloat24Depth()) { + if (current_shader().writes_depth() || DSV_IsWritingFloat24Depth()) { depth_position = shader_object_.size(); shader_object_.resize(shader_object_.size() + kParameterDwords); ++parameter_count; @@ -3268,7 +3297,7 @@ void DxbcShaderTranslator::WriteOutputSignature() { depth.semantic_name = semantic_offset; } const char* depth_semantic_name; - if (!writes_depth() && + if (!current_shader().writes_depth() && GetDxbcShaderModification().depth_stencil_mode == Modification::DepthStencilMode::kFloat24Truncating) { depth_semantic_name = "SV_DepthLessEqual"; @@ -3361,7 +3390,7 @@ void DxbcShaderTranslator::WriteShaderCode() { if (is_pixel_shader() && GetDxbcShaderModification().depth_stencil_mode == Modification::DepthStencilMode::kEarlyHint && - !edram_rov_used_ && 
CanWriteZEarly()) { + !edram_rov_used_ && current_shader().implicit_early_z_write_allowed()) { global_flags_opcode |= D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL; } shader_object_.push_back(global_flags_opcode); @@ -3369,11 +3398,13 @@ void DxbcShaderTranslator::WriteShaderCode() { // Constant buffers, from most frequenly accessed to least frequently accessed // (the order is a hint to the driver according to the DXBC header). if (cbuffer_index_float_constants_ != kBindingIndexUnallocated) { - assert_not_zero(constant_register_map().float_count); + const Shader::ConstantRegisterMap& constant_register_map = + current_shader().constant_register_map(); + assert_not_zero(constant_register_map.float_count); shader_object_.push_back( ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_CONSTANT_BUFFER) | ENCODE_D3D10_SB_D3D10_SB_CONSTANT_BUFFER_ACCESS_PATTERN( - constant_register_map().float_dynamic_addressing + constant_register_map.float_dynamic_addressing ? D3D10_SB_CONSTANT_BUFFER_DYNAMIC_INDEXED : D3D10_SB_CONSTANT_BUFFER_IMMEDIATE_INDEXED) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); @@ -3382,7 +3413,7 @@ void DxbcShaderTranslator::WriteShaderCode() { shader_object_.push_back(cbuffer_index_float_constants_); shader_object_.push_back(uint32_t(CbufferRegister::kFloatConstants)); shader_object_.push_back(uint32_t(CbufferRegister::kFloatConstants)); - shader_object_.push_back(constant_register_map().float_count); + shader_object_.push_back(constant_register_map.float_count); shader_object_.push_back(0); } if (cbuffer_index_system_constants_ != kBindingIndexUnallocated) { @@ -3715,6 +3746,7 @@ void DxbcShaderTranslator::WriteShaderCode() { ++stat_.dcl_count; } else if (is_pixel_shader()) { bool is_writing_float24_depth = DSV_IsWritingFloat24Depth(); + bool shader_writes_depth = current_shader().writes_depth(); // Interpolator input. 
if (!is_depth_only_pixel_shader_) { uint32_t interpolator_count = @@ -3766,7 +3798,7 @@ void DxbcShaderTranslator::WriteShaderCode() { // applicable here) position is mandatory. However, with depth output, on // the guest, there's only one depth value for the whole pixel. D3D10_SB_INTERPOLATION_MODE position_interpolation_mode = - is_writing_float24_depth && !writes_depth() + is_writing_float24_depth && !shader_writes_depth ? D3D10_SB_INTERPOLATION_LINEAR_NOPERSPECTIVE_SAMPLE : D3D10_SB_INTERPOLATION_LINEAR_NOPERSPECTIVE; shader_object_.push_back( @@ -3806,7 +3838,7 @@ void DxbcShaderTranslator::WriteShaderCode() { EncodeScalarOperand(D3D11_SB_OPERAND_TYPE_INPUT_COVERAGE_MASK, 0)); ++stat_.dcl_count; } else { - if (writes_any_color_target()) { + if (current_shader().writes_color_targets()) { // Color output. for (uint32_t i = 0; i < 4; ++i) { shader_object_.push_back( @@ -3819,9 +3851,9 @@ void DxbcShaderTranslator::WriteShaderCode() { } } // Depth output. - if (is_writing_float24_depth || writes_depth()) { + if (is_writing_float24_depth || shader_writes_depth) { D3D10_SB_OPERAND_TYPE depth_operand_type; - if (!writes_depth() && + if (!shader_writes_depth && GetDxbcShaderModification().depth_stencil_mode == Modification::DepthStencilMode::kFloat24Truncating) { depth_operand_type = D3D11_SB_OPERAND_TYPE_OUTPUT_DEPTH_LESS_EQUAL; @@ -3840,7 +3872,8 @@ void DxbcShaderTranslator::WriteShaderCode() { // Temporary registers - guest general-purpose registers if not using dynamic // indexing and Xenia internal registers. stat_.temp_register_count = system_temp_count_max_; - if (!is_depth_only_pixel_shader_ && !uses_register_dynamic_addressing()) { + if (!is_depth_only_pixel_shader_ && + !current_shader().uses_register_dynamic_addressing()) { stat_.temp_register_count += register_count(); } if (stat_.temp_register_count != 0) { @@ -3851,7 +3884,8 @@ void DxbcShaderTranslator::WriteShaderCode() { } // General-purpose registers if using dynamic indexing (x0). 
- if (!is_depth_only_pixel_shader_ && uses_register_dynamic_addressing()) { + if (!is_depth_only_pixel_shader_ && + current_shader().uses_register_dynamic_addressing()) { assert_true(register_count() != 0); shader_object_.push_back( ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_INDEXABLE_TEMP) | diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 1e9891771..808b311fa 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -106,13 +106,12 @@ class DxbcShaderTranslator : public ShaderTranslator { // If anything in this is structure is changed in a way not compatible with // the previous layout, invalidate the pipeline storages by increasing this // version number (0xYYYYMMDD)! - static constexpr uint32_t kVersion = 0x20201203; + static constexpr uint32_t kVersion = 0x20201219; enum class DepthStencilMode : uint32_t { kNoModifiers, // [earlydepthstencil] - enable if alpha test and alpha to coverage are - // disabled; ignored if anything in the shader blocks early Z writing - // (which is not known before translation, so this will be set anyway). + // disabled; ignored if anything in the shader blocks early Z writing. kEarlyHint, // Converting the depth to the closest 32-bit float representable exactly // as a 20e4 float, to support invariance in cases when the guest @@ -136,15 +135,17 @@ class DxbcShaderTranslator : public ShaderTranslator { }; struct { + // Both - dynamically indexable register count from SQ_PROGRAM_CNTL. + uint32_t dynamic_addressable_register_count : 8; // VS - pipeline stage and input configuration. Shader::HostVertexShaderType host_vertex_shader_type : Shader::kHostVertexShaderTypeBitCount; // PS, non-ROV - depth / stencil output mode. 
DepthStencilMode depth_stencil_mode : 2; }; - uint32_t value = 0; + uint64_t value = 0; - Modification(uint32_t modification_value = 0) : value(modification_value) {} + Modification(uint64_t modification_value = 0) : value(modification_value) {} }; // Constant buffer bindings in space 0. @@ -467,8 +468,9 @@ class DxbcShaderTranslator : public ShaderTranslator { float& clamp_alpha_high, uint32_t& keep_mask_low, uint32_t& keep_mask_high); - uint32_t GetDefaultModification( + uint64_t GetDefaultModification( xenos::ShaderType shader_type, + uint32_t dynamic_addressable_register_count, Shader::HostVertexShaderType host_vertex_shader_type = Shader::HostVertexShaderType::kVertex) const override; @@ -477,12 +479,13 @@ class DxbcShaderTranslator : public ShaderTranslator { std::vector<uint8_t> CreateDepthOnlyPixelShader(); protected: - void Reset(xenos::ShaderType shader_type) override; + void Reset() override; + + uint32_t GetModificationRegisterCount() const override; void StartTranslation() override; std::vector<uint8_t> CompleteTranslation() override; - void PostTranslation(Shader::Translation& translation, - bool setup_shader_post_translation_info) override; + void PostTranslation() override; void ProcessLabel(uint32_t cf_index) override; @@ -2184,7 +2187,7 @@ class DxbcShaderTranslator : public ShaderTranslator { } Modification GetDxbcShaderModification() const { - return Modification(modification()); + return Modification(current_translation().modification()); } bool IsDxbcVertexShader() const { @@ -2227,9 +2230,9 @@ class DxbcShaderTranslator : public ShaderTranslator { bool IsDepthStencilSystemTempUsed() const { // See system_temp_depth_stencil_ documentation for explanation of cases. 
if (edram_rov_used_) { - return writes_depth() || ROV_IsDepthStencilEarly(); + return current_shader().writes_depth() || ROV_IsDepthStencilEarly(); } - return writes_depth() && DSV_IsWritingFloat24Depth(); + return current_shader().writes_depth() && DSV_IsWritingFloat24Depth(); } // Whether the current non-ROV pixel shader should convert the depth to 20e4. bool DSV_IsWritingFloat24Depth() const { @@ -2246,8 +2249,8 @@ class DxbcShaderTranslator : public ShaderTranslator { // Whether it's possible and worth skipping running the translated shader for // 2x2 quads. bool ROV_IsDepthStencilEarly() const { - return !is_depth_only_pixel_shader_ && !writes_depth() && - memexport_stream_constants().empty(); + return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() && + current_shader().memexport_stream_constants().empty(); } // Converts the depth value to 24-bit (storing the result in bits 0:23 and // zeros in 24:31, not creating room for stencil - since this may be involved @@ -2467,7 +2470,7 @@ class DxbcShaderTranslator : public ShaderTranslator { // Is currently writing the empty depth-only pixel shader, for // CompleteTranslation. - bool is_depth_only_pixel_shader_; + bool is_depth_only_pixel_shader_ = false; // Data types used in constants buffers. Listed in dependency order. enum class RdefTypeIndex { @@ -2604,9 +2607,9 @@ class DxbcShaderTranslator : public ShaderTranslator { // 4 `alloc export`s per component. uint32_t system_temp_memexport_written_; // eA in each `alloc export`, or UINT32_MAX if not used. - uint32_t system_temps_memexport_address_[kMaxMemExports]; + uint32_t system_temps_memexport_address_[Shader::kMaxMemExports]; // eM# in each `alloc export`, or UINT32_MAX if not used. - uint32_t system_temps_memexport_data_[kMaxMemExports][5]; + uint32_t system_temps_memexport_data_[Shader::kMaxMemExports][5]; // Vector ALU or fetch result/scratch (since Xenos write masks can contain // swizzles). 
diff --git a/src/xenia/gpu/dxbc_shader_translator_memexport.cc b/src/xenia/gpu/dxbc_shader_translator_memexport.cc index 5f3d47bc0..76bec3e60 100644 --- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc +++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc @@ -136,7 +136,7 @@ void DxbcShaderTranslator::ExportToMemory() { DxbcOpIf(true, DxbcSrc::R(control_temp, DxbcSrc::kXXXX)); // control_temp.x is now free. - for (uint32_t i = 0; i < kMaxMemExports; ++i) { + for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) { uint32_t eA_temp = system_temps_memexport_address_[i]; if (eA_temp == UINT32_MAX) { // Export not used. diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc index ea79b737c..8c01648f1 100644 --- a/src/xenia/gpu/dxbc_shader_translator_om.cc +++ b/src/xenia/gpu/dxbc_shader_translator_om.cc @@ -144,7 +144,7 @@ void DxbcShaderTranslator::ROV_GetColorFormatSystemConstants( } void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { - bool color_targets_written = writes_any_color_target(); + bool any_color_targets_written = current_shader().writes_color_targets() != 0; // *************************************************************************** // Get EDRAM offsets for the pixel: @@ -272,7 +272,7 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { DxbcOpIAdd(DxbcDest::R(system_temp_rov_params_, 0b0001), DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kZZZZ), DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kXXXX)); - if (color_targets_written) { + if (any_color_targets_written) { // Write 32bpp color offset to system_temp_rov_params_.z. // system_temp_rov_params_.x = X sample 0 position within the depth tile // system_temp_rov_params_.y = row offset @@ -303,8 +303,8 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { // Release resolution_scale_log2_temp. PopSystemTemp(); { - DxbcDest offsets_dest(DxbcDest::R(system_temp_rov_params_, - color_targets_written ? 
0b0110 : 0b0010)); + DxbcDest offsets_dest(DxbcDest::R( + system_temp_rov_params_, any_color_targets_written ? 0b0110 : 0b0010)); // Scale the offsets by the resolution scale. // system_temp_rov_params_.y = scaled 32bpp depth/stencil first host pixel // address @@ -329,7 +329,7 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { // Close the resolution scale conditional. DxbcOpEndIf(); - if (color_targets_written) { + if (any_color_targets_written) { // Get the 64bpp color offset to system_temp_rov_params_.w. // TODO(Triang3l): Find some game that aliases 64bpp with 32bpp to emulate // the real layout. @@ -388,8 +388,6 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { } void DxbcShaderTranslator::ROV_DepthStencilTest() { - bool depth_stencil_early = ROV_IsDepthStencilEarly(); - uint32_t temp = PushSystemTemp(); DxbcDest temp_x_dest(DxbcDest::R(temp, 0b0001)); DxbcSrc temp_x_src(DxbcSrc::R(temp, DxbcSrc::kXXXX)); @@ -413,6 +411,9 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { // temp.x = free DxbcOpIf(true, temp_x_src); + bool depth_stencil_early = ROV_IsDepthStencilEarly(); + bool shader_writes_depth = current_shader().writes_depth(); + for (uint32_t i = 0; i < 4; ++i) { // With early depth/stencil, depth/stencil writing may be deferred to the // end of the shader to prevent writing in case something (like alpha test, @@ -427,7 +428,7 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { : temp_x_src); if (!i) { - if (writes_depth()) { + if (shader_writes_depth) { // Clamp oDepth to the lower viewport depth bound (depth clamp happens // after the pixel shader in the pipeline, at least on Direct3D 11 and // Vulkan, thus applies to the shader's depth output too). @@ -569,7 +570,7 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { // temp.w = free DxbcOpIf(true, temp_w_src); - if (writes_depth()) { + if (shader_writes_depth) { // Copy the 24-bit depth common to all samples to sample_depth_stencil. 
// temp.x = shader-generated 24-bit depth DxbcOpMov(sample_depth_stencil_dest, @@ -1024,7 +1025,8 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { // temp.z = viewport maximum depth if not writing to oDepth // temp.w = whether depth/stencil has been modified DxbcOpINE(temp_w_dest, sample_depth_stencil_src, temp_w_src); - if (depth_stencil_early && !CanWriteZEarly()) { + if (depth_stencil_early && + !current_shader().implicit_early_z_write_allowed()) { // Set the sample bit in bits 4:7 of system_temp_rov_params_.x - always // need to write late in this shader, as it may do something like // explicitly killing pixels. @@ -1734,7 +1736,7 @@ void DxbcShaderTranslator::ROV_HandleAlphaBlendFactorCases( void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs_AlphaToMask() { // Check if alpha to coverage can be done at all in this shader. - if (!writes_color_target(0)) { + if (!current_shader().writes_color_target(0)) { return; } @@ -1863,21 +1865,22 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs_AlphaToMask() { } void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() { - if (!writes_any_color_target()) { + uint32_t shader_writes_color_targets = + current_shader().writes_color_targets(); + if (!shader_writes_color_targets) { return; } // Check if this sample needs to be discarded by alpha to coverage. CompletePixelShader_WriteToRTVs_AlphaToMask(); - // Get the write mask as components, and also apply the exponent bias after - // alpha to coverage because it needs the unbiased alpha from the shader. 
- uint32_t guest_rt_mask = 0; + uint32_t gamma_temp = PushSystemTemp(); for (uint32_t i = 0; i < 4; ++i) { - if (!writes_color_target(i)) { + if (!(shader_writes_color_targets & (1 << i))) { continue; } - guest_rt_mask |= 1 << i; + // Apply the exponent bias after alpha to coverage because it needs the + // unbiased alpha from the shader system_constants_used_ |= 1ull << kSysConst_ColorExpBias_Index; DxbcOpMul(DxbcDest::R(system_temps_color_[i]), DxbcSrc::R(system_temps_color_[i]), @@ -1885,16 +1888,9 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() { uint32_t(CbufferRegister::kSystemConstants), kSysConst_ColorExpBias_Vec) .Select(i)); - } - - // Convert to gamma space - this is incorrect, since it must be done after - // blending on the Xbox 360, but this is just one of many blending issues in - // the RTV path. - uint32_t gamma_temp = PushSystemTemp(); - for (uint32_t i = 0; i < 4; ++i) { - if (!(guest_rt_mask & (1 << i))) { - continue; - } + // Convert to gamma space - this is incorrect, since it must be done after + // blending on the Xbox 360, but this is just one of many blending issues in + // the RTV path. system_constants_used_ |= 1ull << kSysConst_Flags_Index; DxbcOpAnd(DxbcDest::R(gamma_temp, 0b0001), DxbcSrc::CB(cbuffer_index_system_constants_, @@ -1923,7 +1919,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() { // Host RT i, guest RT j. for (uint32_t i = 0; i < 4; ++i) { // mask = map.iiii == (0, 1, 2, 3) - DxbcOpIEq(DxbcDest::R(remap_movc_mask_temp, guest_rt_mask), + DxbcOpIEq(DxbcDest::R(remap_movc_mask_temp, shader_writes_color_targets), DxbcSrc::CB(cbuffer_index_system_constants_, uint32_t(CbufferRegister::kSystemConstants), kSysConst_ColorOutputMap_Vec) @@ -1932,7 +1928,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() { bool guest_rt_first = true; for (uint32_t j = 0; j < 4; ++j) { // If map.i == j, move guest color j to the temporary host color. 
- if (!(guest_rt_mask & (1 << j))) { + if (!(shader_writes_color_targets & (1 << j))) { continue; } DxbcOpMovC(DxbcDest::R(remap_movc_target_temp), @@ -1954,8 +1950,10 @@ void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() { return; } + bool shader_writes_depth = current_shader().writes_depth(); + uint32_t temp; - if (writes_depth()) { + if (shader_writes_depth) { // The depth is already written to system_temp_depth_stencil_.x and clamped // to 0...1 with NaNs dropped (saturating in StoreResult); yzw are free. temp = system_temp_depth_stencil_; @@ -1991,8 +1989,8 @@ void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() { // The smallest denormalized 20e4 number is -34 - should drop 23 mantissa // bits at -34. // Anything smaller than 2^-34 becomes 0. - DxbcDest truncate_dest(writes_depth() ? DxbcDest::ODepth() - : DxbcDest::ODepthLE()); + DxbcDest truncate_dest(shader_writes_depth ? DxbcDest::ODepth() + : DxbcDest::ODepthLE()); // Check if the number is representable as a float24 after truncation - the // exponent is at least -34. DxbcOpUGE(temp_y_dest, temp_x_src, DxbcSrc::LU(0x2E800000)); @@ -2076,7 +2074,7 @@ void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() { temp_y_src); } - if (!writes_depth()) { + if (!shader_writes_depth) { // Release temp. PopSystemTemp(); } @@ -2106,7 +2104,7 @@ void DxbcShaderTranslator::CompletePixelShader_ROV_AlphaToMaskSample( void DxbcShaderTranslator::CompletePixelShader_ROV_AlphaToMask() { // Check if alpha to coverage can be done at all in this shader. - if (!writes_color_target(0)) { + if (!current_shader().writes_color_target(0)) { return; } @@ -2269,8 +2267,10 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { } // Write color values. 
+ uint32_t shader_writes_color_targets = + current_shader().writes_color_targets(); for (uint32_t i = 0; i < 4; ++i) { - if (!writes_color_target(i)) { + if (!(shader_writes_color_targets & (1 << i))) { continue; } @@ -3156,7 +3156,7 @@ void DxbcShaderTranslator::CompletePixelShader() { return; } - if (writes_color_target(0)) { + if (current_shader().writes_color_target(0)) { // Alpha test. // X - mask, then masked result (SGPR for loading, VGPR for masking). // Y - operation result (SGPR for mask operations, VGPR for alpha diff --git a/src/xenia/gpu/registers.h b/src/xenia/gpu/registers.h index 07986b169..deaecaf39 100644 --- a/src/xenia/gpu/registers.h +++ b/src/xenia/gpu/registers.h @@ -97,6 +97,7 @@ union SQ_PROGRAM_CNTL { // Note from a2xx.xml: // Only 0x3F worth of valid register values for VS_NUM_REG and PS_NUM_REG, // but high bit is set to indicate "0 registers used". + // (Register count = (num_reg & 0x80) ? 0 : (num_reg + 1)) uint32_t vs_num_reg : 8; // +0 uint32_t ps_num_reg : 8; // +8 uint32_t vs_resource : 1; // +16 diff --git a/src/xenia/gpu/shader.cc b/src/xenia/gpu/shader.cc index 6df03fb81..78451035d 100644 --- a/src/xenia/gpu/shader.cc +++ b/src/xenia/gpu/shader.cc @@ -55,7 +55,7 @@ std::filesystem::path Shader::Translation::Dump( } path = path / fmt::format( - "shader_{:016X}_{:08X}.{}.{}", shader().ucode_data_hash(), + "shader_{:016X}_{:016X}.{}.{}", shader().ucode_data_hash(), modification(), path_prefix, shader().type() == xenos::ShaderType::kVertex ? 
"vert" : "frag"); FILE* f = filesystem::OpenFile(path, "wb"); @@ -78,7 +78,7 @@ std::filesystem::path Shader::Translation::Dump( return std::move(path); } -Shader::Translation* Shader::GetOrCreateTranslation(uint32_t modification, +Shader::Translation* Shader::GetOrCreateTranslation(uint64_t modification, bool* is_new) { auto it = translations_.find(modification); if (it != translations_.end()) { @@ -95,7 +95,7 @@ Shader::Translation* Shader::GetOrCreateTranslation(uint32_t modification, return translation; } -void Shader::DestroyTranslation(uint32_t modification) { +void Shader::DestroyTranslation(uint64_t modification) { auto it = translations_.find(modification); if (it == translations_.end()) { return; @@ -124,7 +124,7 @@ std::filesystem::path Shader::DumpUcodeBinary( return std::move(path); } -Shader::Translation* Shader::CreateTranslationInstance(uint32_t modification) { +Shader::Translation* Shader::CreateTranslationInstance(uint64_t modification) { // Default implementation for simple cases like ucode disassembly. 
return new Translation(*this, modification); } diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index e533ba9b8..9f849ee8b 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -11,9 +11,9 @@ #define XENIA_GPU_SHADER_H_ #include <algorithm> -#include <atomic> #include <cstdint> #include <filesystem> +#include <set> #include <string> #include <unordered_map> #include <utility> @@ -593,6 +593,41 @@ struct ParsedAluInstruction { void Disassemble(StringBuffer* out) const; }; +void ParseControlFlowExec(const ucode::ControlFlowExecInstruction& cf, + uint32_t cf_index, ParsedExecInstruction& instr); +void ParseControlFlowCondExec(const ucode::ControlFlowCondExecInstruction& cf, + uint32_t cf_index, ParsedExecInstruction& instr); +void ParseControlFlowCondExecPred( + const ucode::ControlFlowCondExecPredInstruction& cf, uint32_t cf_index, + ParsedExecInstruction& instr); +void ParseControlFlowLoopStart(const ucode::ControlFlowLoopStartInstruction& cf, + uint32_t cf_index, + ParsedLoopStartInstruction& instr); +void ParseControlFlowLoopEnd(const ucode::ControlFlowLoopEndInstruction& cf, + uint32_t cf_index, + ParsedLoopEndInstruction& instr); +void ParseControlFlowCondCall(const ucode::ControlFlowCondCallInstruction& cf, + uint32_t cf_index, ParsedCallInstruction& instr); +void ParseControlFlowReturn(const ucode::ControlFlowReturnInstruction& cf, + uint32_t cf_index, ParsedReturnInstruction& instr); +void ParseControlFlowCondJmp(const ucode::ControlFlowCondJmpInstruction& cf, + uint32_t cf_index, ParsedJumpInstruction& instr); +void ParseControlFlowAlloc(const ucode::ControlFlowAllocInstruction& cf, + uint32_t cf_index, bool is_vertex_shader, + ParsedAllocInstruction& instr); + +// Returns whether the fetch is a full one, and the next parsed mini vertex +// fetch should inherit most of its parameters. 
+bool ParseVertexFetchInstruction( + const ucode::VertexFetchInstruction& op, + const ucode::VertexFetchInstruction& previous_full_op, + ParsedVertexFetchInstruction& instr); +void ParseTextureFetchInstruction(const ucode::TextureFetchInstruction& op, + ParsedTextureFetchInstruction& instr); +void ParseAluInstruction(const ucode::AluInstruction& op, + xenos::ShaderType shader_type, + ParsedAluInstruction& instr); + class Shader { public: // Type of the vertex shader in a D3D11-like rendering pipeline - shader @@ -619,12 +654,8 @@ class Shader { struct VertexBinding { struct Attribute { - // Attribute index, 0-based in the entire shader. - int attrib_index; // Fetch instruction with all parameters. ParsedVertexFetchInstruction fetch_instr; - // Size of the attribute, in words. - uint32_t size_words; }; // Index within the vertex binding listing. @@ -691,6 +722,10 @@ class Shader { } }; + // Based on the number of AS_VS/PS_EXPORT_STREAM_* enum sets found in a game + // .pdb. + static constexpr uint32_t kMaxMemExports = 16; + class Translation { public: virtual ~Translation() {} @@ -698,7 +733,7 @@ class Shader { Shader& shader() const { return shader_; } // Translator-specific modification bits. - uint32_t modification() const { return modification_; } + uint64_t modification() const { return modification_; } // True if the shader was translated and prepared without error. 
bool is_valid() const { return is_valid_; } @@ -735,7 +770,7 @@ class Shader { const char* path_prefix); protected: - Translation(Shader& shader, uint32_t modification) + Translation(Shader& shader, uint64_t modification) : shader_(shader), modification_(modification) {} private: @@ -743,7 +778,7 @@ class Shader { friend class ShaderTranslator; Shader& shader_; - uint32_t modification_; + uint64_t modification_; bool is_valid_ = false; bool is_translated_ = false; @@ -765,32 +800,23 @@ class Shader { const uint32_t* ucode_dwords() const { return ucode_data_.data(); } size_t ucode_dword_count() const { return ucode_data_.size(); } - // Host translations with the specified modification bits. Not thread-safe - // with respect to translation creation/destruction. - const std::unordered_map<uint32_t, Translation*>& translations() const { - return translations_; - } - Translation* GetTranslation(uint32_t modification) const { - auto it = translations_.find(modification); - if (it != translations_.cend()) { - return it->second; - } - return nullptr; - } - Translation* GetOrCreateTranslation(uint32_t modification, - bool* is_new = nullptr); - // For shader storage loading, to remove a modification in case of translation - // failure. Not thread-safe. - void DestroyTranslation(uint32_t modification); + bool is_ucode_analyzed() const { return is_ucode_analyzed_; } + // ucode_disasm_buffer is temporary storage for disassembly (provided + // externally so it won't need to be reallocated for every shader). + void AnalyzeUcode(StringBuffer& ucode_disasm_buffer); + + // The following parameters, until the translation, are valid if ucode + // information has been gathered. + + // Microcode disassembly in D3D format. + const std::string& ucode_disassembly() const { return ucode_disassembly_; } // All vertex bindings used in the shader. - // Valid for vertex shaders only. 
const std::vector<VertexBinding>& vertex_bindings() const { return vertex_bindings_; } // All texture bindings used in the shader. - // Valid for both vertex and pixel shaders. const std::vector<TextureBinding>& texture_bindings() const { return texture_bindings_; } @@ -800,24 +826,99 @@ class Shader { return constant_register_map_; } + // uint5[Shader::kMaxMemExports] - bits indicating which eM# registers have + // been written to after each `alloc export`, for up to Shader::kMaxMemExports + // exports. This will contain zero for certain corrupt exports - for those to + // which a valid eA was not written via a MAD with a stream constant. + const uint8_t* memexport_eM_written() const { return memexport_eM_written_; } + // All c# registers used as the addend in MAD operations to eA. - const std::vector<uint32_t>& memexport_stream_constants() const { + const std::set<uint32_t>& memexport_stream_constants() const { return memexport_stream_constants_; } - // Returns true if the given color target index [0-3]. - bool writes_color_target(uint32_t i) const { - return writes_color_targets_[i]; + // Labels that jumps (explicit or from loops) can be done to. + const std::set<uint32_t>& label_addresses() const { return label_addresses_; } + + // Exclusive upper bound of the indexes of paired control flow instructions + // (each corresponds to 3 dwords). + uint32_t cf_pair_index_bound() const { return cf_pair_index_bound_; } + + // Upper bound of temporary registers addressed statically by the shader - + // highest static register address + 1, or 0 if no registers referenced this + // way. 
SQ_PROGRAM_CNTL is not always reliable - some draws (like single point + // draws with oPos = 0001 that are done by Xbox 360's Direct3D 9 sometimes; + // can be reproduced by launching Arrival in Halo 3 from the campaign lobby) + // that aren't supposed to cover any pixels use an invalid (zero) + // SQ_PROGRAM_CNTL, but with an outdated pixel shader loaded, in this case + // SQ_PROGRAM_CNTL may contain a number smaller than actually needed by the + // pixel shader - SQ_PROGRAM_CNTL should be used to go above this count if + // uses_register_dynamic_addressing is true. + uint32_t register_static_address_bound() const { + return register_static_address_bound_; } - // True if the shader overrides the pixel depth. - bool writes_depth() const { return writes_depth_; } + // Whether the shader addresses temporary registers dynamically, thus + // SQ_PROGRAM_CNTL should determine the number of registers to use, not only + // register_static_address_bound. + bool uses_register_dynamic_addressing() const { + return uses_register_dynamic_addressing_; + } + + // For building shader modification bits (and also for normalization of them), + // returns the amount of temporary registers that need to be allocated + // explicitly - if not using register dynamic addressing, the shader + // translator will use register_static_address_bound directly. + uint32_t GetDynamicAddressableRegisterCount( + uint32_t program_cntl_num_reg) const { + if (!uses_register_dynamic_addressing()) { + return 0; + } + return std::max((program_cntl_num_reg & 0x80) + ? uint32_t(0) + : (program_cntl_num_reg + uint32_t(1)), + register_static_address_bound()); + } // True if the current shader has any `kill` instructions. bool kills_pixels() const { return kills_pixels_; } - // Microcode disassembly in D3D format. - const std::string& ucode_disassembly() const { return ucode_disassembly_; } + // True if the shader overrides the pixel depth. 
+ bool writes_depth() const { return writes_depth_; } + + // Whether the shader can have early depth and stencil writing enabled, unless + // alpha test or alpha to coverage is enabled. + bool implicit_early_z_write_allowed() const { + // TODO(Triang3l): Investigate what happens to memexport when the pixel + // fails the depth/stencil test, but in Direct3D 11 UAV writes disable early + // depth/stencil. + return !writes_depth() && !kills_pixels() && + memexport_stream_constants().empty(); + } + + // Whether each color render target is written to on any execution path. + uint32_t writes_color_targets() const { return writes_color_targets_; } + bool writes_color_target(uint32_t i) const { + return (writes_color_targets() & (uint32_t(1) << i)) != 0; + } + + // Host translations with the specified modification bits. Not thread-safe + // with respect to translation creation/destruction. + const std::unordered_map<uint64_t, Translation*>& translations() const { + return translations_; + } + Translation* GetTranslation(uint64_t modification) const { + auto it = translations_.find(modification); + if (it != translations_.cend()) { + return it->second; + } + return nullptr; + } + Translation* GetOrCreateTranslation(uint64_t modification, + bool* is_new = nullptr); + // For shader storage loading, to remove a modification in case of translation + // failure. Not thread-safe. + void DestroyTranslation(uint64_t modification); // An externally managed identifier of the shader storage the microcode of the // shader was last written to, or was loaded from, to only write the shader @@ -835,33 +936,68 @@ class Shader { protected: friend class ShaderTranslator; - virtual Translation* CreateTranslationInstance(uint32_t modification); + virtual Translation* CreateTranslationInstance(uint64_t modification); xenos::ShaderType shader_type_; std::vector<uint32_t> ucode_data_; uint64_t ucode_data_hash_; - // Modification bits -> translation. 
- std::unordered_map<uint32_t, Translation*> translations_; + // Whether info needed before translating has been gathered already - may be + // needed to determine which modifications are actually needed and make sense + // (for instance, there may be draws not covering anything and not allocating + // any pixel shader registers in SQ_PROGRAM_CNTL, but still using the pixel + // shader from the previous draw - in this case, every shader that happens to + // be before such draw will need to be translated again with a different + // dynamically addressed register count, which may cause compilation of + // different random pipelines across many random frames, thus causing + // stuttering - normally host pipeline states are deterministically only + // compiled when a new material appears in the game, and having the order of + // draws also matter in such unpredictable way would break this rule; limit + // the effect to shaders with dynamic register addressing only, which are + // extremely rare), also some info needed for drawing is collected during the + // ucode analysis. + bool is_ucode_analyzed_ = false; - // Whether setup of the post-translation parameters (listed below, plus those - // specific to the implementation) has been initiated, by any thread. If - // translation is performed on multiple threads, only one thread must be - // setting this up (other threads would write the same data anyway). - std::atomic_flag post_translation_info_set_up_ = ATOMIC_FLAG_INIT; - - // Initialized after the first successful translation (these don't depend on - // the host-side modification bits). 
std::string ucode_disassembly_; std::vector<VertexBinding> vertex_bindings_; std::vector<TextureBinding> texture_bindings_; ConstantRegisterMap constant_register_map_ = {0}; - bool writes_color_targets_[4] = {false, false, false, false}; - bool writes_depth_ = false; + uint8_t memexport_eM_written_[kMaxMemExports] = {}; + std::set<uint32_t> memexport_stream_constants_; + std::set<uint32_t> label_addresses_; + uint32_t cf_pair_index_bound_ = 0; + uint32_t register_static_address_bound_ = 0; + bool uses_register_dynamic_addressing_ = false; bool kills_pixels_ = false; - std::vector<uint32_t> memexport_stream_constants_; + bool writes_depth_ = false; + uint32_t writes_color_targets_ = 0b0000; + + // Modification bits -> translation. + std::unordered_map<uint64_t, Translation*> translations_; uint32_t ucode_storage_index_ = UINT32_MAX; + + private: + void GatherExecInformation( + const ParsedExecInstruction& instr, + ucode::VertexFetchInstruction& previous_vfetch_full, + uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count, + uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer); + void GatherVertexFetchInformation( + const ucode::VertexFetchInstruction& op, + ucode::VertexFetchInstruction& previous_vfetch_full, + StringBuffer& ucode_disasm_buffer); + void GatherTextureFetchInformation(const ucode::TextureFetchInstruction& op, + uint32_t& unique_texture_bindings, + StringBuffer& ucode_disasm_buffer); + void GatherAluInstructionInformation(const ucode::AluInstruction& op, + uint32_t memexport_alloc_current_count, + uint32_t& memexport_eA_written, + StringBuffer& ucode_disasm_buffer); + void GatherOperandInformation(const InstructionOperand& operand); + void GatherFetchResultInformation(const InstructionResult& result); + void GatherAluResultInformation(const InstructionResult& result, + uint32_t memexport_alloc_current_count); }; } // namespace gpu diff --git a/src/xenia/gpu/shader_compiler_main.cc 
b/src/xenia/gpu/shader_compiler_main.cc index a9a744955..4874928d3 100644 --- a/src/xenia/gpu/shader_compiler_main.cc +++ b/src/xenia/gpu/shader_compiler_main.cc @@ -17,6 +17,7 @@ #include "xenia/base/main.h" #include "xenia/base/platform.h" #include "xenia/base/string.h" +#include "xenia/base/string_buffer.h" #include "xenia/gpu/dxbc_shader_translator.h" #include "xenia/gpu/shader_translator.h" #include "xenia/gpu/spirv_shader_translator.h" @@ -104,6 +105,8 @@ int shader_compiler_main(const std::vector<std::string>& args) { auto shader = std::make_unique<Shader>( shader_type, ucode_data_hash, ucode_dwords.data(), ucode_dwords.size()); + shader->AnalyzeUcode(StringBuffer()); + std::unique_ptr<ShaderTranslator> translator; if (cvars::shader_output_type == "spirv" || cvars::shader_output_type == "spirvtext") { @@ -114,7 +117,15 @@ int shader_compiler_main(const std::vector<std::string>& args) { 0, cvars::shader_output_bindless_resources, cvars::shader_output_dxbc_rov); } else { - translator = std::make_unique<UcodeShaderTranslator>(); + // Just output microcode disassembly generated during microcode information + // gathering. 
+ if (!cvars::shader_output.empty()) { + auto output_file = filesystem::OpenFile(cvars::shader_output, "wb"); + fwrite(shader->ucode_disassembly().c_str(), 1, + shader->ucode_disassembly().length(), output_file); + fclose(output_file); + } + return 0; } Shader::HostVertexShaderType host_vertex_shader_type = @@ -140,12 +151,12 @@ int shader_compiler_main(const std::vector<std::string>& args) { Shader::HostVertexShaderType::kQuadDomainPatchIndexed; } } - uint32_t modification = - translator->GetDefaultModification(shader_type, host_vertex_shader_type); + uint64_t modification = translator->GetDefaultModification( + shader_type, 64, host_vertex_shader_type); Shader::Translation* translation = shader->GetOrCreateTranslation(modification); - translator->Translate(*translation); + translator->TranslateAnalyzedShader(*translation); const void* source_data = translation->translated_binary().data(); size_t source_data_size = translation->translated_binary().size(); diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index 6d79e82c2..80f122ba9 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -9,7 +9,9 @@ #include "xenia/gpu/shader_translator.h" +#include <algorithm> #include <cstdarg> +#include <cstring> #include <set> #include <string> @@ -42,91 +44,159 @@ using namespace ucode; // Lots of naming comes from the disassembly spit out by the XNA GS compiler // and dumps of d3dcompiler and games: https://pastebin.com/i4kAv7bB -ShaderTranslator::ShaderTranslator() = default; - -ShaderTranslator::~ShaderTranslator() = default; - -void ShaderTranslator::Reset(xenos::ShaderType shader_type) { - shader_type_ = shader_type; - modification_ = GetDefaultModification(shader_type); - errors_.clear(); - ucode_disasm_buffer_.Reset(); - ucode_disasm_line_number_ = 0; - previous_ucode_disasm_scan_offset_ = 0; - register_count_ = 64; - label_addresses_.clear(); - total_attrib_count_ = 0; - vertex_bindings_.clear(); - 
unique_vertex_bindings_ = 0; - texture_bindings_.clear(); - unique_texture_bindings_ = 0; - std::memset(&constant_register_map_, 0, sizeof(constant_register_map_)); - uses_register_dynamic_addressing_ = false; - for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) { - writes_color_targets_[i] = false; +void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { + if (is_ucode_analyzed_) { + return; } - writes_depth_ = false; - kills_pixels_ = false; - memexport_alloc_count_ = 0; - memexport_eA_written_ = 0; - std::memset(&memexport_eM_written_, 0, sizeof(memexport_eM_written_)); - memexport_stream_constants_.clear(); -} - -bool ShaderTranslator::Translate(Shader::Translation& translation, - reg::SQ_PROGRAM_CNTL cntl) { - xenos::ShaderType shader_type = translation.shader().type(); - Reset(shader_type); - uint32_t cntl_num_reg = shader_type == xenos::ShaderType::kVertex - ? cntl.vs_num_reg - : cntl.ps_num_reg; - register_count_ = (cntl_num_reg & 0x80) ? 0 : (cntl_num_reg + 1); - - return TranslateInternal(translation); -} - -bool ShaderTranslator::Translate(Shader::Translation& translation) { - Reset(translation.shader().type()); - return TranslateInternal(translation); -} - -bool ShaderTranslator::TranslateInternal(Shader::Translation& translation) { - Shader& shader = translation.shader(); - assert_true(shader_type_ == shader.type()); - shader_type_ = shader.type(); - ucode_dwords_ = shader.ucode_dwords(); - ucode_dword_count_ = shader.ucode_dword_count(); - modification_ = translation.modification(); // Control flow instructions come paired in blocks of 3 dwords and all are // listed at the top of the ucode. // Each control flow instruction is executed sequentially until the final // ending instruction. 
- uint32_t max_cf_dword_index = static_cast<uint32_t>(ucode_dword_count_); - std::vector<ControlFlowInstruction> cf_instructions; - for (uint32_t i = 0; i < max_cf_dword_index; i += 3) { - ControlFlowInstruction cf_a; - ControlFlowInstruction cf_b; - UnpackControlFlowInstructions(ucode_dwords_ + i, &cf_a, &cf_b); - // Guess how long the control flow program is by scanning for the first - // kExec-ish and instruction and using its address as the upper bound. - // This is what freedreno does. - if (IsControlFlowOpcodeExec(cf_a.opcode())) { - max_cf_dword_index = - std::min(max_cf_dword_index, cf_a.exec.address() * 3); + // Gather the upper bound of the control flow instructions, and label + // addresses, which are needed for disassembly. + cf_pair_index_bound_ = uint32_t(ucode_data_.size() / 3); + for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) { + ControlFlowInstruction cf_ab[2]; + UnpackControlFlowInstructions(ucode_data_.data() + i * 3, cf_ab); + for (uint32_t j = 0; j < 2; ++j) { + // Guess how long the control flow program is by scanning for the first + // kExec-ish and instruction and using its address as the upper bound. + // This is what freedreno does. 
+ const ControlFlowInstruction& cf = cf_ab[j]; + if (IsControlFlowOpcodeExec(cf.opcode())) { + cf_pair_index_bound_ = + std::min(cf_pair_index_bound_, cf.exec.address()); + } + switch (cf.opcode()) { + case ControlFlowOpcode::kCondCall: + label_addresses_.insert(cf.cond_call.address()); + break; + case ControlFlowOpcode::kCondJmp: + label_addresses_.insert(cf.cond_jmp.address()); + break; + case ControlFlowOpcode::kLoopStart: + label_addresses_.insert(cf.loop_start.address()); + break; + case ControlFlowOpcode::kLoopEnd: + label_addresses_.insert(cf.loop_end.address()); + break; + default: + break; + } } - if (IsControlFlowOpcodeExec(cf_b.opcode())) { - max_cf_dword_index = - std::min(max_cf_dword_index, cf_b.exec.address() * 3); - } - // Gather all labels, binding, operand addressing and export information. - // Translators may need this before they start codegen. - GatherInstructionInformation(cf_a); - GatherInstructionInformation(cf_b); - cf_instructions.push_back(cf_a); - cf_instructions.push_back(cf_b); } + // Disassemble and gather information. 
+ ucode_disasm_buffer.Reset(); + VertexFetchInstruction previous_vfetch_full; + std::memset(&previous_vfetch_full, 0, sizeof(previous_vfetch_full)); + uint32_t unique_texture_bindings = 0; + uint32_t memexport_alloc_count = 0; + uint32_t memexport_eA_written = 0; + for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) { + ControlFlowInstruction cf_ab[2]; + UnpackControlFlowInstructions(ucode_data_.data() + i * 3, cf_ab); + for (uint32_t j = 0; j < 2; ++j) { + uint32_t cf_index = i * 2 + j; + if (label_addresses_.find(cf_index) != label_addresses_.end()) { + ucode_disasm_buffer.AppendFormat(" label L{}\n", + cf_index); + } + ucode_disasm_buffer.AppendFormat("/* {:4d}.{} */ ", i, j); + + const ControlFlowInstruction& cf = cf_ab[j]; + uint32_t bool_constant_index = UINT32_MAX; + switch (cf.opcode()) { + case ControlFlowOpcode::kNop: + ucode_disasm_buffer.Append(" cnop\n"); + break; + case ControlFlowOpcode::kExec: + case ControlFlowOpcode::kExecEnd: { + ParsedExecInstruction instr; + ParseControlFlowExec(cf.exec, cf_index, instr); + GatherExecInformation(instr, previous_vfetch_full, + unique_texture_bindings, memexport_alloc_count, + memexport_eA_written, ucode_disasm_buffer); + } break; + case ControlFlowOpcode::kCondExec: + case ControlFlowOpcode::kCondExecEnd: + case ControlFlowOpcode::kCondExecPredClean: + case ControlFlowOpcode::kCondExecPredCleanEnd: { + bool_constant_index = cf.cond_exec.bool_address(); + ParsedExecInstruction instr; + ParseControlFlowCondExec(cf.cond_exec, cf_index, instr); + GatherExecInformation(instr, previous_vfetch_full, + unique_texture_bindings, memexport_alloc_count, + memexport_eA_written, ucode_disasm_buffer); + } break; + case ControlFlowOpcode::kCondExecPred: + case ControlFlowOpcode::kCondExecPredEnd: { + ParsedExecInstruction instr; + ParseControlFlowCondExecPred(cf.cond_exec_pred, cf_index, instr); + GatherExecInformation(instr, previous_vfetch_full, + unique_texture_bindings, memexport_alloc_count, + memexport_eA_written, 
ucode_disasm_buffer); + } break; + case ControlFlowOpcode::kLoopStart: { + ParsedLoopStartInstruction instr; + ParseControlFlowLoopStart(cf.loop_start, cf_index, instr); + instr.Disassemble(&ucode_disasm_buffer); + constant_register_map_.loop_bitmap |= uint32_t(1) + << instr.loop_constant_index; + } break; + case ControlFlowOpcode::kLoopEnd: { + ParsedLoopEndInstruction instr; + ParseControlFlowLoopEnd(cf.loop_end, cf_index, instr); + instr.Disassemble(&ucode_disasm_buffer); + constant_register_map_.loop_bitmap |= uint32_t(1) + << instr.loop_constant_index; + } break; + case ControlFlowOpcode::kCondCall: { + ParsedCallInstruction instr; + ParseControlFlowCondCall(cf.cond_call, cf_index, instr); + instr.Disassemble(&ucode_disasm_buffer); + if (instr.type == ParsedCallInstruction::Type::kConditional) { + bool_constant_index = instr.bool_constant_index; + } + } break; + case ControlFlowOpcode::kReturn: { + ParsedReturnInstruction instr; + ParseControlFlowReturn(cf.ret, cf_index, instr); + instr.Disassemble(&ucode_disasm_buffer); + } break; + case ControlFlowOpcode::kCondJmp: { + ParsedJumpInstruction instr; + ParseControlFlowCondJmp(cf.cond_jmp, cf_index, instr); + instr.Disassemble(&ucode_disasm_buffer); + if (instr.type == ParsedJumpInstruction::Type::kConditional) { + bool_constant_index = instr.bool_constant_index; + } + } break; + case ControlFlowOpcode::kAlloc: { + ParsedAllocInstruction instr; + ParseControlFlowAlloc(cf.alloc, cf_index, + type() == xenos::ShaderType::kVertex, instr); + instr.Disassemble(&ucode_disasm_buffer); + if (instr.type == AllocType::kMemory) { + ++memexport_alloc_count; + } + } break; + case ControlFlowOpcode::kMarkVsFetchDone: + break; + default: + assert_unhandled_case(cf.opcode); + break; + } + if (bool_constant_index != UINT32_MAX) { + constant_register_map_.bool_bitmap[bool_constant_index / 32] |= + uint32_t(1) << (bool_constant_index % 32); + } + // TODO(benvanik): break if (DoesControlFlowOpcodeEndShader(cf.opcode()))? 
+ } + } + ucode_disassembly_ = ucode_disasm_buffer.to_string(); + if (constant_register_map_.float_dynamic_addressing) { // All potentially can be referenced. constant_register_map_.float_count = 256; @@ -143,335 +213,75 @@ bool ShaderTranslator::TranslateInternal(Shader::Translation& translation) { // Cleanup invalid/unneeded memexport allocs. for (uint32_t i = 0; i < kMaxMemExports; ++i) { - if (!(memexport_eA_written_ & (uint32_t(1) << i))) { + if (!(memexport_eA_written & (uint32_t(1) << i))) { memexport_eM_written_[i] = 0; } else if (!memexport_eM_written_[i]) { - memexport_eA_written_ &= ~(uint32_t(1) << i); + memexport_eA_written &= ~(uint32_t(1) << i); } } - if (memexport_eA_written_ == 0) { + if (memexport_eA_written == 0) { memexport_stream_constants_.clear(); } - StartTranslation(); + is_ucode_analyzed_ = true; +} - PreProcessControlFlowInstructions(cf_instructions); - - // Translate all instructions. - for (uint32_t i = 0, cf_index = 0; i < max_cf_dword_index; i += 3) { - ControlFlowInstruction cf_a; - ControlFlowInstruction cf_b; - UnpackControlFlowInstructions(ucode_dwords_ + i, &cf_a, &cf_b); - - cf_index_ = cf_index; - MarkUcodeInstruction(i); - if (label_addresses_.find(cf_index) != label_addresses_.end()) { - AppendUcodeDisasmFormat(" label L%u\n", cf_index); - ProcessLabel(cf_index); +void Shader::GatherExecInformation( + const ParsedExecInstruction& instr, + ucode::VertexFetchInstruction& previous_vfetch_full, + uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count, + uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) { + instr.Disassemble(&ucode_disasm_buffer); + uint32_t sequence = instr.sequence; + for (uint32_t instr_offset = instr.instruction_address; + instr_offset < instr.instruction_address + instr.instruction_count; + ++instr_offset, sequence >>= 2) { + ucode_disasm_buffer.AppendFormat("/* {:4d} */ ", instr_offset); + if (sequence & 0b10) { + ucode_disasm_buffer.Append(" serialize\n "); } - 
AppendUcodeDisasmFormat("/* %4u.0 */ ", cf_index / 2); - ProcessControlFlowInstructionBegin(cf_index); - TranslateControlFlowInstruction(cf_a); - ProcessControlFlowInstructionEnd(cf_index); - ++cf_index; - - cf_index_ = cf_index; - MarkUcodeInstruction(i); - if (label_addresses_.find(cf_index) != label_addresses_.end()) { - AppendUcodeDisasmFormat(" label L%u\n", cf_index); - ProcessLabel(cf_index); - } - AppendUcodeDisasmFormat("/* %4u.1 */ ", cf_index / 2); - ProcessControlFlowInstructionBegin(cf_index); - TranslateControlFlowInstruction(cf_b); - ProcessControlFlowInstructionEnd(cf_index); - ++cf_index; - } - - translation.errors_ = std::move(errors_); - translation.translated_binary_ = CompleteTranslation(); - translation.is_translated_ = true; - - bool is_valid = true; - for (const auto& error : translation.errors_) { - if (error.is_fatal) { - is_valid = false; - break; - } - } - translation.is_valid_ = is_valid; - - // Setup info that doesn't depend on the modification only once. - bool setup_shader_post_translation_info = - is_valid && !shader.post_translation_info_set_up_.test_and_set(); - if (setup_shader_post_translation_info) { - shader.ucode_disassembly_ = ucode_disasm_buffer_.to_string(); - shader.vertex_bindings_ = std::move(vertex_bindings_); - shader.texture_bindings_ = std::move(texture_bindings_); - shader.constant_register_map_ = std::move(constant_register_map_); - for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) { - shader.writes_color_targets_[i] = writes_color_targets_[i]; - } - shader.writes_depth_ = writes_depth_; - shader.kills_pixels_ = kills_pixels_; - shader.memexport_stream_constants_.clear(); - shader.memexport_stream_constants_.reserve( - memexport_stream_constants_.size()); - shader.memexport_stream_constants_.insert( - shader.memexport_stream_constants_.cend(), - memexport_stream_constants_.cbegin(), - memexport_stream_constants_.cend()); - } - PostTranslation(translation, setup_shader_post_translation_info); - - // 
In case is_valid_ is modified by PostTranslation, reload. - return translation.is_valid_; -} - -void ShaderTranslator::MarkUcodeInstruction(uint32_t dword_offset) { - auto disasm = ucode_disasm_buffer_.buffer(); - size_t current_offset = ucode_disasm_buffer_.length(); - for (size_t i = previous_ucode_disasm_scan_offset_; i < current_offset; ++i) { - if (disasm[i] == '\n') { - ++ucode_disasm_line_number_; - } - } - previous_ucode_disasm_scan_offset_ = current_offset; -} - -void ShaderTranslator::AppendUcodeDisasm(char c) { - ucode_disasm_buffer_.Append(c); -} - -void ShaderTranslator::AppendUcodeDisasm(const char* value) { - ucode_disasm_buffer_.Append(value); -} - -void ShaderTranslator::AppendUcodeDisasmFormat(const char* format, ...) { - va_list va; - va_start(va, format); - ucode_disasm_buffer_.AppendVarargs(format, va); - va_end(va); -} - -void ShaderTranslator::EmitTranslationError(const char* message, - bool is_fatal) { - Shader::Error error; - error.is_fatal = is_fatal; - error.message = message; - // TODO(benvanik): location information. - errors_.push_back(std::move(error)); - XELOGE("Shader translation {}error: {}", is_fatal ? 
"fatal " : "", message); -} - -void ShaderTranslator::GatherInstructionInformation( - const ControlFlowInstruction& cf) { - uint32_t bool_constant_index = UINT32_MAX; - switch (cf.opcode()) { - case ControlFlowOpcode::kCondExec: - case ControlFlowOpcode::kCondExecEnd: - case ControlFlowOpcode::kCondExecPredClean: - case ControlFlowOpcode::kCondExecPredCleanEnd: - bool_constant_index = cf.cond_exec.bool_address(); - break; - case ControlFlowOpcode::kCondCall: - label_addresses_.insert(cf.cond_call.address()); - if (!cf.cond_call.is_unconditional() && !cf.cond_call.is_predicated()) { - bool_constant_index = cf.cond_call.bool_address(); + if (sequence & 0b01) { + auto fetch_opcode = FetchOpcode(ucode_data_[instr_offset * 3] & 0x1F); + if (fetch_opcode == FetchOpcode::kVertexFetch) { + auto& op = *reinterpret_cast<const VertexFetchInstruction*>( + ucode_data_.data() + instr_offset * 3); + GatherVertexFetchInformation(op, previous_vfetch_full, + ucode_disasm_buffer); + } else { + auto& op = *reinterpret_cast<const TextureFetchInstruction*>( + ucode_data_.data() + instr_offset * 3); + GatherTextureFetchInformation(op, unique_texture_bindings, + ucode_disasm_buffer); } - break; - case ControlFlowOpcode::kCondJmp: - label_addresses_.insert(cf.cond_jmp.address()); - if (!cf.cond_jmp.is_unconditional() && !cf.cond_jmp.is_predicated()) { - bool_constant_index = cf.cond_jmp.bool_address(); - } - break; - case ControlFlowOpcode::kLoopStart: - label_addresses_.insert(cf.loop_start.address()); - constant_register_map_.loop_bitmap |= uint32_t(1) - << cf.loop_start.loop_id(); - break; - case ControlFlowOpcode::kLoopEnd: - label_addresses_.insert(cf.loop_end.address()); - constant_register_map_.loop_bitmap |= uint32_t(1) - << cf.loop_end.loop_id(); - break; - case ControlFlowOpcode::kAlloc: - if (cf.alloc.alloc_type() == AllocType::kMemory) { - ++memexport_alloc_count_; - } - break; - default: - break; - } - if (bool_constant_index != UINT32_MAX) { - 
constant_register_map_.bool_bitmap[bool_constant_index / 32] |= - uint32_t(1) << (bool_constant_index % 32); - } - - switch (cf.opcode()) { - case ControlFlowOpcode::kExec: - case ControlFlowOpcode::kExecEnd: - case ControlFlowOpcode::kCondExec: - case ControlFlowOpcode::kCondExecEnd: - case ControlFlowOpcode::kCondExecPred: - case ControlFlowOpcode::kCondExecPredEnd: - case ControlFlowOpcode::kCondExecPredClean: - case ControlFlowOpcode::kCondExecPredCleanEnd: { - uint32_t sequence = cf.exec.sequence(); - for (uint32_t instr_offset = cf.exec.address(); - instr_offset < cf.exec.address() + cf.exec.count(); - ++instr_offset, sequence >>= 2) { - bool is_fetch = (sequence & 0x1) == 0x1; - if (is_fetch) { - // Gather vertex and texture fetches. - auto fetch_opcode = - static_cast<FetchOpcode>(ucode_dwords_[instr_offset * 3] & 0x1F); - if (fetch_opcode == FetchOpcode::kVertexFetch) { - assert_true(is_vertex_shader()); - GatherVertexFetchInformation( - *reinterpret_cast<const VertexFetchInstruction*>( - ucode_dwords_ + instr_offset * 3)); - } else { - GatherTextureFetchInformation( - *reinterpret_cast<const TextureFetchInstruction*>( - ucode_dwords_ + instr_offset * 3)); - } - } else { - // Gather info needed for the translation pass because having such - // state changed in the middle of translation may break things. Check - // the comments for each specific variable set here to see usage - // restrictions that can be assumed here (such as only marking exports - // as written if the used write mask is non-empty). 
- auto& op = *reinterpret_cast<const AluInstruction*>(ucode_dwords_ + - instr_offset * 3); - ParsedAluInstruction instr; - ParseAluInstruction(op, instr); - - kills_pixels_ = kills_pixels_ || - ucode::AluVectorOpcodeIsKill(op.vector_opcode()) || - ucode::AluScalarOpcodeIsKill(op.scalar_opcode()); - - if (instr.vector_and_constant_result.storage_target != - InstructionStorageTarget::kRegister || - instr.scalar_result.storage_target != - InstructionStorageTarget::kRegister) { - // Export is done to vector_dest of the ucode instruction for both - // vector and scalar operations - no need to check separately. - assert_true(instr.vector_and_constant_result.storage_target == - instr.scalar_result.storage_target && - instr.vector_and_constant_result.storage_index == - instr.scalar_result.storage_index); - if (instr.vector_and_constant_result.GetUsedWriteMask() || - instr.scalar_result.GetUsedWriteMask()) { - InstructionStorageTarget export_target = - instr.vector_and_constant_result.storage_target; - uint32_t export_index = - instr.vector_and_constant_result.storage_index; - switch (export_target) { - case InstructionStorageTarget::kExportAddress: - // Store used memexport constants because CPU code needs - // addresses and sizes, and also whether there have been - // writes to eA and eM# for register allocation in shader - // translator implementations. - // eA is (hopefully) always written to using: - // mad eA, r#, const0100, c# - // (though there are some exceptions, shaders in Halo 3 for - // some reason set eA to zeros, but the swizzle of the - // constant is not .xyzw in this case, and they don't write to - // eM#). 
- if (memexport_alloc_count_ > 0 && - memexport_alloc_count_ <= kMaxMemExports) { - uint32_t memexport_stream_constant = - instr.GetMemExportStreamConstant(); - if (memexport_stream_constant != UINT32_MAX) { - memexport_eA_written_ |= uint32_t(1) - << (memexport_alloc_count_ - 1); - memexport_stream_constants_.insert( - memexport_stream_constant); - } else { - XELOGE( - "ShaderTranslator::GatherInstructionInformation: " - "Couldn't extract memexport stream constant index"); - } - } - break; - case InstructionStorageTarget::kExportData: - if (memexport_alloc_count_ > 0 && - memexport_alloc_count_ <= kMaxMemExports) { - memexport_eM_written_[memexport_alloc_count_ - 1] |= - uint32_t(1) << export_index; - } - break; - case InstructionStorageTarget::kColor: - writes_color_targets_[export_index] = true; - break; - case InstructionStorageTarget::kDepth: - writes_depth_ = true; - break; - default: - break; - } - } - } else { - if ((instr.vector_and_constant_result.GetUsedWriteMask() && - instr.vector_and_constant_result.storage_addressing_mode != - InstructionStorageAddressingMode::kStatic) || - (instr.scalar_result.GetUsedWriteMask() && - instr.scalar_result.storage_addressing_mode != - InstructionStorageAddressingMode::kStatic)) { - uses_register_dynamic_addressing_ = true; - } - } - - uint32_t total_operand_count = - instr.vector_operand_count + instr.scalar_operand_count; - for (uint32_t i = 0; i < total_operand_count; ++i) { - const InstructionOperand& operand = - (i < instr.vector_operand_count) - ? 
instr.vector_operands[i] - : instr.scalar_operands[i - instr.vector_operand_count]; - if (operand.storage_source == InstructionStorageSource::kRegister) { - if (operand.storage_addressing_mode != - InstructionStorageAddressingMode::kStatic) { - uses_register_dynamic_addressing_ = true; - } - } else if (operand.storage_source == - InstructionStorageSource::kConstantFloat) { - if (operand.storage_addressing_mode == - InstructionStorageAddressingMode::kStatic) { - // Store used float constants before translating so the - // translator can use tightly packed indices if not dynamically - // indexed. - uint32_t constant_index = operand.storage_index; - constant_register_map_.float_bitmap[constant_index / 64] |= - uint64_t(1) << (constant_index % 64); - } else { - constant_register_map_.float_dynamic_addressing = true; - } - } - } - } - } - } break; - default: - break; + } else { + auto& op = *reinterpret_cast<const AluInstruction*>(ucode_data_.data() + + instr_offset * 3); + GatherAluInstructionInformation(op, memexport_alloc_current_count, + memexport_eA_written, + ucode_disasm_buffer); + } } } -void ShaderTranslator::GatherVertexFetchInformation( - const VertexFetchInstruction& op) { +void Shader::GatherVertexFetchInformation( + const VertexFetchInstruction& op, + VertexFetchInstruction& previous_vfetch_full, + StringBuffer& ucode_disasm_buffer) { ParsedVertexFetchInstruction fetch_instr; - ParseVertexFetchInstruction(op, &fetch_instr); + if (ParseVertexFetchInstruction(op, previous_vfetch_full, fetch_instr)) { + previous_vfetch_full = op; + } + fetch_instr.Disassemble(&ucode_disasm_buffer); + + GatherFetchResultInformation(fetch_instr.result); // Don't bother setting up a binding for an instruction that fetches nothing. - if (!op.fetches_any_data()) { + if (!fetch_instr.result.GetUsedResultComponents()) { return; } - // Check if using dynamic register indices. 
- if (op.is_dest_relative() || op.is_src_relative()) { - uses_register_dynamic_addressing_ = true; + for (size_t i = 0; i < fetch_instr.operand_count; ++i) { + GatherOperandInformation(fetch_instr.operands[i]); } // Try to allocate an attribute on an existing binding. @@ -500,17 +310,19 @@ void ShaderTranslator::GatherVertexFetchInformation( } // Populate attribute. - attrib->attrib_index = total_attrib_count_++; attrib->fetch_instr = fetch_instr; - attrib->size_words = xenos::GetVertexFormatSizeInWords( - attrib->fetch_instr.attributes.data_format); } -void ShaderTranslator::GatherTextureFetchInformation( - const TextureFetchInstruction& op) { - // Check if using dynamic register indices. - if (op.is_dest_relative() || op.is_src_relative()) { - uses_register_dynamic_addressing_ = true; +void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op, + uint32_t& unique_texture_bindings, + StringBuffer& ucode_disasm_buffer) { + TextureBinding binding; + ParseTextureFetchInstruction(op, binding.fetch_instr); + binding.fetch_instr.Disassemble(&ucode_disasm_buffer); + + GatherFetchResultInformation(binding.fetch_instr.result); + for (size_t i = 0; i < binding.fetch_instr.operand_count; ++i) { + GatherOperandInformation(binding.fetch_instr.operands[i]); } switch (op.opcode()) { @@ -523,9 +335,7 @@ void ShaderTranslator::GatherTextureFetchInformation( // Continue. break; } - Shader::TextureBinding binding; binding.binding_index = -1; - ParseTextureFetchInstruction(op, &binding.fetch_instr); binding.fetch_constant = binding.fetch_instr.operands[1].storage_index; // Check and see if this fetch constant was previously used... @@ -538,349 +348,502 @@ void ShaderTranslator::GatherTextureFetchInformation( if (binding.binding_index == -1) { // Assign a unique binding index. 
- binding.binding_index = unique_texture_bindings_++; + binding.binding_index = unique_texture_bindings++; } texture_bindings_.emplace_back(std::move(binding)); } -std::vector<uint8_t> UcodeShaderTranslator::CompleteTranslation() { - return ucode_disasm_buffer().to_bytes(); +void Shader::GatherAluInstructionInformation( + const AluInstruction& op, uint32_t memexport_alloc_current_count, + uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) { + ParsedAluInstruction instr; + ParseAluInstruction(op, type(), instr); + instr.Disassemble(&ucode_disasm_buffer); + + kills_pixels_ = kills_pixels_ || + ucode::AluVectorOpcodeIsKill(op.vector_opcode()) || + ucode::AluScalarOpcodeIsKill(op.scalar_opcode()); + + GatherAluResultInformation(instr.vector_and_constant_result, + memexport_alloc_current_count); + GatherAluResultInformation(instr.scalar_result, + memexport_alloc_current_count); + for (size_t i = 0; i < instr.vector_operand_count; ++i) { + GatherOperandInformation(instr.vector_operands[i]); + } + for (size_t i = 0; i < instr.scalar_operand_count; ++i) { + GatherOperandInformation(instr.scalar_operands[i]); + } + + // Store used memexport constants because CPU code needs addresses and sizes, + // and also whether there have been writes to eA and eM# for register + // allocation in shader translator implementations. + // eA is (hopefully) always written to using: + // mad eA, r#, const0100, c# + // (though there are some exceptions, shaders in Halo 3 for some reason set eA + // to zeros, but the swizzle of the constant is not .xyzw in this case, and + // they don't write to eM#). + // Export is done to vector_dest of the ucode instruction for both vector and + // scalar operations - no need to check separately. 
+ if (instr.vector_and_constant_result.storage_target == + InstructionStorageTarget::kExportAddress && + memexport_alloc_current_count > 0 && + memexport_alloc_current_count <= Shader::kMaxMemExports) { + uint32_t memexport_stream_constant = instr.GetMemExportStreamConstant(); + if (memexport_stream_constant != UINT32_MAX) { + memexport_eA_written |= uint32_t(1) + << (memexport_alloc_current_count - 1); + memexport_stream_constants_.insert(memexport_stream_constant); + } else { + XELOGE( + "ShaderTranslator::GatherAluInstructionInformation: Couldn't extract " + "memexport stream constant index"); + } + } +} + +void Shader::GatherOperandInformation(const InstructionOperand& operand) { + switch (operand.storage_source) { + case InstructionStorageSource::kRegister: + if (operand.storage_addressing_mode == + InstructionStorageAddressingMode::kStatic) { + register_static_address_bound_ = + std::max(register_static_address_bound_, + operand.storage_index + uint32_t(1)); + } else { + uses_register_dynamic_addressing_ = true; + } + break; + case InstructionStorageSource::kConstantFloat: + if (operand.storage_addressing_mode == + InstructionStorageAddressingMode::kStatic) { + // Store used float constants before translating so the + // translator can use tightly packed indices if not dynamically + // indexed. + constant_register_map_.float_bitmap[operand.storage_index >> 6] |= + uint64_t(1) << (operand.storage_index & 63); + } else { + constant_register_map_.float_dynamic_addressing = true; + } + break; + default: + break; + } +} + +void Shader::GatherFetchResultInformation(const InstructionResult& result) { + if (!result.GetUsedWriteMask()) { + return; + } + // Fetch instructions can't export - don't need the current memexport count + // operand. 
+ assert_true(result.storage_target == InstructionStorageTarget::kRegister); + if (result.storage_addressing_mode == + InstructionStorageAddressingMode::kStatic) { + register_static_address_bound_ = std::max( + register_static_address_bound_, result.storage_index + uint32_t(1)); + } else { + uses_register_dynamic_addressing_ = true; + } +} + +void Shader::GatherAluResultInformation( + const InstructionResult& result, uint32_t memexport_alloc_current_count) { + if (!result.GetUsedWriteMask()) { + return; + } + switch (result.storage_target) { + case InstructionStorageTarget::kRegister: + if (result.storage_addressing_mode == + InstructionStorageAddressingMode::kStatic) { + register_static_address_bound_ = std::max( + register_static_address_bound_, result.storage_index + uint32_t(1)); + } else { + uses_register_dynamic_addressing_ = true; + } + break; + case InstructionStorageTarget::kExportData: + if (memexport_alloc_current_count > 0 && + memexport_alloc_current_count <= Shader::kMaxMemExports) { + memexport_eM_written_[memexport_alloc_current_count - 1] |= + uint32_t(1) << result.storage_index; + } + break; + case InstructionStorageTarget::kColor: + writes_color_targets_ |= uint32_t(1) << result.storage_index; + break; + case InstructionStorageTarget::kDepth: + writes_depth_ = true; + break; + } +} + +ShaderTranslator::ShaderTranslator() = default; + +ShaderTranslator::~ShaderTranslator() = default; + +void ShaderTranslator::Reset() { + errors_.clear(); + std::memset(&previous_vfetch_full_, 0, sizeof(previous_vfetch_full_)); +} + +bool ShaderTranslator::TranslateAnalyzedShader( + Shader::Translation& translation) { + const Shader& shader = translation.shader(); + assert_true(shader.is_ucode_analyzed()); + if (!shader.is_ucode_analyzed()) { + XELOGE("AnalyzeUcode must be done on the shader before translation"); + return false; + } + translation_ = &translation; + + Reset(); + + register_count_ = shader.register_static_address_bound(); + if 
(shader.uses_register_dynamic_addressing()) { + // An array of registers at the end of the r# space may be dynamically + // addressable - ensure enough space, as specified in SQ_PROGRAM_CNTL, is + // allocated. + register_count_ = std::max(register_count_, GetModificationRegisterCount()); + } + + StartTranslation(); + + const uint32_t* ucode_dwords = shader.ucode_data().data(); + + // TODO(Triang3l): Remove when the old SPIR-V shader translator is deleted. + uint32_t cf_pair_index_bound = shader.cf_pair_index_bound(); + std::vector<ControlFlowInstruction> cf_instructions; + for (uint32_t i = 0; i < cf_pair_index_bound; ++i) { + ControlFlowInstruction cf_ab[2]; + UnpackControlFlowInstructions(ucode_dwords + i * 3, cf_ab); + cf_instructions.push_back(cf_ab[0]); + cf_instructions.push_back(cf_ab[1]); + } + PreProcessControlFlowInstructions(cf_instructions); + + // Translate all instructions. + const std::set<uint32_t>& label_addresses = shader.label_addresses(); + for (uint32_t i = 0; i < cf_pair_index_bound; ++i) { + ControlFlowInstruction cf_ab[2]; + UnpackControlFlowInstructions(ucode_dwords + i * 3, cf_ab); + for (uint32_t j = 0; j < 2; ++j) { + uint32_t cf_index = i * 2 + j; + cf_index_ = cf_index; + if (label_addresses.find(cf_index) != label_addresses.end()) { + ProcessLabel(cf_index); + } + ProcessControlFlowInstructionBegin(cf_index); + TranslateControlFlowInstruction(cf_ab[j]); + ProcessControlFlowInstructionEnd(cf_index); + } + } + + translation.errors_ = std::move(errors_); + translation.translated_binary_ = CompleteTranslation(); + translation.is_translated_ = true; + + bool is_valid = true; + for (const auto& error : translation.errors_) { + if (error.is_fatal) { + is_valid = false; + break; + } + } + translation.is_valid_ = is_valid; + + PostTranslation(); + + // In case is_valid_ is modified by PostTranslation, reload. 
+ return translation.is_valid_; +} + +void ShaderTranslator::EmitTranslationError(const char* message, + bool is_fatal) { + Shader::Error error; + error.is_fatal = is_fatal; + error.message = message; + // TODO(benvanik): location information. + errors_.push_back(std::move(error)); + XELOGE("Shader translation {}error: {}", is_fatal ? "fatal " : "", message); } void ShaderTranslator::TranslateControlFlowInstruction( const ControlFlowInstruction& cf) { switch (cf.opcode()) { case ControlFlowOpcode::kNop: - TranslateControlFlowNop(cf); + ProcessControlFlowNopInstruction(cf_index_); break; case ControlFlowOpcode::kExec: - TranslateControlFlowExec(cf.exec); - break; - case ControlFlowOpcode::kExecEnd: - TranslateControlFlowExec(cf.exec); - break; + case ControlFlowOpcode::kExecEnd: { + ParsedExecInstruction instr; + ParseControlFlowExec(cf.exec, cf_index_, instr); + TranslateExecInstructions(instr); + } break; case ControlFlowOpcode::kCondExec: - TranslateControlFlowCondExec(cf.cond_exec); - break; case ControlFlowOpcode::kCondExecEnd: - TranslateControlFlowCondExec(cf.cond_exec); - break; - case ControlFlowOpcode::kCondExecPred: - TranslateControlFlowCondExecPred(cf.cond_exec_pred); - break; - case ControlFlowOpcode::kCondExecPredEnd: - TranslateControlFlowCondExecPred(cf.cond_exec_pred); - break; case ControlFlowOpcode::kCondExecPredClean: - TranslateControlFlowCondExec(cf.cond_exec); - break; - case ControlFlowOpcode::kCondExecPredCleanEnd: - TranslateControlFlowCondExec(cf.cond_exec); - break; - case ControlFlowOpcode::kLoopStart: - TranslateControlFlowLoopStart(cf.loop_start); - break; - case ControlFlowOpcode::kLoopEnd: - TranslateControlFlowLoopEnd(cf.loop_end); - break; - case ControlFlowOpcode::kCondCall: - TranslateControlFlowCondCall(cf.cond_call); - break; - case ControlFlowOpcode::kReturn: - TranslateControlFlowReturn(cf.ret); - break; - case ControlFlowOpcode::kCondJmp: - TranslateControlFlowCondJmp(cf.cond_jmp); - break; - case ControlFlowOpcode::kAlloc: 
- TranslateControlFlowAlloc(cf.alloc); - break; + case ControlFlowOpcode::kCondExecPredCleanEnd: { + ParsedExecInstruction instr; + ParseControlFlowCondExec(cf.cond_exec, cf_index_, instr); + TranslateExecInstructions(instr); + } break; + case ControlFlowOpcode::kCondExecPred: + case ControlFlowOpcode::kCondExecPredEnd: { + ParsedExecInstruction instr; + ParseControlFlowCondExecPred(cf.cond_exec_pred, cf_index_, instr); + TranslateExecInstructions(instr); + } break; + case ControlFlowOpcode::kLoopStart: { + ParsedLoopStartInstruction instr; + ParseControlFlowLoopStart(cf.loop_start, cf_index_, instr); + ProcessLoopStartInstruction(instr); + } break; + case ControlFlowOpcode::kLoopEnd: { + ParsedLoopEndInstruction instr; + ParseControlFlowLoopEnd(cf.loop_end, cf_index_, instr); + ProcessLoopEndInstruction(instr); + } break; + case ControlFlowOpcode::kCondCall: { + ParsedCallInstruction instr; + ParseControlFlowCondCall(cf.cond_call, cf_index_, instr); + ProcessCallInstruction(instr); + } break; + case ControlFlowOpcode::kReturn: { + ParsedReturnInstruction instr; + ParseControlFlowReturn(cf.ret, cf_index_, instr); + ProcessReturnInstruction(instr); + } break; + case ControlFlowOpcode::kCondJmp: { + ParsedJumpInstruction instr; + ParseControlFlowCondJmp(cf.cond_jmp, cf_index_, instr); + ProcessJumpInstruction(instr); + } break; + case ControlFlowOpcode::kAlloc: { + ParsedAllocInstruction instr; + ParseControlFlowAlloc(cf.alloc, cf_index_, is_vertex_shader(), instr); + ProcessAllocInstruction(instr); + } break; case ControlFlowOpcode::kMarkVsFetchDone: break; default: assert_unhandled_case(cf.opcode); break; } - bool ends_shader = DoesControlFlowOpcodeEndShader(cf.opcode()); - if (ends_shader) { - // TODO(benvanik): return? - } + // TODO(benvanik): return if (DoesControlFlowOpcodeEndShader(cf.opcode()))? 
} -void ShaderTranslator::TranslateControlFlowNop( - const ControlFlowInstruction& cf) { - ucode_disasm_buffer_.Append(" cnop\n"); - - ProcessControlFlowNopInstruction(cf_index_); +void ParseControlFlowExec(const ControlFlowExecInstruction& cf, + uint32_t cf_index, ParsedExecInstruction& instr) { + instr.dword_index = cf_index; + instr.opcode = cf.opcode(); + instr.opcode_name = + cf.opcode() == ControlFlowOpcode::kExecEnd ? "exece" : "exec"; + instr.instruction_address = cf.address(); + instr.instruction_count = cf.count(); + instr.type = ParsedExecInstruction::Type::kUnconditional; + instr.is_end = cf.opcode() == ControlFlowOpcode::kExecEnd; + instr.clean = cf.clean(); + instr.is_yield = cf.is_yield(); + instr.sequence = cf.sequence(); } -void ShaderTranslator::TranslateControlFlowExec( - const ControlFlowExecInstruction& cf) { - ParsedExecInstruction i; - i.dword_index = cf_index_; - i.opcode = cf.opcode(); - i.opcode_name = cf.opcode() == ControlFlowOpcode::kExecEnd ? "exece" : "exec"; - i.instruction_address = cf.address(); - i.instruction_count = cf.count(); - i.type = ParsedExecInstruction::Type::kUnconditional; - i.is_end = cf.opcode() == ControlFlowOpcode::kExecEnd; - i.clean = cf.clean(); - i.is_yield = cf.is_yield(); - i.sequence = cf.sequence(); - - TranslateExecInstructions(i); -} - -void ShaderTranslator::TranslateControlFlowCondExec( - const ControlFlowCondExecInstruction& cf) { - ParsedExecInstruction i; - i.dword_index = cf_index_; - i.opcode = cf.opcode(); - i.opcode_name = "cexec"; +void ParseControlFlowCondExec(const ControlFlowCondExecInstruction& cf, + uint32_t cf_index, ParsedExecInstruction& instr) { + instr.dword_index = cf_index; + instr.opcode = cf.opcode(); + instr.opcode_name = "cexec"; switch (cf.opcode()) { case ControlFlowOpcode::kCondExecEnd: case ControlFlowOpcode::kCondExecPredCleanEnd: - i.opcode_name = "cexece"; - i.is_end = true; + instr.opcode_name = "cexece"; + instr.is_end = true; break; default: break; } - 
i.instruction_address = cf.address(); - i.instruction_count = cf.count(); - i.type = ParsedExecInstruction::Type::kConditional; - i.bool_constant_index = cf.bool_address(); - assert_not_zero( - constant_register_map_.bool_bitmap[i.bool_constant_index / 32] & - (uint32_t(1) << (i.bool_constant_index % 32))); - i.condition = cf.condition(); + instr.instruction_address = cf.address(); + instr.instruction_count = cf.count(); + instr.type = ParsedExecInstruction::Type::kConditional; + instr.bool_constant_index = cf.bool_address(); + instr.condition = cf.condition(); switch (cf.opcode()) { case ControlFlowOpcode::kCondExec: case ControlFlowOpcode::kCondExecEnd: - i.clean = false; + instr.clean = false; break; default: break; } - i.is_yield = cf.is_yield(); - i.sequence = cf.sequence(); - - TranslateExecInstructions(i); + instr.is_yield = cf.is_yield(); + instr.sequence = cf.sequence(); } -void ShaderTranslator::TranslateControlFlowCondExecPred( - const ControlFlowCondExecPredInstruction& cf) { - ParsedExecInstruction i; - i.dword_index = cf_index_; - i.opcode = cf.opcode(); - i.opcode_name = +void ParseControlFlowCondExecPred(const ControlFlowCondExecPredInstruction& cf, + uint32_t cf_index, + ParsedExecInstruction& instr) { + instr.dword_index = cf_index; + instr.opcode = cf.opcode(); + instr.opcode_name = cf.opcode() == ControlFlowOpcode::kCondExecPredEnd ? 
"exece" : "exec"; - i.instruction_address = cf.address(); - i.instruction_count = cf.count(); - i.type = ParsedExecInstruction::Type::kPredicated; - i.condition = cf.condition(); - i.is_end = cf.opcode() == ControlFlowOpcode::kCondExecPredEnd; - i.clean = cf.clean(); - i.is_yield = cf.is_yield(); - i.sequence = cf.sequence(); - - TranslateExecInstructions(i); + instr.instruction_address = cf.address(); + instr.instruction_count = cf.count(); + instr.type = ParsedExecInstruction::Type::kPredicated; + instr.condition = cf.condition(); + instr.is_end = cf.opcode() == ControlFlowOpcode::kCondExecPredEnd; + instr.clean = cf.clean(); + instr.is_yield = cf.is_yield(); + instr.sequence = cf.sequence(); } -void ShaderTranslator::TranslateControlFlowLoopStart( - const ControlFlowLoopStartInstruction& cf) { - ParsedLoopStartInstruction i; - i.dword_index = cf_index_; - i.loop_constant_index = cf.loop_id(); - assert_not_zero(constant_register_map_.loop_bitmap & - (uint32_t(1) << i.loop_constant_index)); - i.is_repeat = cf.is_repeat(); - i.loop_skip_address = cf.address(); - - i.Disassemble(&ucode_disasm_buffer_); - - ProcessLoopStartInstruction(i); +void ParseControlFlowLoopStart(const ControlFlowLoopStartInstruction& cf, + uint32_t cf_index, + ParsedLoopStartInstruction& instr) { + instr.dword_index = cf_index; + instr.loop_constant_index = cf.loop_id(); + instr.is_repeat = cf.is_repeat(); + instr.loop_skip_address = cf.address(); } -void ShaderTranslator::TranslateControlFlowLoopEnd( - const ControlFlowLoopEndInstruction& cf) { - ParsedLoopEndInstruction i; - i.dword_index = cf_index_; - i.is_predicated_break = cf.is_predicated_break(); - i.predicate_condition = cf.condition(); - i.loop_constant_index = cf.loop_id(); - assert_not_zero(constant_register_map_.loop_bitmap & - (uint32_t(1) << i.loop_constant_index)); - i.loop_body_address = cf.address(); - - i.Disassemble(&ucode_disasm_buffer_); - - ProcessLoopEndInstruction(i); +void ParseControlFlowLoopEnd(const 
ControlFlowLoopEndInstruction& cf, + uint32_t cf_index, + ParsedLoopEndInstruction& instr) { + instr.dword_index = cf_index; + instr.is_predicated_break = cf.is_predicated_break(); + instr.predicate_condition = cf.condition(); + instr.loop_constant_index = cf.loop_id(); + instr.loop_body_address = cf.address(); } -void ShaderTranslator::TranslateControlFlowCondCall( - const ControlFlowCondCallInstruction& cf) { - ParsedCallInstruction i; - i.dword_index = cf_index_; - i.target_address = cf.address(); +void ParseControlFlowCondCall(const ControlFlowCondCallInstruction& cf, + uint32_t cf_index, ParsedCallInstruction& instr) { + instr.dword_index = cf_index; + instr.target_address = cf.address(); if (cf.is_unconditional()) { - i.type = ParsedCallInstruction::Type::kUnconditional; + instr.type = ParsedCallInstruction::Type::kUnconditional; } else if (cf.is_predicated()) { - i.type = ParsedCallInstruction::Type::kPredicated; - i.condition = cf.condition(); + instr.type = ParsedCallInstruction::Type::kPredicated; + instr.condition = cf.condition(); } else { - i.type = ParsedCallInstruction::Type::kConditional; - i.bool_constant_index = cf.bool_address(); - assert_not_zero( - constant_register_map_.bool_bitmap[i.bool_constant_index / 32] & - (uint32_t(1) << (i.bool_constant_index % 32))); - i.condition = cf.condition(); + instr.type = ParsedCallInstruction::Type::kConditional; + instr.bool_constant_index = cf.bool_address(); + instr.condition = cf.condition(); } - - i.Disassemble(&ucode_disasm_buffer_); - - ProcessCallInstruction(i); } -void ShaderTranslator::TranslateControlFlowReturn( - const ControlFlowReturnInstruction& cf) { - ParsedReturnInstruction i; - i.dword_index = cf_index_; - - i.Disassemble(&ucode_disasm_buffer_); - - ProcessReturnInstruction(i); +void ParseControlFlowReturn(const ControlFlowReturnInstruction& cf, + uint32_t cf_index, ParsedReturnInstruction& instr) { + instr.dword_index = cf_index; } -void ShaderTranslator::TranslateControlFlowCondJmp( - 
const ControlFlowCondJmpInstruction& cf) { - ParsedJumpInstruction i; - i.dword_index = cf_index_; - i.target_address = cf.address(); +void ParseControlFlowCondJmp(const ControlFlowCondJmpInstruction& cf, + uint32_t cf_index, ParsedJumpInstruction& instr) { + instr.dword_index = cf_index; + instr.target_address = cf.address(); if (cf.is_unconditional()) { - i.type = ParsedJumpInstruction::Type::kUnconditional; + instr.type = ParsedJumpInstruction::Type::kUnconditional; } else if (cf.is_predicated()) { - i.type = ParsedJumpInstruction::Type::kPredicated; - i.condition = cf.condition(); + instr.type = ParsedJumpInstruction::Type::kPredicated; + instr.condition = cf.condition(); } else { - i.type = ParsedJumpInstruction::Type::kConditional; - i.bool_constant_index = cf.bool_address(); - assert_not_zero( - constant_register_map_.bool_bitmap[i.bool_constant_index / 32] & - (uint32_t(1) << (i.bool_constant_index % 32))); - i.condition = cf.condition(); + instr.type = ParsedJumpInstruction::Type::kConditional; + instr.bool_constant_index = cf.bool_address(); + instr.condition = cf.condition(); } - - i.Disassemble(&ucode_disasm_buffer_); - - ProcessJumpInstruction(i); } -void ShaderTranslator::TranslateControlFlowAlloc( - const ControlFlowAllocInstruction& cf) { - ParsedAllocInstruction i; - i.dword_index = cf_index_; - i.type = cf.alloc_type(); - i.count = cf.size(); - i.is_vertex_shader = is_vertex_shader(); - - i.Disassemble(&ucode_disasm_buffer_); - - ProcessAllocInstruction(i); +void ParseControlFlowAlloc(const ControlFlowAllocInstruction& cf, + uint32_t cf_index, bool is_vertex_shader, + ParsedAllocInstruction& instr) { + instr.dword_index = cf_index; + instr.type = cf.alloc_type(); + instr.count = cf.size(); + instr.is_vertex_shader = is_vertex_shader; } void ShaderTranslator::TranslateExecInstructions( const ParsedExecInstruction& instr) { - instr.Disassemble(&ucode_disasm_buffer_); - ProcessExecInstructionBegin(instr); - + const uint32_t* ucode_dwords = 
current_shader().ucode_data().data(); uint32_t sequence = instr.sequence; for (uint32_t instr_offset = instr.instruction_address; instr_offset < instr.instruction_address + instr.instruction_count; ++instr_offset, sequence >>= 2) { - MarkUcodeInstruction(instr_offset); - AppendUcodeDisasmFormat("/* %4u */ ", instr_offset); - bool is_sync = (sequence & 0x2) == 0x2; - bool is_fetch = (sequence & 0x1) == 0x1; - if (is_sync) { - AppendUcodeDisasm(" serialize\n "); - } - if (is_fetch) { + if (sequence & 0b01) { auto fetch_opcode = - static_cast<FetchOpcode>(ucode_dwords_[instr_offset * 3] & 0x1F); + static_cast<FetchOpcode>(ucode_dwords[instr_offset * 3] & 0x1F); if (fetch_opcode == FetchOpcode::kVertexFetch) { auto& op = *reinterpret_cast<const VertexFetchInstruction*>( - ucode_dwords_ + instr_offset * 3); - TranslateVertexFetchInstruction(op); + ucode_dwords + instr_offset * 3); + ParsedVertexFetchInstruction vfetch_instr; + if (ParseVertexFetchInstruction(op, previous_vfetch_full_, + vfetch_instr)) { + previous_vfetch_full_ = op; + } + ProcessVertexFetchInstruction(vfetch_instr); } else { auto& op = *reinterpret_cast<const TextureFetchInstruction*>( - ucode_dwords_ + instr_offset * 3); - TranslateTextureFetchInstruction(op); + ucode_dwords + instr_offset * 3); + ParsedTextureFetchInstruction tfetch_instr; + ParseTextureFetchInstruction(op, tfetch_instr); + ProcessTextureFetchInstruction(tfetch_instr); } } else { - auto& op = *reinterpret_cast<const AluInstruction*>(ucode_dwords_ + + auto& op = *reinterpret_cast<const AluInstruction*>(ucode_dwords + instr_offset * 3); - TranslateAluInstruction(op); + ParsedAluInstruction alu_instr; + ParseAluInstruction(op, current_shader().type(), alu_instr); + ProcessAluInstruction(alu_instr); } } - ProcessExecInstructionEnd(instr); } -void ParseFetchInstructionResult(uint32_t dest, uint32_t swizzle, - bool is_relative, - InstructionResult* out_result) { - out_result->storage_target = InstructionStorageTarget::kRegister; - 
out_result->storage_index = dest; - out_result->is_clamped = false; - out_result->storage_addressing_mode = +static void ParseFetchInstructionResult(uint32_t dest, uint32_t swizzle, + bool is_relative, + InstructionResult& result) { + result.storage_target = InstructionStorageTarget::kRegister; + result.storage_index = dest; + result.is_clamped = false; + result.storage_addressing_mode = is_relative ? InstructionStorageAddressingMode::kAddressRelative : InstructionStorageAddressingMode::kStatic; - out_result->original_write_mask = 0b1111; + result.original_write_mask = 0b1111; for (int i = 0; i < 4; ++i) { switch (swizzle & 0x7) { case 4: case 6: - out_result->components[i] = SwizzleSource::k0; + result.components[i] = SwizzleSource::k0; break; case 5: - out_result->components[i] = SwizzleSource::k1; + result.components[i] = SwizzleSource::k1; break; case 7: - out_result->original_write_mask &= ~uint32_t(1 << i); + result.original_write_mask &= ~uint32_t(1 << i); break; default: - out_result->components[i] = GetSwizzleFromComponentIndex(swizzle & 0x3); + result.components[i] = GetSwizzleFromComponentIndex(swizzle & 0x3); } swizzle >>= 3; } } -void ShaderTranslator::TranslateVertexFetchInstruction( - const VertexFetchInstruction& op) { - ParsedVertexFetchInstruction instr; - ParseVertexFetchInstruction(op, &instr); - instr.Disassemble(&ucode_disasm_buffer_); - ProcessVertexFetchInstruction(instr); -} - -void ShaderTranslator::ParseVertexFetchInstruction( - const VertexFetchInstruction& op, ParsedVertexFetchInstruction* out_instr) { - auto& i = *out_instr; - i.opcode = FetchOpcode::kVertexFetch; - i.opcode_name = op.is_mini_fetch() ? 
"vfetch_mini" : "vfetch_full"; - i.is_mini_fetch = op.is_mini_fetch(); - i.is_predicated = op.is_predicated(); - i.predicate_condition = op.predicate_condition(); +bool ParseVertexFetchInstruction(const VertexFetchInstruction& op, + const VertexFetchInstruction& previous_full_op, + ParsedVertexFetchInstruction& instr) { + instr.opcode = FetchOpcode::kVertexFetch; + instr.opcode_name = op.is_mini_fetch() ? "vfetch_mini" : "vfetch_full"; + instr.is_mini_fetch = op.is_mini_fetch(); + instr.is_predicated = op.is_predicated(); + instr.predicate_condition = op.predicate_condition(); ParseFetchInstructionResult(op.dest(), op.dest_swizzle(), - op.is_dest_relative(), &i.result); + op.is_dest_relative(), instr.result); // Reuse previous vfetch_full if this is a mini. - const auto& full_op = op.is_mini_fetch() ? previous_vfetch_full_ : op; - auto& src_op = i.operands[i.operand_count++]; + const auto& full_op = op.is_mini_fetch() ? previous_full_op : op; + auto& src_op = instr.operands[instr.operand_count++]; src_op.storage_source = InstructionStorageSource::kRegister; src_op.storage_index = full_op.src(); src_op.storage_addressing_mode = @@ -895,37 +858,25 @@ void ShaderTranslator::ParseVertexFetchInstruction( src_op.components[j] = GetSwizzleFromComponentIndex(swizzle & 0x3); } - auto& const_op = i.operands[i.operand_count++]; + auto& const_op = instr.operands[instr.operand_count++]; const_op.storage_source = InstructionStorageSource::kVertexFetchConstant; const_op.storage_index = full_op.fetch_constant_index(); - i.attributes.data_format = op.data_format(); - i.attributes.offset = op.offset(); - i.attributes.stride = full_op.stride(); - i.attributes.exp_adjust = op.exp_adjust(); - i.attributes.prefetch_count = op.prefetch_count(); - i.attributes.is_index_rounded = op.is_index_rounded(); - i.attributes.is_signed = op.is_signed(); - i.attributes.is_integer = !op.is_normalized(); - i.attributes.signed_rf_mode = op.signed_rf_mode(); + instr.attributes.data_format = 
op.data_format(); + instr.attributes.offset = op.offset(); + instr.attributes.stride = full_op.stride(); + instr.attributes.exp_adjust = op.exp_adjust(); + instr.attributes.prefetch_count = op.prefetch_count(); + instr.attributes.is_index_rounded = op.is_index_rounded(); + instr.attributes.is_signed = op.is_signed(); + instr.attributes.is_integer = !op.is_normalized(); + instr.attributes.signed_rf_mode = op.signed_rf_mode(); - // Store for later use by mini fetches. - if (!op.is_mini_fetch()) { - previous_vfetch_full_ = op; - } + return !op.is_mini_fetch(); } -void ShaderTranslator::TranslateTextureFetchInstruction( - const TextureFetchInstruction& op) { - ParsedTextureFetchInstruction instr; - ParseTextureFetchInstruction(op, &instr); - instr.Disassemble(&ucode_disasm_buffer_); - ProcessTextureFetchInstruction(instr); -} - -void ShaderTranslator::ParseTextureFetchInstruction( - const TextureFetchInstruction& op, - ParsedTextureFetchInstruction* out_instr) { +void ParseTextureFetchInstruction(const TextureFetchInstruction& op, + ParsedTextureFetchInstruction& instr) { struct TextureFetchOpcodeInfo { const char* name; bool has_dest; @@ -975,21 +926,20 @@ void ShaderTranslator::ParseTextureFetchInstruction( return; } - auto& i = *out_instr; - i.opcode = op.opcode(); - i.opcode_name = opcode_info.name; - i.dimension = op.dimension(); - i.is_predicated = op.is_predicated(); - i.predicate_condition = op.predicate_condition(); + instr.opcode = op.opcode(); + instr.opcode_name = opcode_info.name; + instr.dimension = op.dimension(); + instr.is_predicated = op.is_predicated(); + instr.predicate_condition = op.predicate_condition(); if (opcode_info.has_dest) { ParseFetchInstructionResult(op.dest(), op.dest_swizzle(), - op.is_dest_relative(), &i.result); + op.is_dest_relative(), instr.result); } else { - i.result.storage_target = InstructionStorageTarget::kNone; + instr.result.storage_target = InstructionStorageTarget::kNone; } - auto& src_op = i.operands[i.operand_count++]; 
+ auto& src_op = instr.operands[instr.operand_count++]; src_op.storage_source = InstructionStorageSource::kRegister; src_op.storage_index = op.src(); src_op.storage_addressing_mode = @@ -1007,27 +957,27 @@ void ShaderTranslator::ParseTextureFetchInstruction( } if (opcode_info.has_const) { - auto& const_op = i.operands[i.operand_count++]; + auto& const_op = instr.operands[instr.operand_count++]; const_op.storage_source = InstructionStorageSource::kTextureFetchConstant; const_op.storage_index = op.fetch_constant_index(); } if (opcode_info.has_attributes) { - i.attributes.fetch_valid_only = op.fetch_valid_only(); - i.attributes.unnormalized_coordinates = op.unnormalized_coordinates(); - i.attributes.mag_filter = op.mag_filter(); - i.attributes.min_filter = op.min_filter(); - i.attributes.mip_filter = op.mip_filter(); - i.attributes.aniso_filter = op.aniso_filter(); - i.attributes.vol_mag_filter = op.vol_mag_filter(); - i.attributes.vol_min_filter = op.vol_min_filter(); - i.attributes.use_computed_lod = op.use_computed_lod(); - i.attributes.use_register_lod = op.use_register_lod(); - i.attributes.use_register_gradients = op.use_register_gradients(); - i.attributes.lod_bias = op.lod_bias(); - i.attributes.offset_x = op.offset_x(); - i.attributes.offset_y = op.offset_y(); - i.attributes.offset_z = op.offset_z(); + instr.attributes.fetch_valid_only = op.fetch_valid_only(); + instr.attributes.unnormalized_coordinates = op.unnormalized_coordinates(); + instr.attributes.mag_filter = op.mag_filter(); + instr.attributes.min_filter = op.min_filter(); + instr.attributes.mip_filter = op.mip_filter(); + instr.attributes.aniso_filter = op.aniso_filter(); + instr.attributes.vol_mag_filter = op.vol_mag_filter(); + instr.attributes.vol_min_filter = op.vol_min_filter(); + instr.attributes.use_computed_lod = op.use_computed_lod(); + instr.attributes.use_register_lod = op.use_register_lod(); + instr.attributes.use_register_gradients = op.use_register_gradients(); + 
instr.attributes.lod_bias = op.lod_bias(); + instr.attributes.offset_x = op.offset_x(); + instr.attributes.offset_y = op.offset_y(); + instr.attributes.offset_z = op.offset_z(); } } @@ -1079,250 +1029,102 @@ uint32_t ParsedTextureFetchInstruction::GetNonZeroResultComponents() const { return result.GetUsedResultComponents() & components; } -const ShaderTranslator::AluOpcodeInfo - ShaderTranslator::alu_vector_opcode_infos_[0x20] = { - {"add", 2, 4}, // 0 - {"mul", 2, 4}, // 1 - {"max", 2, 4}, // 2 - {"min", 2, 4}, // 3 - {"seq", 2, 4}, // 4 - {"sgt", 2, 4}, // 5 - {"sge", 2, 4}, // 6 - {"sne", 2, 4}, // 7 - {"frc", 1, 4}, // 8 - {"trunc", 1, 4}, // 9 - {"floor", 1, 4}, // 10 - {"mad", 3, 4}, // 11 - {"cndeq", 3, 4}, // 12 - {"cndge", 3, 4}, // 13 - {"cndgt", 3, 4}, // 14 - {"dp4", 2, 4}, // 15 - {"dp3", 2, 4}, // 16 - {"dp2add", 3, 4}, // 17 - {"cube", 2, 4}, // 18 - {"max4", 1, 4}, // 19 - {"setp_eq_push", 2, 4}, // 20 - {"setp_ne_push", 2, 4}, // 21 - {"setp_gt_push", 2, 4}, // 22 - {"setp_ge_push", 2, 4}, // 23 - {"kill_eq", 2, 4}, // 24 - {"kill_gt", 2, 4}, // 25 - {"kill_ge", 2, 4}, // 26 - {"kill_ne", 2, 4}, // 27 - {"dst", 2, 4}, // 28 - {"maxa", 2, 4}, // 29 +struct AluOpcodeInfo { + const char* name; + uint32_t argument_count; + uint32_t src_swizzle_component_count; }; -const ShaderTranslator::AluOpcodeInfo - ShaderTranslator::alu_scalar_opcode_infos_[0x40] = { - {"adds", 1, 2}, // 0 - {"adds_prev", 1, 1}, // 1 - {"muls", 1, 2}, // 2 - {"muls_prev", 1, 1}, // 3 - {"muls_prev2", 1, 2}, // 4 - {"maxs", 1, 2}, // 5 - {"mins", 1, 2}, // 6 - {"seqs", 1, 1}, // 7 - {"sgts", 1, 1}, // 8 - {"sges", 1, 1}, // 9 - {"snes", 1, 1}, // 10 - {"frcs", 1, 1}, // 11 - {"truncs", 1, 1}, // 12 - {"floors", 1, 1}, // 13 - {"exp", 1, 1}, // 14 - {"logc", 1, 1}, // 15 - {"log", 1, 1}, // 16 - {"rcpc", 1, 1}, // 17 - {"rcpf", 1, 1}, // 18 - {"rcp", 1, 1}, // 19 - {"rsqc", 1, 1}, // 20 - {"rsqf", 1, 1}, // 21 - {"rsq", 1, 1}, // 22 - {"maxas", 1, 2}, // 23 - {"maxasf", 1, 2}, // 24 
- {"subs", 1, 2}, // 25 - {"subs_prev", 1, 1}, // 26 - {"setp_eq", 1, 1}, // 27 - {"setp_ne", 1, 1}, // 28 - {"setp_gt", 1, 1}, // 29 - {"setp_ge", 1, 1}, // 30 - {"setp_inv", 1, 1}, // 31 - {"setp_pop", 1, 1}, // 32 - {"setp_clr", 0, 0}, // 33 - {"setp_rstr", 1, 1}, // 34 - {"kills_eq", 1, 1}, // 35 - {"kills_gt", 1, 1}, // 36 - {"kills_ge", 1, 1}, // 37 - {"kills_ne", 1, 1}, // 38 - {"kills_one", 1, 1}, // 39 - {"sqrt", 1, 1}, // 40 - {"UNKNOWN", 0, 0}, // 41 - {"mulsc", 2, 1}, // 42 - {"mulsc", 2, 1}, // 43 - {"addsc", 2, 1}, // 44 - {"addsc", 2, 1}, // 45 - {"subsc", 2, 1}, // 46 - {"subsc", 2, 1}, // 47 - {"sin", 1, 1}, // 48 - {"cos", 1, 1}, // 49 - {"retain_prev", 0, 0}, // 50 +static const AluOpcodeInfo alu_vector_opcode_infos[0x20] = { + {"add", 2, 4}, // 0 + {"mul", 2, 4}, // 1 + {"max", 2, 4}, // 2 + {"min", 2, 4}, // 3 + {"seq", 2, 4}, // 4 + {"sgt", 2, 4}, // 5 + {"sge", 2, 4}, // 6 + {"sne", 2, 4}, // 7 + {"frc", 1, 4}, // 8 + {"trunc", 1, 4}, // 9 + {"floor", 1, 4}, // 10 + {"mad", 3, 4}, // 11 + {"cndeq", 3, 4}, // 12 + {"cndge", 3, 4}, // 13 + {"cndgt", 3, 4}, // 14 + {"dp4", 2, 4}, // 15 + {"dp3", 2, 4}, // 16 + {"dp2add", 3, 4}, // 17 + {"cube", 2, 4}, // 18 + {"max4", 1, 4}, // 19 + {"setp_eq_push", 2, 4}, // 20 + {"setp_ne_push", 2, 4}, // 21 + {"setp_gt_push", 2, 4}, // 22 + {"setp_ge_push", 2, 4}, // 23 + {"kill_eq", 2, 4}, // 24 + {"kill_gt", 2, 4}, // 25 + {"kill_ge", 2, 4}, // 26 + {"kill_ne", 2, 4}, // 27 + {"dst", 2, 4}, // 28 + {"maxa", 2, 4}, // 29 }; -void ShaderTranslator::TranslateAluInstruction(const AluInstruction& op) { - ParsedAluInstruction instr; - ParseAluInstruction(op, instr); - instr.Disassemble(&ucode_disasm_buffer_); - ProcessAluInstruction(instr); -} +static const AluOpcodeInfo alu_scalar_opcode_infos[0x40] = { + {"adds", 1, 2}, // 0 + {"adds_prev", 1, 1}, // 1 + {"muls", 1, 2}, // 2 + {"muls_prev", 1, 1}, // 3 + {"muls_prev2", 1, 2}, // 4 + {"maxs", 1, 2}, // 5 + {"mins", 1, 2}, // 6 + {"seqs", 1, 1}, // 7 + {"sgts", 
1, 1}, // 8 + {"sges", 1, 1}, // 9 + {"snes", 1, 1}, // 10 + {"frcs", 1, 1}, // 11 + {"truncs", 1, 1}, // 12 + {"floors", 1, 1}, // 13 + {"exp", 1, 1}, // 14 + {"logc", 1, 1}, // 15 + {"log", 1, 1}, // 16 + {"rcpc", 1, 1}, // 17 + {"rcpf", 1, 1}, // 18 + {"rcp", 1, 1}, // 19 + {"rsqc", 1, 1}, // 20 + {"rsqf", 1, 1}, // 21 + {"rsq", 1, 1}, // 22 + {"maxas", 1, 2}, // 23 + {"maxasf", 1, 2}, // 24 + {"subs", 1, 2}, // 25 + {"subs_prev", 1, 1}, // 26 + {"setp_eq", 1, 1}, // 27 + {"setp_ne", 1, 1}, // 28 + {"setp_gt", 1, 1}, // 29 + {"setp_ge", 1, 1}, // 30 + {"setp_inv", 1, 1}, // 31 + {"setp_pop", 1, 1}, // 32 + {"setp_clr", 0, 0}, // 33 + {"setp_rstr", 1, 1}, // 34 + {"kills_eq", 1, 1}, // 35 + {"kills_gt", 1, 1}, // 36 + {"kills_ge", 1, 1}, // 37 + {"kills_ne", 1, 1}, // 38 + {"kills_one", 1, 1}, // 39 + {"sqrt", 1, 1}, // 40 + {"UNKNOWN", 0, 0}, // 41 + {"mulsc", 2, 1}, // 42 + {"mulsc", 2, 1}, // 43 + {"addsc", 2, 1}, // 44 + {"addsc", 2, 1}, // 45 + {"subsc", 2, 1}, // 46 + {"subsc", 2, 1}, // 47 + {"sin", 1, 1}, // 48 + {"cos", 1, 1}, // 49 + {"retain_prev", 0, 0}, // 50 +}; -void ShaderTranslator::ParseAluInstruction(const AluInstruction& op, - ParsedAluInstruction& instr) const { - instr.is_predicated = op.is_predicated(); - instr.predicate_condition = op.predicate_condition(); - - bool is_export = op.is_export(); - - InstructionStorageTarget storage_target = InstructionStorageTarget::kRegister; - uint32_t storage_index_export = 0; - if (is_export) { - storage_target = InstructionStorageTarget::kNone; - // Both vector and scalar operation export to vector_dest. 
- ExportRegister export_register = ExportRegister(op.vector_dest()); - if (export_register == ExportRegister::kExportAddress) { - storage_target = InstructionStorageTarget::kExportAddress; - } else if (export_register >= ExportRegister::kExportData0 && - export_register <= ExportRegister::kExportData4) { - storage_target = InstructionStorageTarget::kExportData; - storage_index_export = - uint32_t(export_register) - uint32_t(ExportRegister::kExportData0); - } else if (is_vertex_shader()) { - if (export_register >= ExportRegister::kVSInterpolator0 && - export_register <= ExportRegister::kVSInterpolator15) { - storage_target = InstructionStorageTarget::kInterpolator; - storage_index_export = uint32_t(export_register) - - uint32_t(ExportRegister::kVSInterpolator0); - } else if (export_register == ExportRegister::kVSPosition) { - storage_target = InstructionStorageTarget::kPosition; - } else if (export_register == - ExportRegister::kVSPointSizeEdgeFlagKillVertex) { - storage_target = InstructionStorageTarget::kPointSizeEdgeFlagKillVertex; - } - } else if (is_pixel_shader()) { - if (export_register >= ExportRegister::kPSColor0 && - export_register <= ExportRegister::kPSColor3) { - storage_target = InstructionStorageTarget::kColor; - storage_index_export = - uint32_t(export_register) - uint32_t(ExportRegister::kPSColor0); - } else if (export_register == ExportRegister::kPSDepth) { - storage_target = InstructionStorageTarget::kDepth; - } - } - if (storage_target == InstructionStorageTarget::kNone) { - assert_always(); - XELOGE( - "ShaderTranslator::ParseAluInstruction: Unsupported write to export " - "{}", - uint32_t(export_register)); - } - } - - // Vector operation and constant 0/1 writes. 
- - instr.vector_opcode = op.vector_opcode(); - const auto& vector_opcode_info = - alu_vector_opcode_infos_[uint32_t(instr.vector_opcode)]; - instr.vector_opcode_name = vector_opcode_info.name; - - instr.vector_and_constant_result.storage_target = storage_target; - instr.vector_and_constant_result.storage_addressing_mode = - InstructionStorageAddressingMode::kStatic; - if (is_export) { - instr.vector_and_constant_result.storage_index = storage_index_export; - } else { - instr.vector_and_constant_result.storage_index = op.vector_dest(); - assert_true(op.vector_dest() < register_count()); - if (op.is_vector_dest_relative()) { - instr.vector_and_constant_result.storage_addressing_mode = - InstructionStorageAddressingMode::kAddressRelative; - } - } - instr.vector_and_constant_result.is_clamped = op.vector_clamp(); - uint32_t constant_0_mask = op.GetConstant0WriteMask(); - uint32_t constant_1_mask = op.GetConstant1WriteMask(); - instr.vector_and_constant_result.original_write_mask = - op.GetVectorOpResultWriteMask() | constant_0_mask | constant_1_mask; - for (uint32_t i = 0; i < 4; ++i) { - SwizzleSource component = GetSwizzleFromComponentIndex(i); - if (constant_0_mask & (1 << i)) { - component = SwizzleSource::k0; - } else if (constant_1_mask & (1 << i)) { - component = SwizzleSource::k1; - } - instr.vector_and_constant_result.components[i] = component; - } - - instr.vector_operand_count = vector_opcode_info.argument_count; - for (uint32_t i = 0; i < instr.vector_operand_count; ++i) { - InstructionOperand& vector_operand = instr.vector_operands[i]; - ParseAluInstructionOperand(op, i + 1, - vector_opcode_info.src_swizzle_component_count, - vector_operand); - } - - // Scalar operation. 
- - instr.scalar_opcode = op.scalar_opcode(); - const auto& scalar_opcode_info = - alu_scalar_opcode_infos_[uint32_t(instr.scalar_opcode)]; - instr.scalar_opcode_name = scalar_opcode_info.name; - - instr.scalar_result.storage_target = storage_target; - instr.scalar_result.storage_addressing_mode = - InstructionStorageAddressingMode::kStatic; - if (is_export) { - instr.scalar_result.storage_index = storage_index_export; - } else { - instr.scalar_result.storage_index = op.scalar_dest(); - assert_true(op.scalar_dest() < register_count()); - if (op.is_scalar_dest_relative()) { - instr.scalar_result.storage_addressing_mode = - InstructionStorageAddressingMode::kAddressRelative; - } - } - instr.scalar_result.is_clamped = op.scalar_clamp(); - instr.scalar_result.original_write_mask = op.GetScalarOpResultWriteMask(); - for (uint32_t i = 0; i < 4; ++i) { - instr.scalar_result.components[i] = GetSwizzleFromComponentIndex(i); - } - - instr.scalar_operand_count = scalar_opcode_info.argument_count; - if (instr.scalar_operand_count) { - if (instr.scalar_operand_count == 1) { - ParseAluInstructionOperand(op, 3, - scalar_opcode_info.src_swizzle_component_count, - instr.scalar_operands[0]); - } else { - uint32_t src3_swizzle = op.src_swizzle(3); - uint32_t component_a = ((src3_swizzle >> 6) + 3) & 0x3; - uint32_t component_b = ((src3_swizzle >> 0) + 0) & 0x3; - uint32_t reg2 = (src3_swizzle & 0x3C) | (op.src_is_temp(3) << 1) | - (static_cast<int>(op.scalar_opcode()) & 1); - int const_slot = (op.src_is_temp(1) || op.src_is_temp(2)) ? 
1 : 0; - - ParseAluInstructionOperandSpecial( - op, InstructionStorageSource::kConstantFloat, op.src_reg(3), - op.src_negate(3), 0, component_a, instr.scalar_operands[0]); - - ParseAluInstructionOperandSpecial(op, InstructionStorageSource::kRegister, - reg2, op.src_negate(3), const_slot, - component_b, instr.scalar_operands[1]); - } - } -} - -void ShaderTranslator::ParseAluInstructionOperand( - const AluInstruction& op, uint32_t i, uint32_t swizzle_component_count, - InstructionOperand& out_op) { +static void ParseAluInstructionOperand(const AluInstruction& op, uint32_t i, + uint32_t swizzle_component_count, + InstructionOperand& out_op) { int const_slot = 0; switch (i) { case 2: @@ -1378,7 +1180,7 @@ void ShaderTranslator::ParseAluInstructionOperand( } } -void ShaderTranslator::ParseAluInstructionOperandSpecial( +static void ParseAluInstructionOperandSpecial( const AluInstruction& op, InstructionStorageSource storage_source, uint32_t reg, bool negate, int const_slot, uint32_t component_index, InstructionOperand& out_op) { @@ -1448,6 +1250,150 @@ bool ParsedAluInstruction::IsVectorOpDefaultNop() const { return true; } +void ParseAluInstruction(const AluInstruction& op, + xenos::ShaderType shader_type, + ParsedAluInstruction& instr) { + instr.is_predicated = op.is_predicated(); + instr.predicate_condition = op.predicate_condition(); + + bool is_export = op.is_export(); + + InstructionStorageTarget storage_target = InstructionStorageTarget::kRegister; + uint32_t storage_index_export = 0; + if (is_export) { + storage_target = InstructionStorageTarget::kNone; + // Both vector and scalar operation export to vector_dest. 
+ ExportRegister export_register = ExportRegister(op.vector_dest()); + if (export_register == ExportRegister::kExportAddress) { + storage_target = InstructionStorageTarget::kExportAddress; + } else if (export_register >= ExportRegister::kExportData0 && + export_register <= ExportRegister::kExportData4) { + storage_target = InstructionStorageTarget::kExportData; + storage_index_export = + uint32_t(export_register) - uint32_t(ExportRegister::kExportData0); + } else if (shader_type == xenos::ShaderType::kVertex) { + if (export_register >= ExportRegister::kVSInterpolator0 && + export_register <= ExportRegister::kVSInterpolator15) { + storage_target = InstructionStorageTarget::kInterpolator; + storage_index_export = uint32_t(export_register) - + uint32_t(ExportRegister::kVSInterpolator0); + } else if (export_register == ExportRegister::kVSPosition) { + storage_target = InstructionStorageTarget::kPosition; + } else if (export_register == + ExportRegister::kVSPointSizeEdgeFlagKillVertex) { + storage_target = InstructionStorageTarget::kPointSizeEdgeFlagKillVertex; + } + } else if (shader_type == xenos::ShaderType::kPixel) { + if (export_register >= ExportRegister::kPSColor0 && + export_register <= ExportRegister::kPSColor3) { + storage_target = InstructionStorageTarget::kColor; + storage_index_export = + uint32_t(export_register) - uint32_t(ExportRegister::kPSColor0); + } else if (export_register == ExportRegister::kPSDepth) { + storage_target = InstructionStorageTarget::kDepth; + } + } + if (storage_target == InstructionStorageTarget::kNone) { + assert_always(); + XELOGE( + "ShaderTranslator::ParseAluInstruction: Unsupported write to export " + "{}", + uint32_t(export_register)); + } + } + + // Vector operation and constant 0/1 writes. 
+ + instr.vector_opcode = op.vector_opcode(); + const auto& vector_opcode_info = + alu_vector_opcode_infos[uint32_t(instr.vector_opcode)]; + instr.vector_opcode_name = vector_opcode_info.name; + + instr.vector_and_constant_result.storage_target = storage_target; + instr.vector_and_constant_result.storage_addressing_mode = + InstructionStorageAddressingMode::kStatic; + if (is_export) { + instr.vector_and_constant_result.storage_index = storage_index_export; + } else { + instr.vector_and_constant_result.storage_index = op.vector_dest(); + if (op.is_vector_dest_relative()) { + instr.vector_and_constant_result.storage_addressing_mode = + InstructionStorageAddressingMode::kAddressRelative; + } + } + instr.vector_and_constant_result.is_clamped = op.vector_clamp(); + uint32_t constant_0_mask = op.GetConstant0WriteMask(); + uint32_t constant_1_mask = op.GetConstant1WriteMask(); + instr.vector_and_constant_result.original_write_mask = + op.GetVectorOpResultWriteMask() | constant_0_mask | constant_1_mask; + for (uint32_t i = 0; i < 4; ++i) { + SwizzleSource component = GetSwizzleFromComponentIndex(i); + if (constant_0_mask & (1 << i)) { + component = SwizzleSource::k0; + } else if (constant_1_mask & (1 << i)) { + component = SwizzleSource::k1; + } + instr.vector_and_constant_result.components[i] = component; + } + + instr.vector_operand_count = vector_opcode_info.argument_count; + for (uint32_t i = 0; i < instr.vector_operand_count; ++i) { + InstructionOperand& vector_operand = instr.vector_operands[i]; + ParseAluInstructionOperand(op, i + 1, + vector_opcode_info.src_swizzle_component_count, + vector_operand); + } + + // Scalar operation. 
+ + instr.scalar_opcode = op.scalar_opcode(); + const auto& scalar_opcode_info = + alu_scalar_opcode_infos[uint32_t(instr.scalar_opcode)]; + instr.scalar_opcode_name = scalar_opcode_info.name; + + instr.scalar_result.storage_target = storage_target; + instr.scalar_result.storage_addressing_mode = + InstructionStorageAddressingMode::kStatic; + if (is_export) { + instr.scalar_result.storage_index = storage_index_export; + } else { + instr.scalar_result.storage_index = op.scalar_dest(); + if (op.is_scalar_dest_relative()) { + instr.scalar_result.storage_addressing_mode = + InstructionStorageAddressingMode::kAddressRelative; + } + } + instr.scalar_result.is_clamped = op.scalar_clamp(); + instr.scalar_result.original_write_mask = op.GetScalarOpResultWriteMask(); + for (uint32_t i = 0; i < 4; ++i) { + instr.scalar_result.components[i] = GetSwizzleFromComponentIndex(i); + } + + instr.scalar_operand_count = scalar_opcode_info.argument_count; + if (instr.scalar_operand_count) { + if (instr.scalar_operand_count == 1) { + ParseAluInstructionOperand(op, 3, + scalar_opcode_info.src_swizzle_component_count, + instr.scalar_operands[0]); + } else { + uint32_t src3_swizzle = op.src_swizzle(3); + uint32_t component_a = ((src3_swizzle >> 6) + 3) & 0x3; + uint32_t component_b = ((src3_swizzle >> 0) + 0) & 0x3; + uint32_t reg2 = (src3_swizzle & 0x3C) | (op.src_is_temp(3) << 1) | + (static_cast<int>(op.scalar_opcode()) & 1); + int const_slot = (op.src_is_temp(1) || op.src_is_temp(2)) ? 
1 : 0; + + ParseAluInstructionOperandSpecial( + op, InstructionStorageSource::kConstantFloat, op.src_reg(3), + op.src_negate(3), 0, component_a, instr.scalar_operands[0]); + + ParseAluInstructionOperandSpecial(op, InstructionStorageSource::kRegister, + reg2, op.src_negate(3), const_slot, + component_b, instr.scalar_operands[1]); + } + } +} + bool ParsedAluInstruction::IsScalarOpDefaultNop() const { if (scalar_opcode != ucode::AluScalarOpcode::kRetainPrev || scalar_result.original_write_mask || scalar_result.is_clamped) { diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h index e1c97808a..d5d3677d5 100644 --- a/src/xenia/gpu/shader_translator.h +++ b/src/xenia/gpu/shader_translator.h @@ -29,106 +29,43 @@ class ShaderTranslator { public: virtual ~ShaderTranslator(); - virtual uint32_t GetDefaultModification( + virtual uint64_t GetDefaultModification( xenos::ShaderType shader_type, + uint32_t dynamic_addressable_register_count, Shader::HostVertexShaderType host_vertex_shader_type = Shader::HostVertexShaderType::kVertex) const { return 0; } - bool Translate(Shader::Translation& translation, reg::SQ_PROGRAM_CNTL cntl); - bool Translate(Shader::Translation& translation); + // AnalyzeUcode must be done on the shader before translating! + bool TranslateAnalyzedShader(Shader::Translation& translation); protected: ShaderTranslator(); // Resets translator state before beginning translation. - // shader_type is passed here so translator implementations can generate - // special fixed shaders for internal use, and set up the type for this - // purpose. - virtual void Reset(xenos::ShaderType shader_type); + virtual void Reset(); - // Current host-side modification being generated. - uint32_t modification() const { return modification_; } + // Shader and modification currently being translated. 
+ Shader::Translation& current_translation() const { return *translation_; } + Shader& current_shader() const { return current_translation().shader(); } + + // Register count from SQ_PROGRAM_CNTL, stored by the implementation in its + // modification bits. + virtual uint32_t GetModificationRegisterCount() const { return 64; } - // Register count. - uint32_t register_count() const { return register_count_; } // True if the current shader is a vertex shader. bool is_vertex_shader() const { - return shader_type_ == xenos::ShaderType::kVertex; + return current_shader().type() == xenos::ShaderType::kVertex; } // True if the current shader is a pixel shader. bool is_pixel_shader() const { - return shader_type_ == xenos::ShaderType::kPixel; - } - // Labels that jumps (explicit or from loops) can be done to, gathered before - // translation. - const std::set<uint32_t>& label_addresses() const { return label_addresses_; } - // Used constant register info, populated before translation. - const Shader::ConstantRegisterMap& constant_register_map() const { - return constant_register_map_; - } - // True if the current shader addresses general-purpose registers with dynamic - // indices, set before translation. Doesn't include writes to r[#+a#] with an - // empty used write mask. - bool uses_register_dynamic_addressing() const { - return uses_register_dynamic_addressing_; - } - // True if the current shader writes to a color target on any execution path, - // set before translation. Doesn't include writes with an empty used write - // mask. - bool writes_color_target(int i) const { return writes_color_targets_[i]; } - bool writes_any_color_target() const { - for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) { - if (writes_color_targets_[i]) { - return true; - } - } - return false; - } - // True if the current shader overrides the pixel depth, set before - // translation. Doesn't include writes with an empty used write mask. 
- bool writes_depth() const { return writes_depth_; } - // True if the current shader has any `kill` instructions. - bool kills_pixels() const { return kills_pixels_; } - // A list of all vertex bindings, populated before translation occurs. - const std::vector<Shader::VertexBinding>& vertex_bindings() const { - return vertex_bindings_; - } - // A list of all texture bindings, populated before translation occurs. - const std::vector<Shader::TextureBinding>& texture_bindings() const { - return texture_bindings_; + return current_shader().type() == xenos::ShaderType::kPixel; } - // Based on the number of AS_VS/PS_EXPORT_STREAM_* enum sets found in a game - // .pdb. - static constexpr uint32_t kMaxMemExports = 16; - // Bits indicating which eM# registers have been written to after each - // `alloc export`, for up to kMaxMemExports exports. This will contain zero - // for certain corrupt exports - that don't write to eA before writing to eM#, - // or if the write was done any way other than MAD with a stream constant. - const uint8_t* memexport_eM_written() const { return memexport_eM_written_; } - // All c# registers used as the addend in MAD operations to eA, populated - // before translation occurs. - const std::set<uint32_t>& memexport_stream_constants() const { - return memexport_stream_constants_; - } + // Temporary register count, accessible via static and dynamic addressing. + uint32_t register_count() const { return register_count_; } - // Whether the shader can have early depth and stencil writing enabled, unless - // alpha test or alpha to coverage is enabled. Data gathered before - // translation. - bool CanWriteZEarly() const { - // TODO(Triang3l): Investigate what happens to memexport when the pixel - // fails the depth/stencil test, but in Direct3D 11 UAV writes disable early - // depth/stencil. - return !writes_depth_ && !kills_pixels_ && - memexport_stream_constants_.empty(); - } - - // Current line number in the ucode disassembly. 
- size_t ucode_disasm_line_number() const { return ucode_disasm_line_number_; } - // Ucode disassembly buffer accumulated during translation. - StringBuffer& ucode_disasm_buffer() { return ucode_disasm_buffer_; } // Emits a translation error that will be passed back in the result. virtual void EmitTranslationError(const char* message, bool is_fatal = true); @@ -143,10 +80,7 @@ class ShaderTranslator { } // Handles post-translation tasks when the shader has been fully translated. - // setup_shader_post_translation_info if non-modification-specific parameters - // of the Shader object behind the Translation can be set by this invocation. - virtual void PostTranslation(Shader::Translation& translation, - bool setup_shader_post_translation_info) {} + virtual void PostTranslation() {} // Sets the host disassembly on a shader. void set_host_disassembly(Shader::Translation& translation, std::string value) { @@ -201,130 +135,23 @@ class ShaderTranslator { virtual void ProcessAluInstruction(const ParsedAluInstruction& instr) {} private: - struct AluOpcodeInfo { - const char* name; - uint32_t argument_count; - uint32_t src_swizzle_component_count; - }; - - bool TranslateInternal(Shader::Translation& translation); - - void MarkUcodeInstruction(uint32_t dword_offset); - void AppendUcodeDisasm(char c); - void AppendUcodeDisasm(const char* value); - void AppendUcodeDisasmFormat(const char* format, ...); - - void GatherInstructionInformation(const ucode::ControlFlowInstruction& cf); - void GatherVertexFetchInformation(const ucode::VertexFetchInstruction& op); - void GatherTextureFetchInformation(const ucode::TextureFetchInstruction& op); void TranslateControlFlowInstruction(const ucode::ControlFlowInstruction& cf); - void TranslateControlFlowNop(const ucode::ControlFlowInstruction& cf); - void TranslateControlFlowExec(const ucode::ControlFlowExecInstruction& cf); - void TranslateControlFlowCondExec( - const ucode::ControlFlowCondExecInstruction& cf); - void 
TranslateControlFlowCondExecPred( - const ucode::ControlFlowCondExecPredInstruction& cf); - void TranslateControlFlowLoopStart( - const ucode::ControlFlowLoopStartInstruction& cf); - void TranslateControlFlowLoopEnd( - const ucode::ControlFlowLoopEndInstruction& cf); - void TranslateControlFlowCondCall( - const ucode::ControlFlowCondCallInstruction& cf); - void TranslateControlFlowReturn( - const ucode::ControlFlowReturnInstruction& cf); - void TranslateControlFlowCondJmp( - const ucode::ControlFlowCondJmpInstruction& cf); - void TranslateControlFlowAlloc(const ucode::ControlFlowAllocInstruction& cf); - void TranslateExecInstructions(const ParsedExecInstruction& instr); - void TranslateVertexFetchInstruction(const ucode::VertexFetchInstruction& op); - void ParseVertexFetchInstruction(const ucode::VertexFetchInstruction& op, - ParsedVertexFetchInstruction* out_instr); - - void TranslateTextureFetchInstruction( - const ucode::TextureFetchInstruction& op); - void ParseTextureFetchInstruction(const ucode::TextureFetchInstruction& op, - ParsedTextureFetchInstruction* out_instr); - - void TranslateAluInstruction(const ucode::AluInstruction& op); - void ParseAluInstruction(const ucode::AluInstruction& op, - ParsedAluInstruction& out_instr) const; - static void ParseAluInstructionOperand(const ucode::AluInstruction& op, - uint32_t i, - uint32_t swizzle_component_count, - InstructionOperand& out_op); - static void ParseAluInstructionOperandSpecial( - const ucode::AluInstruction& op, InstructionStorageSource storage_source, - uint32_t reg, bool negate, int const_slot, uint32_t component_index, - InstructionOperand& out_op); - - // Input shader metadata and microcode. - xenos::ShaderType shader_type_; - const uint32_t* ucode_dwords_; - size_t ucode_dword_count_; - uint32_t register_count_; - - // Current host-side modification being generated. - uint32_t modification_ = 0; + // Current shader and modification being translated. 
+ Shader::Translation* translation_ = nullptr; // Accumulated translation errors. std::vector<Shader::Error> errors_; + // Temporary register count, accessible via static and dynamic addressing. + uint32_t register_count_ = 0; + // Current control flow dword index. uint32_t cf_index_ = 0; - // Microcode disassembly buffer, accumulated throughout the translation. - StringBuffer ucode_disasm_buffer_; - // Current line number in the disasm, which can be used for source annotation. - size_t ucode_disasm_line_number_ = 0; - // Last offset used when scanning for line numbers. - size_t previous_ucode_disasm_scan_offset_ = 0; - // Kept for supporting vfetch_mini. ucode::VertexFetchInstruction previous_vfetch_full_; - - // Labels that jumps (explicit or from loops) can be done to, gathered before - // translation. - std::set<uint32_t> label_addresses_; - - // Detected binding information gathered before translation. Must not be - // affected by the modification index. - int total_attrib_count_ = 0; - std::vector<Shader::VertexBinding> vertex_bindings_; - std::vector<Shader::TextureBinding> texture_bindings_; - uint32_t unique_vertex_bindings_ = 0; - uint32_t unique_texture_bindings_ = 0; - - // These all are gathered before translation. - // uses_register_dynamic_addressing_ for writes, writes_color_targets_, - // writes_depth_ don't include empty used write masks. - // Must not be affected by the modification index. - Shader::ConstantRegisterMap constant_register_map_ = {0}; - bool uses_register_dynamic_addressing_ = false; - bool writes_color_targets_[4] = {false, false, false, false}; - bool writes_depth_ = false; - bool kills_pixels_ = false; - - // Memexport info is gathered before translation. - // Must not be affected by the modification index. - uint32_t memexport_alloc_count_ = 0; - // For register allocation in implementations - what was used after each - // `alloc export`. 
- uint32_t memexport_eA_written_ = 0; - uint8_t memexport_eM_written_[kMaxMemExports] = {0}; - std::set<uint32_t> memexport_stream_constants_; - - static const AluOpcodeInfo alu_vector_opcode_infos_[0x20]; - static const AluOpcodeInfo alu_scalar_opcode_infos_[0x40]; -}; - -class UcodeShaderTranslator : public ShaderTranslator { - public: - UcodeShaderTranslator() = default; - - protected: - std::vector<uint8_t> CompleteTranslation() override; }; } // namespace gpu diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 0ff228d53..1063e8e0c 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -203,7 +203,9 @@ void SpirvShaderTranslator::StartTranslation() { push_consts_ = b.createVariable(spv::StorageClass::StorageClassPushConstant, push_constants_type, "push_consts"); - if (!texture_bindings().empty()) { + const std::vector<Shader::TextureBinding>& texture_bindings = + current_shader().texture_bindings(); + if (!texture_bindings.empty()) { image_2d_type_ = b.makeImageType(float_type_, spv::Dim::Dim2D, false, false, false, 1, spv::ImageFormat::ImageFormatUnknown); @@ -220,7 +222,7 @@ void SpirvShaderTranslator::StartTranslation() { b.makeSampledImageType(image_cube_type_)}; uint32_t num_tex_bindings = 0; - for (const auto& binding : texture_bindings()) { + for (const auto& binding : texture_bindings) { // Calculate the highest binding index. 
num_tex_bindings = std::max(num_tex_bindings, uint32_t(binding.binding_index + 1)); @@ -241,7 +243,7 @@ void SpirvShaderTranslator::StartTranslation() { } // Set up the map from binding -> ssbo index - for (const auto& binding : texture_bindings()) { + for (const auto& binding : texture_bindings) { tex_binding_map_[binding.fetch_constant] = uint32_t(binding.binding_index); } @@ -254,7 +256,9 @@ void SpirvShaderTranslator::StartTranslation() { // Vertex inputs/outputs // Inputs: 32 SSBOs on DS 2 binding 0 - if (!vertex_bindings().empty()) { + const std::vector<Shader::VertexBinding>& vertex_bindings = + current_shader().vertex_bindings(); + if (!vertex_bindings.empty()) { // Runtime array for vertex data Id vtx_t = b.makeRuntimeArray(uint_type_); b.addDecoration(vtx_t, spv::Decoration::DecorationArrayStride, @@ -269,7 +273,7 @@ void SpirvShaderTranslator::StartTranslation() { // Create the vertex bindings variable. Id vtx_a_t = b.makeArrayType( - vtx_s, b.makeUintConstant(uint32_t(vertex_bindings().size())), 0); + vtx_s, b.makeUintConstant(uint32_t(vertex_bindings.size())), 0); vtx_ = b.createVariable(spv::StorageClass::StorageClassUniform, vtx_a_t, "vertex_bindings"); @@ -279,7 +283,7 @@ void SpirvShaderTranslator::StartTranslation() { b.addDecoration(vtx_, spv::Decoration::DecorationNonWritable); // Set up the map from binding -> ssbo index - for (const auto& binding : vertex_bindings()) { + for (const auto& binding : vertex_bindings) { vtx_binding_map_[binding.fetch_constant] = binding.binding_index; } } @@ -494,7 +498,7 @@ std::vector<uint8_t> SpirvShaderTranslator::CompleteTranslation() { b.addExecutionMode(mainFn, spv::ExecutionModeOriginUpperLeft); // If we write a new depth value, we must declare this mode! 
- if (writes_depth()) { + if (current_shader().writes_depth()) { b.addExecutionMode(mainFn, spv::ExecutionModeDepthReplacing); } @@ -667,8 +671,12 @@ std::vector<uint8_t> SpirvShaderTranslator::CompleteTranslation() { return spirv_bytes; } -void SpirvShaderTranslator::PostTranslation( - Shader::Translation& translation, bool setup_shader_post_translation_info) { +void SpirvShaderTranslator::PostTranslation() { + Shader::Translation& translation = current_translation(); + if (!translation.is_valid()) { + return; + } + // Validation. if (cvars::spv_validate) { auto validation = validator_.Validate( diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 478aa3428..05e147895 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -58,11 +58,23 @@ class SpirvShaderTranslator : public ShaderTranslator { SpirvShaderTranslator(); ~SpirvShaderTranslator() override; + // Not storing anything else in modifications (as this shader translator is + // being replaced anyway). 
+ uint64_t GetDefaultModification( + xenos::ShaderType shader_type, + uint32_t dynamic_addressable_register_count, + Shader::HostVertexShaderType host_vertex_shader_type = + Shader::HostVertexShaderType::kVertex) const override { + return dynamic_addressable_register_count; + } + protected: + virtual uint32_t GetModificationRegisterCount() const { + return uint32_t(current_translation().modification()); + } void StartTranslation() override; std::vector<uint8_t> CompleteTranslation() override; - void PostTranslation(Shader::Translation& translation, - bool setup_shader_post_translation_info) override; + void PostTranslation() override; void PreProcessControlFlowInstructions( std::vector<ucode::ControlFlowInstruction> instrs) override; diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index ea11f10cd..4570f9515 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -431,15 +431,14 @@ XEPACKEDUNION(ControlFlowInstruction, { static_assert_size(ControlFlowInstruction, 8); inline void UnpackControlFlowInstructions(const uint32_t* dwords, - ControlFlowInstruction* out_a, - ControlFlowInstruction* out_b) { + ControlFlowInstruction* out_ab) { uint32_t dword_0 = dwords[0]; uint32_t dword_1 = dwords[1]; uint32_t dword_2 = dwords[2]; - out_a->dword_0 = dword_0; - out_a->dword_1 = dword_1 & 0xFFFF; - out_b->dword_0 = (dword_1 >> 16) | (dword_2 << 16); - out_b->dword_1 = dword_2 >> 16; + out_ab[0].dword_0 = dword_0; + out_ab[0].dword_1 = dword_1 & 0xFFFF; + out_ab[1].dword_0 = (dword_1 >> 16) | (dword_2 << 16); + out_ab[1].dword_1 = dword_2 >> 16; } enum class FetchOpcode : uint32_t { diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc index 52bb607f4..1fbe5681c 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/pipeline_cache.cc @@ -364,10 +364,11 @@ VkPipeline PipelineCache::GetPipeline(const RenderState* render_state, } bool PipelineCache::TranslateShader( - 
VulkanShader::VulkanTranslation& translation, reg::SQ_PROGRAM_CNTL cntl) { + VulkanShader::VulkanTranslation& translation) { + translation.shader().AnalyzeUcode(ucode_disasm_buffer_); // Perform translation. // If this fails the shader will be marked as invalid and ignored later. - if (!shader_translator_->Translate(translation, cntl)) { + if (!shader_translator_->TranslateAnalyzedShader(translation)) { XELOGE("Shader translation failed; marking shader as ignored"); return false; } @@ -1071,9 +1072,11 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( static_cast<VulkanShader::VulkanTranslation*>( vertex_shader->GetOrCreateTranslation( shader_translator_->GetDefaultModification( - xenos::ShaderType::kVertex))); + xenos::ShaderType::kVertex, + vertex_shader->GetDynamicAddressableRegisterCount( + regs.sq_program_cntl.vs_num_reg)))); if (!vertex_shader_translation->is_translated() && - !TranslateShader(*vertex_shader_translation, regs.sq_program_cntl)) { + !TranslateShader(*vertex_shader_translation)) { XELOGE("Failed to translate the vertex shader!"); return UpdateStatus::kError; } @@ -1083,9 +1086,11 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( pixel_shader_translation = static_cast<VulkanShader::VulkanTranslation*>( pixel_shader->GetOrCreateTranslation( shader_translator_->GetDefaultModification( - xenos::ShaderType::kPixel))); + xenos::ShaderType::kPixel, + pixel_shader->GetDynamicAddressableRegisterCount( + regs.sq_program_cntl.ps_num_reg)))); if (!pixel_shader_translation->is_translated() && - !TranslateShader(*pixel_shader_translation, regs.sq_program_cntl)) { + !TranslateShader(*pixel_shader_translation)) { XELOGE("Failed to translate the pixel shader!"); return UpdateStatus::kError; } diff --git a/src/xenia/gpu/vulkan/pipeline_cache.h b/src/xenia/gpu/vulkan/pipeline_cache.h index d6a88fdcf..64d319165 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.h +++ b/src/xenia/gpu/vulkan/pipeline_cache.h @@ -12,6 +12,7 @@ #include 
<unordered_map> +#include "xenia/base/string_buffer.h" #include "xenia/base/xxhash.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/spirv_shader_translator.h" @@ -78,8 +79,7 @@ class PipelineCache { // state. VkPipeline GetPipeline(const RenderState* render_state, uint64_t hash_key); - bool TranslateShader(VulkanShader::VulkanTranslation& translation, - reg::SQ_PROGRAM_CNTL cntl); + bool TranslateShader(VulkanShader::VulkanTranslation& translation); void DumpShaderDisasmAMD(VkPipeline pipeline); void DumpShaderDisasmNV(const VkGraphicsPipelineCreateInfo& info); @@ -92,6 +92,8 @@ class PipelineCache { RegisterFile* register_file_ = nullptr; ui::vulkan::VulkanDevice* device_ = nullptr; + // Temporary storage for AnalyzeUcode calls. + StringBuffer ucode_disasm_buffer_; // Reusable shader translator. std::unique_ptr<ShaderTranslator> shader_translator_ = nullptr; // Disassembler used to get the SPIRV disasm. Only used in debug. diff --git a/src/xenia/gpu/vulkan/vulkan_shader.cc b/src/xenia/gpu/vulkan/vulkan_shader.cc index 2eb41e9e5..99333f062 100644 --- a/src/xenia/gpu/vulkan/vulkan_shader.cc +++ b/src/xenia/gpu/vulkan/vulkan_shader.cc @@ -73,7 +73,7 @@ bool VulkanShader::VulkanTranslation::Prepare() { } Shader::Translation* VulkanShader::CreateTranslationInstance( - uint32_t modification) { + uint64_t modification) { return new VulkanTranslation(*this, modification); } diff --git a/src/xenia/gpu/vulkan/vulkan_shader.h b/src/xenia/gpu/vulkan/vulkan_shader.h index 7d948ac71..76a196bff 100644 --- a/src/xenia/gpu/vulkan/vulkan_shader.h +++ b/src/xenia/gpu/vulkan/vulkan_shader.h @@ -23,7 +23,7 @@ class VulkanShader : public Shader { public: class VulkanTranslation : public Translation { public: - VulkanTranslation(VulkanShader& shader, uint32_t modification) + VulkanTranslation(VulkanShader& shader, uint64_t modification) : Translation(shader, modification) {} ~VulkanTranslation() override; @@ -41,7 +41,7 @@ class VulkanShader : public Shader { uint32_t 
dword_count); protected: - Translation* CreateTranslationInstance(uint32_t modification) override; + Translation* CreateTranslationInstance(uint64_t modification) override; private: ui::vulkan::VulkanDevice* device_ = nullptr; diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 1c21ed8ff..f8e178f15 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -546,33 +546,6 @@ inline int GetVertexFormatComponentCount(VertexFormat format) { } } -inline int GetVertexFormatSizeInWords(VertexFormat format) { - switch (format) { - case VertexFormat::k_8_8_8_8: - case VertexFormat::k_2_10_10_10: - case VertexFormat::k_10_11_11: - case VertexFormat::k_11_11_10: - case VertexFormat::k_16_16: - case VertexFormat::k_16_16_FLOAT: - case VertexFormat::k_32: - case VertexFormat::k_32_FLOAT: - return 1; - case VertexFormat::k_16_16_16_16: - case VertexFormat::k_16_16_16_16_FLOAT: - case VertexFormat::k_32_32: - case VertexFormat::k_32_32_FLOAT: - return 2; - case VertexFormat::k_32_32_32_FLOAT: - return 3; - case VertexFormat::k_32_32_32_32: - case VertexFormat::k_32_32_32_32_FLOAT: - return 4; - default: - assert_unhandled_case(format); - return 1; - } -} - inline uint32_t GetVertexFormatNeededWords(VertexFormat format, uint32_t used_components) { assert_zero(used_components & ~uint32_t(0b1111));