diff --git a/src/xenia/app/xenia_main.cc b/src/xenia/app/xenia_main.cc index d6099a047..342d5cc2b 100644 --- a/src/xenia/app/xenia_main.cc +++ b/src/xenia/app/xenia_main.cc @@ -65,6 +65,14 @@ DEFINE_path( "Root path for guest content storage (saves, etc.), or empty to use the " "content folder under the storage root.", "Storage"); +DEFINE_path( + cache_root, "", + "Root path for files used to speed up certain parts of the emulator or the " + "game. These files may be persistent, but they can be deleted without " + "major side effects such as progress loss. If empty, the cache folder " + "under the storage root, or, if available, the cache directory preferred " + "for the OS, will be used.", + "Storage"); DEFINE_bool(mount_scratch, false, "Enable scratch mount", "Storage"); DEFINE_bool(mount_cache, false, "Enable cache mount", "Storage"); @@ -221,6 +229,8 @@ int xenia_main(const std::vector& args) { #if defined(XE_PLATFORM_WIN32) || defined(XE_PLATFORM_GNU_LINUX) storage_root = storage_root / "Xenia"; #else + // TODO(Triang3l): Point to the app's external storage "files" directory + // on Android. #warning Unhandled platform for the data root. storage_root = storage_root / "Xenia"; #endif @@ -244,13 +254,29 @@ int xenia_main(const std::vector& args) { content_root = std::filesystem::absolute(content_root); XELOGI("Content root: {}", xe::path_to_utf8(content_root)); + std::filesystem::path cache_root = cvars::cache_root; + if (cache_root.empty()) { + cache_root = storage_root / "cache"; + // TODO(Triang3l): Point to the app's external storage "cache" directory on + // Android. + } else { + // If cache root isn't an absolute path, then it should be relative to the + // storage root. 
+ if (!cache_root.is_absolute()) { + cache_root = storage_root / cache_root; + } + } + cache_root = std::filesystem::absolute(cache_root); + XELOGI("Cache root: {}", xe::path_to_utf8(cache_root)); + if (cvars::discord) { discord::DiscordPresence::Initialize(); discord::DiscordPresence::NotPlaying(); } // Create the emulator but don't initialize so we can setup the window. - auto emulator = std::make_unique("", storage_root, content_root); + auto emulator = + std::make_unique("", storage_root, content_root, cache_root); // Main emulator display window. auto emulator_window = EmulatorWindow::Create(emulator.get()); diff --git a/src/xenia/emulator.cc b/src/xenia/emulator.cc index 44f284d34..4e6b10783 100644 --- a/src/xenia/emulator.cc +++ b/src/xenia/emulator.cc @@ -59,13 +59,15 @@ namespace xe { Emulator::Emulator(const std::filesystem::path& command_line, const std::filesystem::path& storage_root, - const std::filesystem::path& content_root) + const std::filesystem::path& content_root, + const std::filesystem::path& cache_root) : on_launch(), on_terminate(), on_exit(), command_line_(command_line), storage_root_(storage_root), content_root_(content_root), + cache_root_(cache_root), game_title_(), display_window_(nullptr), memory_(), @@ -689,7 +691,7 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path, // playing before the video can be seen if doing this in parallel with the // main thread. 
on_shader_storage_initialization(true); - graphics_system_->InitializeShaderStorage(storage_root_, title_id_, true); + graphics_system_->InitializeShaderStorage(cache_root_, title_id_, true); on_shader_storage_initialization(false); auto main_thread = kernel_state_->LaunchModule(module); diff --git a/src/xenia/emulator.h b/src/xenia/emulator.h index df5426227..739c12b51 100644 --- a/src/xenia/emulator.h +++ b/src/xenia/emulator.h @@ -49,7 +49,8 @@ class Emulator { public: explicit Emulator(const std::filesystem::path& command_line, const std::filesystem::path& storage_root, - const std::filesystem::path& content_root); + const std::filesystem::path& content_root, + const std::filesystem::path& cache_root); ~Emulator(); // Full command line used when launching the process. @@ -61,6 +62,9 @@ class Emulator { // Folder guest content is stored in. const std::filesystem::path& content_root() const { return content_root_; } + // Folder files safe to remove without significant side effects are stored in. + const std::filesystem::path& cache_root() const { return cache_root_; } + // Title of the game in the default language. 
const std::string& game_title() const { return game_title_; } @@ -166,6 +170,7 @@ class Emulator { std::filesystem::path command_line_; std::filesystem::path storage_root_; std::filesystem::path content_root_; + std::filesystem::path cache_root_; std::string game_title_; diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index 651952da6..044773161 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -89,8 +89,8 @@ void CommandProcessor::Shutdown() { } void CommandProcessor::InitializeShaderStorage( - const std::filesystem::path& storage_root, uint32_t title_id, - bool blocking) {} + const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) { +} void CommandProcessor::RequestFrameTrace( const std::filesystem::path& root_path) { diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h index b94562d79..5002f0137 100644 --- a/src/xenia/gpu/command_processor.h +++ b/src/xenia/gpu/command_processor.h @@ -133,9 +133,8 @@ class CommandProcessor { // May be called not only from the command processor thread when the command // processor is paused, and the termination of this function may be explicitly // awaited. 
- virtual void InitializeShaderStorage( - const std::filesystem::path& storage_root, uint32_t title_id, - bool blocking); + virtual void InitializeShaderStorage(const std::filesystem::path& cache_root, + uint32_t title_id, bool blocking); virtual void RequestFrameTrace(const std::filesystem::path& root_path); virtual void BeginTracing(const std::filesystem::path& root_path); diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 5b216b22b..ef38ff5b1 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -73,10 +73,9 @@ void D3D12CommandProcessor::ClearCaches() { } void D3D12CommandProcessor::InitializeShaderStorage( - const std::filesystem::path& storage_root, uint32_t title_id, - bool blocking) { - CommandProcessor::InitializeShaderStorage(storage_root, title_id, blocking); - pipeline_cache_->InitializeShaderStorage(storage_root, title_id, blocking); + const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) { + CommandProcessor::InitializeShaderStorage(cache_root, title_id, blocking); + pipeline_cache_->InitializeShaderStorage(cache_root, title_id, blocking); } void D3D12CommandProcessor::RequestFrameTrace( @@ -102,7 +101,7 @@ void D3D12CommandProcessor::RestoreEdramSnapshot(const void* snapshot) { } uint32_t D3D12CommandProcessor::GetCurrentColorMask( - const D3D12Shader* pixel_shader) const { + const Shader* pixel_shader) const { if (pixel_shader == nullptr) { return 0; } @@ -159,25 +158,16 @@ void D3D12CommandProcessor::SubmitBarriers() { } ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature( - const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader) { - assert_true(vertex_shader->is_translated()); - + const DxbcShader* vertex_shader, const DxbcShader* pixel_shader, + bool tessellated) { if (bindless_resources_used_) { - return vertex_shader->host_vertex_shader_type() != - 
Shader::HostVertexShaderType::kVertex - ? root_signature_bindless_ds_ - : root_signature_bindless_vs_; + return tessellated ? root_signature_bindless_ds_ + : root_signature_bindless_vs_; } - assert_true(pixel_shader == nullptr || pixel_shader->is_translated()); - - D3D12_SHADER_VISIBILITY vertex_visibility; - if (vertex_shader->host_vertex_shader_type() != - Shader::HostVertexShaderType::kVertex) { - vertex_visibility = D3D12_SHADER_VISIBILITY_DOMAIN; - } else { - vertex_visibility = D3D12_SHADER_VISIBILITY_VERTEX; - } + D3D12_SHADER_VISIBILITY vertex_visibility = + tessellated ? D3D12_SHADER_VISIBILITY_DOMAIN + : D3D12_SHADER_VISIBILITY_VERTEX; uint32_t texture_count_vertex, sampler_count_vertex; vertex_shader->GetTextureBindings(texture_count_vertex); @@ -393,7 +383,7 @@ ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature( } uint32_t D3D12CommandProcessor::GetRootBindfulExtraParameterIndices( - const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader, + const DxbcShader* vertex_shader, const DxbcShader* pixel_shader, RootBindfulExtraParameterIndices& indices_out) { uint32_t texture_count_pixel = 0, sampler_count_pixel = 0; if (pixel_shader != nullptr) { @@ -1202,6 +1192,7 @@ bool D3D12CommandProcessor::SetupContext() { pipeline_cache_ = std::make_unique( *this, *register_file_, bindless_resources_used_, edram_rov_used_, + render_target_cache_->depth_float24_conversion(), texture_cache_->IsResolutionScale2X() ? 
2 : 1); if (!pipeline_cache_->Initialize()) { XELOGE("Failed to initialize the graphics pipeline cache"); @@ -1804,8 +1795,7 @@ Shader* D3D12CommandProcessor::LoadShader(xenos::ShaderType shader_type, uint32_t guest_address, const uint32_t* host_address, uint32_t dword_count) { - return pipeline_cache_->LoadShader(shader_type, guest_address, host_address, - dword_count); + return pipeline_cache_->LoadShader(shader_type, host_address, dword_count); } bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, @@ -1851,21 +1841,30 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // Need a pixel shader in normal color mode. return false; } - // Get tessellation info for the current draw for vertex shader translation. - Shader::HostVertexShaderType host_vertex_shader_type = - pipeline_cache_->GetHostVertexShaderTypeIfValid(); - if (host_vertex_shader_type == Shader::HostVertexShaderType(-1)) { + DxbcShaderTranslator::Modification vertex_shader_modification; + DxbcShaderTranslator::Modification pixel_shader_modification; + if (!pipeline_cache_->GetCurrentShaderModifications( + vertex_shader_modification, pixel_shader_modification)) { return false; } + D3D12Shader::D3D12Translation* vertex_shader_translation = + static_cast( + vertex_shader->GetOrCreateTranslation( + vertex_shader_modification.value)); + D3D12Shader::D3D12Translation* pixel_shader_translation = + pixel_shader ? static_cast( + pixel_shader->GetOrCreateTranslation( + pixel_shader_modification.value)) + : nullptr; // Translate the shaders now to get memexport configuration and color mask, - // which is needed by the render target cache, to check the possibility of - // doing early depth/stencil, and also to get used textures and samplers. - if (!pipeline_cache_->EnsureShadersTranslated(vertex_shader, pixel_shader, - host_vertex_shader_type)) { + // which is needed by the render target cache, and also to get used textures + // and samplers. 
+ if (!pipeline_cache_->EnsureShadersTranslated(vertex_shader_translation, + pixel_shader_translation)) { return false; } - bool tessellated = - host_vertex_shader_type != Shader::HostVertexShaderType::kVertex; + bool tessellated = vertex_shader_modification.host_vertex_shader_type != + Shader::HostVertexShaderType::kVertex; // Check if memexport is used. If it is, we can't skip draw calls that have no // visual effect. @@ -1967,26 +1966,14 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, (pixel_shader != nullptr ? pixel_shader->GetUsedTextureMask() : 0); texture_cache_->RequestTextures(used_texture_mask); - // Check if early depth/stencil can be enabled. - bool early_z; - if (pixel_shader) { - auto rb_colorcontrol = regs.Get(); - early_z = pixel_shader->implicit_early_z_allowed() && - (!rb_colorcontrol.alpha_test_enable || - rb_colorcontrol.alpha_func == xenos::CompareFunction::kAlways) && - !rb_colorcontrol.alpha_to_mask_enable; - } else { - early_z = true; - } - // Create the pipeline if needed and bind it. void* pipeline_handle; ID3D12RootSignature* root_signature; if (!pipeline_cache_->ConfigurePipeline( - vertex_shader, pixel_shader, primitive_type_converted, + vertex_shader_translation, pixel_shader_translation, + primitive_type_converted, indexed ? 
index_buffer_info->format : xenos::IndexFormat::kInt16, - early_z, pipeline_render_targets, &pipeline_handle, - &root_signature)) { + pipeline_render_targets, &pipeline_handle, &root_signature)) { return false; } if (current_cached_pipeline_ != pipeline_handle) { @@ -2014,11 +2001,18 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, pixel_size_x *= 2; pixel_size_y *= 2; } + flags::DepthFloat24Conversion depth_float24_conversion = + render_target_cache_->depth_float24_conversion(); draw_util::ViewportInfo viewport_info; - draw_util::GetHostViewportInfo(regs, float(pixel_size_x), float(pixel_size_y), - true, float(D3D12_VIEWPORT_BOUNDS_MAX), - float(D3D12_VIEWPORT_BOUNDS_MAX), false, - viewport_info); + draw_util::GetHostViewportInfo( + regs, float(pixel_size_x), float(pixel_size_y), true, + float(D3D12_VIEWPORT_BOUNDS_MAX), float(D3D12_VIEWPORT_BOUNDS_MAX), false, + !edram_rov_used_ && + (depth_float24_conversion == + flags::DepthFloat24Conversion::kOnOutputTruncating || + depth_float24_conversion == + flags::DepthFloat24Conversion::kOnOutputRounding), + viewport_info); draw_util::Scissor scissor; draw_util::GetScissor(regs, scissor); scissor.left *= pixel_size_x; @@ -2033,7 +2027,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, UpdateSystemConstantValues( memexport_used, primitive_polygonal, line_loop_closing_index, indexed ? index_buffer_info->endianness : xenos::Endian::kNone, - viewport_info, pixel_size_x, pixel_size_y, used_texture_mask, early_z, + viewport_info, pixel_size_x, pixel_size_y, used_texture_mask, GetCurrentColorMask(pixel_shader), pipeline_render_targets); // Update constant buffers, descriptors and root parameters. 
@@ -2873,8 +2867,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( bool shared_memory_is_uav, bool primitive_polygonal, uint32_t line_loop_closing_index, xenos::Endian index_endian, const draw_util::ViewportInfo& viewport_info, uint32_t pixel_size_x, - uint32_t pixel_size_y, uint32_t used_texture_mask, bool early_z, - uint32_t color_mask, + uint32_t pixel_size_y, uint32_t used_texture_mask, uint32_t color_mask, const RenderTargetCache::PipelineRenderTarget render_targets[4]) { #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); @@ -2992,14 +2985,11 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( flags |= DxbcShaderTranslator::kSysFlag_KillIfAnyVertexKilled; } // Alpha test. - if (rb_colorcontrol.alpha_test_enable) { - flags |= uint32_t(rb_colorcontrol.alpha_func) - << DxbcShaderTranslator::kSysFlag_AlphaPassIfLess_Shift; - } else { - flags |= DxbcShaderTranslator::kSysFlag_AlphaPassIfLess | - DxbcShaderTranslator::kSysFlag_AlphaPassIfEqual | - DxbcShaderTranslator::kSysFlag_AlphaPassIfGreater; - } + xenos::CompareFunction alpha_test_function = + rb_colorcontrol.alpha_test_enable ? rb_colorcontrol.alpha_func + : xenos::CompareFunction::kAlways; + flags |= uint32_t(alpha_test_function) + << DxbcShaderTranslator::kSysFlag_AlphaPassIfLess_Shift; // Gamma writing. for (uint32_t i = 0; i < 4; ++i) { if (color_infos[i].color_format == @@ -3028,7 +3018,9 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( if (rb_depthcontrol.stencil_enable) { flags |= DxbcShaderTranslator::kSysFlag_ROVStencilTest; } - if (early_z) { + // Hint - if not applicable to the shader, will not have effect. 
+ if (alpha_test_function == xenos::CompareFunction::kAlways && + !rb_colorcontrol.alpha_to_mask_enable) { flags |= DxbcShaderTranslator::kSysFlag_ROVDepthStencilEarlyWrite; } } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index c75b5c203..a9181f1c3 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -27,6 +27,7 @@ #include "xenia/gpu/d3d12/render_target_cache.h" #include "xenia/gpu/d3d12/texture_cache.h" #include "xenia/gpu/draw_util.h" +#include "xenia/gpu/dxbc_shader.h" #include "xenia/gpu/dxbc_shader_translator.h" #include "xenia/gpu/xenos.h" #include "xenia/kernel/kernel_state.h" @@ -47,7 +48,7 @@ class D3D12CommandProcessor : public CommandProcessor { void ClearCaches() override; - void InitializeShaderStorage(const std::filesystem::path& storage_root, + void InitializeShaderStorage(const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) override; void RequestFrameTrace(const std::filesystem::path& root_path) override; @@ -88,7 +89,7 @@ class D3D12CommandProcessor : public CommandProcessor { // there are 4 render targets bound with the same EDRAM base (clearly not // correct usage), but the shader only clears 1, and then EDRAM buffer stores // conflict with each other. - uint32_t GetCurrentColorMask(const D3D12Shader* pixel_shader) const; + uint32_t GetCurrentColorMask(const Shader* pixel_shader) const; void PushTransitionBarrier( ID3D12Resource* resource, D3D12_RESOURCE_STATES old_state, @@ -100,8 +101,9 @@ class D3D12CommandProcessor : public CommandProcessor { void SubmitBarriers(); // Finds or creates root signature for a pipeline. 
- ID3D12RootSignature* GetRootSignature(const D3D12Shader* vertex_shader, - const D3D12Shader* pixel_shader); + ID3D12RootSignature* GetRootSignature(const DxbcShader* vertex_shader, + const DxbcShader* pixel_shader, + bool tessellated); ui::d3d12::D3D12UploadBufferPool& GetConstantBufferPool() const { return *constant_buffer_pool_; @@ -300,7 +302,7 @@ class D3D12CommandProcessor : public CommandProcessor { // Gets the indices of optional root parameters. Returns the total parameter // count. static uint32_t GetRootBindfulExtraParameterIndices( - const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader, + const DxbcShader* vertex_shader, const DxbcShader* pixel_shader, RootBindfulExtraParameterIndices& indices_out); // BeginSubmission and EndSubmission may be called at any time. If there's an @@ -353,8 +355,7 @@ class D3D12CommandProcessor : public CommandProcessor { bool shared_memory_is_uav, bool primitive_polygonal, uint32_t line_loop_closing_index, xenos::Endian index_endian, const draw_util::ViewportInfo& viewport_info, uint32_t pixel_size_x, - uint32_t pixel_size_y, uint32_t used_texture_mask, bool early_z, - uint32_t color_mask, + uint32_t pixel_size_y, uint32_t used_texture_mask, uint32_t color_mask, const RenderTargetCache::PipelineRenderTarget render_targets[4]); bool UpdateBindings(const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader, diff --git a/src/xenia/gpu/d3d12/d3d12_shader.cc b/src/xenia/gpu/d3d12/d3d12_shader.cc index 0b5296a4f..672f1e37d 100644 --- a/src/xenia/gpu/d3d12/d3d12_shader.cc +++ b/src/xenia/gpu/d3d12/d3d12_shader.cc @@ -10,9 +10,11 @@ #include "xenia/gpu/d3d12/d3d12_shader.h" #include +#include #include "xenia/base/assert.h" #include "xenia/base/logging.h" +#include "xenia/gpu/dxbc_shader.h" #include "xenia/gpu/gpu_flags.h" #include "xenia/ui/d3d12/d3d12_api.h" @@ -22,51 +24,13 @@ namespace d3d12 { D3D12Shader::D3D12Shader(xenos::ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr, uint32_t 
dword_count) - : Shader(shader_type, data_hash, dword_ptr, dword_count) {} + : DxbcShader(shader_type, data_hash, dword_ptr, dword_count) {} -void D3D12Shader::SetTexturesAndSamplers( - const DxbcShaderTranslator::TextureBinding* texture_bindings, - uint32_t texture_binding_count, - const DxbcShaderTranslator::SamplerBinding* sampler_bindings, - uint32_t sampler_binding_count) { - texture_bindings_.clear(); - texture_bindings_.reserve(texture_binding_count); - used_texture_mask_ = 0; - for (uint32_t i = 0; i < texture_binding_count; ++i) { - TextureBinding& binding = texture_bindings_.emplace_back(); - // For a stable hash. - std::memset(&binding, 0, sizeof(binding)); - const DxbcShaderTranslator::TextureBinding& translator_binding = - texture_bindings[i]; - binding.bindless_descriptor_index = - translator_binding.bindless_descriptor_index; - binding.fetch_constant = translator_binding.fetch_constant; - binding.dimension = translator_binding.dimension; - binding.is_signed = translator_binding.is_signed; - used_texture_mask_ |= 1u << translator_binding.fetch_constant; - } - sampler_bindings_.clear(); - sampler_bindings_.reserve(sampler_binding_count); - for (uint32_t i = 0; i < sampler_binding_count; ++i) { - SamplerBinding binding; - const DxbcShaderTranslator::SamplerBinding& translator_binding = - sampler_bindings[i]; - binding.bindless_descriptor_index = - translator_binding.bindless_descriptor_index; - binding.fetch_constant = translator_binding.fetch_constant; - binding.mag_filter = translator_binding.mag_filter; - binding.min_filter = translator_binding.min_filter; - binding.mip_filter = translator_binding.mip_filter; - binding.aniso_filter = translator_binding.aniso_filter; - sampler_bindings_.push_back(binding); - } -} - -void D3D12Shader::DisassembleDxbc(const ui::d3d12::D3D12Provider& provider, - bool disassemble_dxbc, - IDxbcConverter* dxbc_converter, - IDxcUtils* dxc_utils, - IDxcCompiler* dxc_compiler) { +void 
D3D12Shader::D3D12Translation::DisassembleDxbcAndDxil( + const ui::d3d12::D3D12Provider& provider, bool disassemble_dxbc, + IDxbcConverter* dxbc_converter, IDxcUtils* dxc_utils, + IDxcCompiler* dxc_compiler) { + std::string disassembly; bool is_first_disassembly = true; if (disassemble_dxbc) { ID3DBlob* dxbc_disassembly; @@ -77,11 +41,12 @@ void D3D12Shader::DisassembleDxbc(const ui::d3d12::D3D12Provider& provider, nullptr, &dxbc_disassembly))) { assert_true(is_first_disassembly); is_first_disassembly = false; - host_disassembly_.append( + disassembly.append( reinterpret_cast(dxbc_disassembly->GetBufferPointer())); dxbc_disassembly->Release(); } else { - XELOGE("Failed to disassemble DXBC shader {:016X}", ucode_data_hash()); + XELOGE("Failed to disassemble DXBC shader {:016X}", + shader().ucode_data_hash()); } } if (dxbc_converter && dxc_utils && dxc_compiler) { @@ -106,29 +71,36 @@ void D3D12Shader::DisassembleDxbc(const ui::d3d12::D3D12Provider& provider, dxil_disassembly->Release(); if (dxil_disassembly_got_utf8) { if (!is_first_disassembly) { - host_disassembly_.append("\n\n"); + disassembly.append("\n\n"); } is_first_disassembly = false; - host_disassembly_.append(reinterpret_cast( + disassembly.append(reinterpret_cast( dxil_disassembly_utf8->GetStringPointer())); dxil_disassembly_utf8->Release(); } else { XELOGE("Failed to get DXIL shader {:016X} disassembly as UTF-8", - ucode_data_hash()); + shader().ucode_data_hash()); } } else { XELOGE("Failed to disassemble DXIL shader {:016X}", - ucode_data_hash()); + shader().ucode_data_hash()); } } else { XELOGE("Failed to create a blob with DXIL shader {:016X}", - ucode_data_hash()); + shader().ucode_data_hash()); CoTaskMemFree(dxil); } } else { - XELOGE("Failed to convert shader {:016X} to DXIL", ucode_data_hash()); + XELOGE("Failed to convert shader {:016X} to DXIL", + shader().ucode_data_hash()); } } + set_host_disassembly(std::move(disassembly)); +} + +Shader::Translation* D3D12Shader::CreateTranslationInstance( + 
uint32_t modification) { + return new D3D12Translation(*this, modification); } } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/d3d12_shader.h b/src/xenia/gpu/d3d12/d3d12_shader.h index c24d6a00a..384e48a8a 100644 --- a/src/xenia/gpu/d3d12/d3d12_shader.h +++ b/src/xenia/gpu/d3d12/d3d12_shader.h @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2018 Ben Vanik. All rights reserved. * + * Copyright 2020 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ @@ -10,106 +10,62 @@ #ifndef XENIA_GPU_D3D12_D3D12_SHADER_H_ #define XENIA_GPU_D3D12_D3D12_SHADER_H_ -#include +#include -#include "xenia/gpu/dxbc_shader_translator.h" -#include "xenia/gpu/shader.h" -#include "xenia/gpu/xenos.h" +#include "xenia/gpu/dxbc_shader.h" #include "xenia/ui/d3d12/d3d12_provider.h" namespace xe { namespace gpu { namespace d3d12 { -class D3D12Shader : public Shader { +class D3D12Shader : public DxbcShader { public: + class D3D12Translation : public DxbcTranslation { + public: + D3D12Translation(D3D12Shader& shader, uint32_t modification) + : DxbcTranslation(shader, modification) {} + + void DisassembleDxbcAndDxil(const ui::d3d12::D3D12Provider& provider, + bool disassemble_dxbc, + IDxbcConverter* dxbc_converter = nullptr, + IDxcUtils* dxc_utils = nullptr, + IDxcCompiler* dxc_compiler = nullptr); + }; + D3D12Shader(xenos::ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr, uint32_t dword_count); - void SetTexturesAndSamplers( - const DxbcShaderTranslator::TextureBinding* texture_bindings, - uint32_t texture_binding_count, - const DxbcShaderTranslator::SamplerBinding* sampler_bindings, - uint32_t sampler_binding_count); - - void 
SetForcedEarlyZShaderObject(const std::vector& shader_object) { - forced_early_z_shader_ = shader_object; - } - // Returns the shader with forced early depth/stencil set with - // SetForcedEarlyZShader after translation. If there's none (for example, - // if the shader discards pixels or writes to the depth buffer), an empty - // vector is returned. - const std::vector& GetForcedEarlyZShaderObject() const { - return forced_early_z_shader_; - } - - void DisassembleDxbc(const ui::d3d12::D3D12Provider& provider, - bool disassemble_dxbc, - IDxbcConverter* dxbc_converter = nullptr, - IDxcUtils* dxc_utils = nullptr, - IDxcCompiler* dxc_compiler = nullptr); - - static constexpr uint32_t kMaxTextureBindingIndexBits = - DxbcShaderTranslator::kMaxTextureBindingIndexBits; - static constexpr uint32_t kMaxTextureBindings = - DxbcShaderTranslator::kMaxTextureBindings; - struct TextureBinding { - uint32_t bindless_descriptor_index; - uint32_t fetch_constant; - // Stacked and 3D are separate TextureBindings, even for bindless for null - // descriptor handling simplicity. - xenos::FetchOpDimension dimension; - bool is_signed; - }; - // Safe to hash and compare with memcmp for layout hashing. 
- const TextureBinding* GetTextureBindings(uint32_t& count_out) const { - count_out = uint32_t(texture_bindings_.size()); - return texture_bindings_.data(); - } - const uint32_t GetUsedTextureMask() const { return used_texture_mask_; } - - static constexpr uint32_t kMaxSamplerBindingIndexBits = - DxbcShaderTranslator::kMaxSamplerBindingIndexBits; - static constexpr uint32_t kMaxSamplerBindings = - DxbcShaderTranslator::kMaxSamplerBindings; - struct SamplerBinding { - uint32_t bindless_descriptor_index; - uint32_t fetch_constant; - xenos::TextureFilter mag_filter; - xenos::TextureFilter min_filter; - xenos::TextureFilter mip_filter; - xenos::AnisoFilter aniso_filter; - }; - const SamplerBinding* GetSamplerBindings(uint32_t& count_out) const { - count_out = uint32_t(sampler_bindings_.size()); - return sampler_bindings_.data(); - } - - // For owning subsystems like the pipeline cache, accessors for unique + // For owning subsystems like the pipeline cache, accessors for unique // identifiers (used instead of hashes to make sure collisions can't happen) // of binding layouts used by the shader, for invalidation if a shader with an // incompatible layout was bound. size_t GetTextureBindingLayoutUserUID() const { return texture_binding_layout_user_uid_; } - void SetTextureBindingLayoutUserUID(size_t uid) { - texture_binding_layout_user_uid_ = uid; - } size_t GetSamplerBindingLayoutUserUID() const { return sampler_binding_layout_user_uid_; } + // Modifications of the same shader can be translated on different threads. + // The "set" function must only be called if "enter" returned true - these are + // set up only once. 
+ bool EnterBindingLayoutUserUIDSetup() { + return !binding_layout_user_uids_set_up_.test_and_set(); + } + void SetTextureBindingLayoutUserUID(size_t uid) { + texture_binding_layout_user_uid_ = uid; + } void SetSamplerBindingLayoutUserUID(size_t uid) { sampler_binding_layout_user_uid_ = uid; } + protected: + Translation* CreateTranslationInstance(uint32_t modification) override; + private: - std::vector texture_bindings_; - std::vector sampler_bindings_; + std::atomic_flag binding_layout_user_uids_set_up_ = ATOMIC_FLAG_INIT; size_t texture_binding_layout_user_uid_ = 0; size_t sampler_binding_layout_user_uid_ = 0; - uint32_t used_texture_mask_ = 0; - - std::vector forced_early_z_shader_; }; } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index b36ecfea8..e1b1cbeaf 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "third_party/fmt/include/fmt/format.h" @@ -63,19 +64,23 @@ namespace d3d12 { #include "xenia/gpu/d3d12/shaders/dxbc/continuous_triangle_hs.h" #include "xenia/gpu/d3d12/shaders/dxbc/discrete_quad_hs.h" #include "xenia/gpu/d3d12/shaders/dxbc/discrete_triangle_hs.h" +#include "xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.h" +#include "xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.h" #include "xenia/gpu/d3d12/shaders/dxbc/primitive_point_list_gs.h" #include "xenia/gpu/d3d12/shaders/dxbc/primitive_quad_list_gs.h" #include "xenia/gpu/d3d12/shaders/dxbc/primitive_rectangle_list_gs.h" #include "xenia/gpu/d3d12/shaders/dxbc/tessellation_vs.h" -PipelineCache::PipelineCache(D3D12CommandProcessor& command_processor, - const RegisterFile& register_file, - bool bindless_resources_used, bool edram_rov_used, - uint32_t resolution_scale) +PipelineCache::PipelineCache( + D3D12CommandProcessor& command_processor, const RegisterFile& register_file, + bool bindless_resources_used, bool 
edram_rov_used, + flags::DepthFloat24Conversion depth_float24_conversion, + uint32_t resolution_scale) : command_processor_(command_processor), register_file_(register_file), bindless_resources_used_(bindless_resources_used), edram_rov_used_(edram_rov_used), + depth_float24_conversion_(depth_float24_conversion), resolution_scale_(resolution_scale) { auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); @@ -177,10 +182,10 @@ void PipelineCache::Shutdown() { void PipelineCache::ClearCache(bool shutting_down) { bool reinitialize_shader_storage = !shutting_down && storage_write_thread_ != nullptr; - std::filesystem::path shader_storage_root; + std::filesystem::path shader_storage_cache_root; uint32_t shader_storage_title_id = shader_storage_title_id_; if (reinitialize_shader_storage) { - shader_storage_root = shader_storage_root_; + shader_storage_cache_root = shader_storage_cache_root_; } ShutdownShaderStorage(); @@ -226,19 +231,19 @@ void PipelineCache::ClearCache(bool shutting_down) { delete it.second; } shaders_.clear(); + shader_storage_index_ = 0; if (reinitialize_shader_storage) { - InitializeShaderStorage(shader_storage_root, shader_storage_title_id, + InitializeShaderStorage(shader_storage_cache_root, shader_storage_title_id, false); } } void PipelineCache::InitializeShaderStorage( - const std::filesystem::path& storage_root, uint32_t title_id, - bool blocking) { + const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) { ShutdownShaderStorage(); - auto shader_storage_root = storage_root / "shaders"; + auto shader_storage_root = cache_root / "shaders"; // For files that can be moved between different hosts. 
// Host PSO blobs - if ever added - should be stored in shaders/local/ (they // currently aren't used because because they may be not very practical - @@ -256,6 +261,90 @@ void PipelineCache::InitializeShaderStorage( } } + // Initialize the pipeline storage stream - read pipeline descriptions and + // collect used shader modifications to translate. + std::vector pipeline_stored_descriptions; + // . + std::set> shader_translations_needed; + auto pipeline_storage_file_path = + shader_storage_shareable_root / + fmt::format("{:08X}.{}.d3d12.xpso", title_id, + edram_rov_used_ ? "rov" : "rtv"); + pipeline_storage_file_ = + xe::filesystem::OpenFile(pipeline_storage_file_path, "a+b"); + if (!pipeline_storage_file_) { + XELOGE( + "Failed to open the Direct3D 12 pipeline description storage file for " + "writing, persistent shader storage will be disabled: {}", + xe::path_to_utf8(pipeline_storage_file_path)); + return; + } + pipeline_storage_file_flush_needed_ = false; + // 'XEPS'. + const uint32_t pipeline_storage_magic = 0x53504558; + // 'DXRO' or 'DXRT'. + const uint32_t pipeline_storage_magic_api = + edram_rov_used_ ? 
0x4F525844 : 0x54525844; + const uint32_t pipeline_storage_version_swapped = + xe::byte_swap(std::max(PipelineDescription::kVersion, + DxbcShaderTranslator::Modification::kVersion)); + struct { + uint32_t magic; + uint32_t magic_api; + uint32_t version_swapped; + uint32_t device_features; + } pipeline_storage_file_header; + if (fread(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header), + 1, pipeline_storage_file_) && + pipeline_storage_file_header.magic == pipeline_storage_magic && + pipeline_storage_file_header.magic_api == pipeline_storage_magic_api && + pipeline_storage_file_header.version_swapped == + pipeline_storage_version_swapped) { + xe::filesystem::Seek(pipeline_storage_file_, 0, SEEK_END); + int64_t pipeline_storage_told_end = + xe::filesystem::Tell(pipeline_storage_file_); + size_t pipeline_storage_told_count = + size_t(pipeline_storage_told_end >= + int64_t(sizeof(pipeline_storage_file_header)) + ? (uint64_t(pipeline_storage_told_end) - + sizeof(pipeline_storage_file_header)) / + sizeof(PipelineStoredDescription) + : 0); + if (pipeline_storage_told_count && + xe::filesystem::Seek(pipeline_storage_file_, + int64_t(sizeof(pipeline_storage_file_header)), + SEEK_SET)) { + pipeline_stored_descriptions.resize(pipeline_storage_told_count); + pipeline_stored_descriptions.resize( + fread(pipeline_stored_descriptions.data(), + sizeof(PipelineStoredDescription), pipeline_storage_told_count, + pipeline_storage_file_)); + size_t pipeline_storage_read_count = pipeline_stored_descriptions.size(); + for (size_t i = 0; i < pipeline_storage_read_count; ++i) { + const PipelineStoredDescription& pipeline_stored_description = + pipeline_stored_descriptions[i]; + // Validate file integrity, stop and truncate the stream if data is + // corrupted. 
+ if (XXH64(&pipeline_stored_description.description, + sizeof(pipeline_stored_description.description), + 0) != pipeline_stored_description.description_hash) { + pipeline_stored_descriptions.resize(i); + break; + } + // Mark the shader modifications as needed for translation. + shader_translations_needed.emplace( + pipeline_stored_description.description.vertex_shader_hash, + pipeline_stored_description.description.vertex_shader_modification); + if (pipeline_stored_description.description.pixel_shader_hash) { + shader_translations_needed.emplace( + pipeline_stored_description.description.pixel_shader_hash, + pipeline_stored_description.description + .pixel_shader_modification); + } + } + } + } + size_t logical_processor_count = xe::threading::logical_processor_count(); if (!logical_processor_count) { // Pick some reasonable amount if couldn't determine the number of cores. @@ -274,8 +363,11 @@ void PipelineCache::InitializeShaderStorage( "Failed to open the guest shader storage file for writing, persistent " "shader storage will be disabled: {}", xe::path_to_utf8(shader_storage_file_path)); + fclose(pipeline_storage_file_); + pipeline_storage_file_ = nullptr; return; } + ++shader_storage_index_; shader_storage_file_flush_needed_ = false; struct { uint32_t magic; @@ -299,12 +391,12 @@ void PipelineCache::InitializeShaderStorage( // Threads overlapping file reading. 
std::mutex shaders_translation_thread_mutex; std::condition_variable shaders_translation_thread_cond; - std::deque> + std::deque> shaders_to_translate; size_t shader_translation_threads_busy = 0; bool shader_translation_threads_shutdown = false; std::mutex shaders_failed_to_translate_mutex; - std::vector shaders_failed_to_translate; + std::vector shaders_failed_to_translate; auto shader_translation_thread_function = [&]() { auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); DxbcShaderTranslator translator( @@ -324,7 +416,8 @@ void PipelineCache::InitializeShaderStorage( IID_PPV_ARGS(&dxc_compiler)); } for (;;) { - std::pair shader_to_translate; + std::pair + shader_to_translate; for (;;) { std::unique_lock lock(shaders_translation_thread_mutex); if (shaders_to_translate.empty()) { @@ -340,11 +433,9 @@ void PipelineCache::InitializeShaderStorage( break; } assert_not_null(shader_to_translate.second); - if (!TranslateShader( - translator, *shader_to_translate.second, - shader_to_translate.first.sq_program_cntl, dxbc_converter, - dxc_utils, dxc_compiler, - shader_to_translate.first.host_vertex_shader_type)) { + if (!TranslateShader(translator, *shader_to_translate.second, + shader_to_translate.first.sq_program_cntl, + dxbc_converter, dxc_utils, dxc_compiler)) { std::lock_guard lock(shaders_failed_to_translate_mutex); shaders_failed_to_translate.push_back(shader_to_translate.second); } @@ -373,18 +464,6 @@ void PipelineCache::InitializeShaderStorage( } size_t ucode_byte_count = shader_header.ucode_dword_count * sizeof(uint32_t); - if (shaders_.find(shader_header.ucode_data_hash) != shaders_.end()) { - // Already added - usually shaders aren't added without the intention of - // translating them imminently, so don't do additional checks to - // actually ensure that translation happens right now (they would cause - // a race condition with shaders currently queued for translation). 
- if (!xe::filesystem::Seek(shader_storage_file_, - int64_t(ucode_byte_count), SEEK_CUR)) { - break; - } - shader_storage_valid_bytes += sizeof(shader_header) + ucode_byte_count; - continue; - } ucode_dwords.resize(shader_header.ucode_dword_count); if (shader_header.ucode_dword_count && !fread(ucode_dwords.data(), ucode_byte_count, 1, @@ -397,34 +476,60 @@ void PipelineCache::InitializeShaderStorage( // Validation failed. break; } - D3D12Shader* shader = - new D3D12Shader(shader_header.type, ucode_data_hash, - ucode_dwords.data(), shader_header.ucode_dword_count); - shaders_.emplace(ucode_data_hash, shader); - // Create new threads if the currently existing threads can't keep up with - // file reading, but not more than the number of logical processors minus - // one. - size_t shader_translation_threads_needed; - { - std::lock_guard lock(shaders_translation_thread_mutex); - shader_translation_threads_needed = - std::min(shader_translation_threads_busy + - shaders_to_translate.size() + size_t(1), - logical_processor_count - size_t(1)); - } - while (shader_translation_threads.size() < - shader_translation_threads_needed) { - shader_translation_threads.push_back(xe::threading::Thread::Create( - {}, shader_translation_thread_function)); - shader_translation_threads.back()->set_name("Shader Translation"); - } - { - std::lock_guard lock(shaders_translation_thread_mutex); - shaders_to_translate.emplace_back(shader_header, shader); - } - shaders_translation_thread_cond.notify_one(); shader_storage_valid_bytes += sizeof(shader_header) + ucode_byte_count; - ++shaders_translated; + // Only add the shader if needed. 
+ auto modification_it = shader_translations_needed.lower_bound( + std::make_pair(ucode_data_hash, uint32_t(0))); + if (modification_it == shader_translations_needed.end() || + modification_it->first != ucode_data_hash) { + continue; + } + D3D12Shader* shader = + LoadShader(shader_header.type, ucode_dwords.data(), + shader_header.ucode_dword_count, ucode_data_hash); + // Loaded from the current storage - don't write again. + shader->set_ucode_storage_index(shader_storage_index_); + // Translate all the needed modifications. + for (; modification_it != shader_translations_needed.end() && + modification_it->first == ucode_data_hash; + ++modification_it) { + bool translation_is_new; + D3D12Shader::D3D12Translation* translation = + static_cast( + shader->GetOrCreateTranslation(modification_it->second, + &translation_is_new)); + if (!translation_is_new) { + // Already added - usually shaders aren't added without the intention + // of translating them imminently, so don't do additional checks to + // actually ensure that translation happens right now (they would + // cause a race condition with shaders currently queued for + // translation). + continue; + } + // Create new threads if the currently existing threads can't keep up + // with file reading, but not more than the number of logical processors + // minus one. 
+ size_t shader_translation_threads_needed; + { + std::lock_guard lock(shaders_translation_thread_mutex); + shader_translation_threads_needed = + std::min(shader_translation_threads_busy + + shaders_to_translate.size() + size_t(1), + logical_processor_count - size_t(1)); + } + while (shader_translation_threads.size() < + shader_translation_threads_needed) { + shader_translation_threads.push_back(xe::threading::Thread::Create( + {}, shader_translation_thread_function)); + shader_translation_threads.back()->set_name("Shader Translation"); + } + { + std::lock_guard lock(shaders_translation_thread_mutex); + shaders_to_translate.emplace_back(shader_header, translation); + } + shaders_translation_thread_cond.notify_one(); + ++shaders_translated; + } } if (!shader_translation_threads.empty()) { { @@ -436,9 +541,14 @@ void PipelineCache::InitializeShaderStorage( xe::threading::Wait(shader_translation_thread.get(), false); } shader_translation_threads.clear(); - for (D3D12Shader* shader : shaders_failed_to_translate) { - shaders_.erase(shader->ucode_data_hash()); - delete shader; + for (D3D12Shader::D3D12Translation* translation : + shaders_failed_to_translate) { + D3D12Shader* shader = static_cast(&translation->shader()); + shader->DestroyTranslation(translation->modification()); + if (shader->translations().empty()) { + shaders_.erase(shader->ucode_data_hash()); + delete shader; + } } } XELOGGPU("Translated {} shaders from the storage in {} milliseconds", @@ -457,220 +567,177 @@ void PipelineCache::InitializeShaderStorage( shader_storage_file_); } - // 'DXRO' or 'DXRT'. - const uint32_t pipeline_storage_magic_api = - edram_rov_used_ ? 0x4F525844 : 0x54525844; + // Create the pipelines. + if (!pipeline_stored_descriptions.empty()) { + uint64_t pipeline_creation_start_ = xe::Clock::QueryHostTickCount(); - // Initialize the pipeline storage stream. 
- uint64_t pipeline_storage_initialization_start_ = - xe::Clock::QueryHostTickCount(); - auto pipeline_storage_file_path = - shader_storage_shareable_root / - fmt::format("{:08X}.{}.d3d12.xpso", title_id, - edram_rov_used_ ? "rov" : "rtv"); - pipeline_storage_file_ = - xe::filesystem::OpenFile(pipeline_storage_file_path, "a+b"); - if (!pipeline_storage_file_) { - XELOGE( - "Failed to open the Direct3D 12 pipeline description storage file for " - "writing, persistent shader storage will be disabled: {}", - xe::path_to_utf8(pipeline_storage_file_path)); - fclose(shader_storage_file_); - shader_storage_file_ = nullptr; - return; - } - pipeline_storage_file_flush_needed_ = false; - // 'XEPS'. - const uint32_t pipeline_storage_magic = 0x53504558; - struct { - uint32_t magic; - uint32_t magic_api; - uint32_t version_swapped; - } pipeline_storage_file_header; - if (fread(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header), - 1, pipeline_storage_file_) && - pipeline_storage_file_header.magic == pipeline_storage_magic && - pipeline_storage_file_header.magic_api == pipeline_storage_magic_api && - xe::byte_swap(pipeline_storage_file_header.version_swapped) == - PipelineDescription::kVersion) { - uint64_t pipeline_storage_valid_bytes = - sizeof(pipeline_storage_file_header); - // Enqueue pipeline descriptions written by previous Xenia executions until - // the end of the file or until a corrupted one is detected. - xe::filesystem::Seek(pipeline_storage_file_, 0, SEEK_END); - int64_t pipeline_storage_told_end = - xe::filesystem::Tell(pipeline_storage_file_); - size_t pipeline_storage_told_count = size_t( - pipeline_storage_told_end >= int64_t(pipeline_storage_valid_bytes) - ? 
(uint64_t(pipeline_storage_told_end) - - pipeline_storage_valid_bytes) / - sizeof(PipelineStoredDescription) - : 0); - if (pipeline_storage_told_count && - xe::filesystem::Seek(pipeline_storage_file_, - int64_t(pipeline_storage_valid_bytes), SEEK_SET)) { - std::vector pipeline_stored_descriptions; - pipeline_stored_descriptions.resize(pipeline_storage_told_count); - pipeline_stored_descriptions.resize( - fread(pipeline_stored_descriptions.data(), - sizeof(PipelineStoredDescription), pipeline_storage_told_count, - pipeline_storage_file_)); - if (!pipeline_stored_descriptions.empty()) { - // Launch additional creation threads to use all cores to create - // pipelines faster. Will also be using the main thread, so minus 1. - size_t creation_thread_original_count = creation_threads_.size(); - size_t creation_thread_needed_count = - std::max(std::min(pipeline_stored_descriptions.size(), - logical_processor_count) - - size_t(1), - creation_thread_original_count); - while (creation_threads_.size() < creation_thread_original_count) { - size_t creation_thread_index = creation_threads_.size(); - std::unique_ptr creation_thread = - xe::threading::Thread::Create( - {}, [this, creation_thread_index]() { - CreationThread(creation_thread_index); - }); - creation_thread->set_name("D3D12 Pipelines"); - creation_threads_.push_back(std::move(creation_thread)); - } - size_t pipelines_created = 0; - for (const PipelineStoredDescription& pipeline_stored_description : - pipeline_stored_descriptions) { - const PipelineDescription& pipeline_description = - pipeline_stored_description.description; - // Validate file integrity, stop and truncate the stream if data is - // corrupted. - if (XXH64(&pipeline_stored_description.description, - sizeof(pipeline_stored_description.description), - 0) != pipeline_stored_description.description_hash) { - break; - } - pipeline_storage_valid_bytes += sizeof(PipelineStoredDescription); - // Skip already known pipelines - those have already been enqueued. 
- auto found_range = pipelines_.equal_range( - pipeline_stored_description.description_hash); - bool pipeline_found = false; - for (auto it = found_range.first; it != found_range.second; ++it) { - Pipeline* found_pipeline = it->second; - if (!std::memcmp(&found_pipeline->description.description, - &pipeline_description, - sizeof(pipeline_description))) { - pipeline_found = true; - break; - } - } - if (pipeline_found) { - continue; - } + // Launch additional creation threads to use all cores to create + // pipelines faster. Will also be using the main thread, so minus 1. + size_t creation_thread_original_count = creation_threads_.size(); + size_t creation_thread_needed_count = std::max( + std::min(pipeline_stored_descriptions.size(), logical_processor_count) - + size_t(1), + creation_thread_original_count); + while (creation_threads_.size() < creation_thread_original_count) { + size_t creation_thread_index = creation_threads_.size(); + std::unique_ptr creation_thread = + xe::threading::Thread::Create({}, [this, creation_thread_index]() { + CreationThread(creation_thread_index); + }); + creation_thread->set_name("D3D12 Pipelines"); + creation_threads_.push_back(std::move(creation_thread)); + } - PipelineRuntimeDescription pipeline_runtime_description; - auto vertex_shader_it = - shaders_.find(pipeline_description.vertex_shader_hash); - if (vertex_shader_it == shaders_.end()) { - continue; - } - pipeline_runtime_description.vertex_shader = vertex_shader_it->second; - if (!pipeline_runtime_description.vertex_shader->is_valid()) { - continue; - } - if (pipeline_description.pixel_shader_hash) { - auto pixel_shader_it = - shaders_.find(pipeline_description.pixel_shader_hash); - if (pixel_shader_it == shaders_.end()) { - continue; - } - pipeline_runtime_description.pixel_shader = pixel_shader_it->second; - if (!pipeline_runtime_description.pixel_shader->is_valid()) { - continue; - } - } else { - pipeline_runtime_description.pixel_shader = nullptr; - } - 
pipeline_runtime_description.root_signature = - command_processor_.GetRootSignature( - pipeline_runtime_description.vertex_shader, - pipeline_runtime_description.pixel_shader); - if (!pipeline_runtime_description.root_signature) { - continue; - } - std::memcpy(&pipeline_runtime_description.description, - &pipeline_description, sizeof(pipeline_description)); + size_t pipelines_created = 0; + for (const PipelineStoredDescription& pipeline_stored_description : + pipeline_stored_descriptions) { + const PipelineDescription& pipeline_description = + pipeline_stored_description.description; + // Skip already known pipelines - those have already been enqueued. + auto found_range = + pipelines_.equal_range(pipeline_stored_description.description_hash); + bool pipeline_found = false; + for (auto it = found_range.first; it != found_range.second; ++it) { + Pipeline* found_pipeline = it->second; + if (!std::memcmp(&found_pipeline->description.description, + &pipeline_description, sizeof(pipeline_description))) { + pipeline_found = true; + break; + } + } + if (pipeline_found) { + continue; + } - Pipeline* new_pipeline = new Pipeline; - new_pipeline->state = nullptr; - std::memcpy(&new_pipeline->description, &pipeline_runtime_description, - sizeof(pipeline_runtime_description)); - pipelines_.emplace(pipeline_stored_description.description_hash, - new_pipeline); - COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size()); - if (!creation_threads_.empty()) { - // Submit the pipeline for creation to any available thread. 
- { - std::lock_guard lock(creation_request_lock_); - creation_queue_.push_back(new_pipeline); - } - creation_request_cond_.notify_one(); - } else { - new_pipeline->state = - CreateD3D12Pipeline(pipeline_runtime_description); - } - ++pipelines_created; + PipelineRuntimeDescription pipeline_runtime_description; + auto vertex_shader_it = + shaders_.find(pipeline_description.vertex_shader_hash); + if (vertex_shader_it == shaders_.end()) { + continue; + } + D3D12Shader* vertex_shader = vertex_shader_it->second; + pipeline_runtime_description.vertex_shader = + static_cast( + vertex_shader->GetTranslation( + pipeline_description.vertex_shader_modification)); + if (!pipeline_runtime_description.vertex_shader || + !pipeline_runtime_description.vertex_shader->is_valid()) { + continue; + } + D3D12Shader* pixel_shader; + if (pipeline_description.pixel_shader_hash) { + auto pixel_shader_it = + shaders_.find(pipeline_description.pixel_shader_hash); + if (pixel_shader_it == shaders_.end()) { + continue; } - CreateQueuedPipelinesOnProcessorThread(); - if (creation_threads_.size() > creation_thread_original_count) { - { - std::lock_guard lock(creation_request_lock_); - creation_threads_shutdown_from_ = creation_thread_original_count; - // Assuming the queue is empty because of - // CreateQueuedPipelinesOnProcessorThread. - } - creation_request_cond_.notify_all(); - while (creation_threads_.size() > creation_thread_original_count) { - xe::threading::Wait(creation_threads_.back().get(), false); - creation_threads_.pop_back(); - } - bool await_creation_completion_event; - { - // Cleanup so additional threads can be created later again. - std::lock_guard lock(creation_request_lock_); - creation_threads_shutdown_from_ = SIZE_MAX; - // If the invocation is blocking, all the shader storage - // initialization is expected to be done before proceeding, to avoid - // latency in the command processor after the invocation. 
- await_creation_completion_event = - blocking && creation_threads_busy_ != 0; - if (await_creation_completion_event) { - creation_completion_event_->Reset(); - creation_completion_set_event_ = true; - } - } - if (await_creation_completion_event) { - creation_request_cond_.notify_one(); - xe::threading::Wait(creation_completion_event_.get(), false); - } + pixel_shader = pixel_shader_it->second; + pipeline_runtime_description.pixel_shader = + static_cast( + pixel_shader->GetTranslation( + pipeline_description.pixel_shader_modification)); + if (!pipeline_runtime_description.pixel_shader || + !pipeline_runtime_description.pixel_shader->is_valid()) { + continue; } - XELOGGPU( - "Created {} graphics pipelines from the storage in {} milliseconds", - pipelines_created, - (xe::Clock::QueryHostTickCount() - - pipeline_storage_initialization_start_) * - 1000 / xe::Clock::QueryHostTickFrequency()); + } else { + pixel_shader = nullptr; + pipeline_runtime_description.pixel_shader = nullptr; + } + pipeline_runtime_description.root_signature = + command_processor_.GetRootSignature( + vertex_shader, pixel_shader, + DxbcShaderTranslator::Modification( + pipeline_description.vertex_shader_modification) + .host_vertex_shader_type != + Shader::HostVertexShaderType::kVertex); + if (!pipeline_runtime_description.root_signature) { + continue; + } + std::memcpy(&pipeline_runtime_description.description, + &pipeline_description, sizeof(pipeline_description)); + + Pipeline* new_pipeline = new Pipeline; + new_pipeline->state = nullptr; + std::memcpy(&new_pipeline->description, &pipeline_runtime_description, + sizeof(pipeline_runtime_description)); + pipelines_.emplace(pipeline_stored_description.description_hash, + new_pipeline); + COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size()); + if (!creation_threads_.empty()) { + // Submit the pipeline for creation to any available thread. 
+ { + std::lock_guard lock(creation_request_lock_); + creation_queue_.push_back(new_pipeline); + } + creation_request_cond_.notify_one(); + } else { + new_pipeline->state = CreateD3D12Pipeline(pipeline_runtime_description); + } + ++pipelines_created; + } + + CreateQueuedPipelinesOnProcessorThread(); + if (creation_threads_.size() > creation_thread_original_count) { + { + std::lock_guard lock(creation_request_lock_); + creation_threads_shutdown_from_ = creation_thread_original_count; + // Assuming the queue is empty because of + // CreateQueuedPipelinesOnProcessorThread. + } + creation_request_cond_.notify_all(); + while (creation_threads_.size() > creation_thread_original_count) { + xe::threading::Wait(creation_threads_.back().get(), false); + creation_threads_.pop_back(); + } + bool await_creation_completion_event; + { + // Cleanup so additional threads can be created later again. + std::lock_guard lock(creation_request_lock_); + creation_threads_shutdown_from_ = SIZE_MAX; + // If the invocation is blocking, all the shader storage initialization + // is expected to be done before proceeding, to avoid latency in the + // command processor after the invocation. 
+ await_creation_completion_event = + blocking && creation_threads_busy_ != 0; + if (await_creation_completion_event) { + creation_completion_event_->Reset(); + creation_completion_set_event_ = true; + } + } + if (await_creation_completion_event) { + creation_request_cond_.notify_one(); + xe::threading::Wait(creation_completion_event_.get(), false); } } - xe::filesystem::TruncateStdioFile(pipeline_storage_file_, - pipeline_storage_valid_bytes); + + XELOGGPU( + "Created {} graphics pipelines (not including reading the " + "descriptions) from the storage in {} milliseconds", + pipelines_created, + (xe::Clock::QueryHostTickCount() - pipeline_creation_start_) * 1000 / + xe::Clock::QueryHostTickFrequency()); + // If any pipeline descriptions were corrupted (or the whole file has excess + // bytes in the end), truncate to the last valid pipeline description. + xe::filesystem::TruncateStdioFile( + pipeline_storage_file_, + uint64_t(sizeof(pipeline_storage_file_header) + + sizeof(PipelineStoredDescription) * + pipeline_stored_descriptions.size())); } else { xe::filesystem::TruncateStdioFile(pipeline_storage_file_, 0); pipeline_storage_file_header.magic = pipeline_storage_magic; pipeline_storage_file_header.magic_api = pipeline_storage_magic_api; pipeline_storage_file_header.version_swapped = - xe::byte_swap(PipelineDescription::kVersion); + pipeline_storage_version_swapped; + // Reserved for future (for Vulkan) - host device features affecting legal + // pipeline descriptions. + pipeline_storage_file_header.device_features = 0; fwrite(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header), 1, pipeline_storage_file_); } - shader_storage_root_ = storage_root; + shader_storage_cache_root_ = cache_root; shader_storage_title_id_ = title_id; // Start the storage writing thread. 
@@ -706,7 +773,7 @@ void PipelineCache::ShutdownShaderStorage() { shader_storage_file_flush_needed_ = false; } - shader_storage_root_.clear(); + shader_storage_cache_root_.clear(); shader_storage_title_id_ = 0; } @@ -757,11 +824,17 @@ bool PipelineCache::IsCreatingPipelines() { } D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, - uint32_t guest_address, const uint32_t* host_address, uint32_t dword_count) { // Hash the input memory and lookup the shader. - uint64_t data_hash = XXH64(host_address, dword_count * sizeof(uint32_t), 0); + return LoadShader(shader_type, host_address, dword_count, + XXH64(host_address, dword_count * sizeof(uint32_t), 0)); +} + +D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, + const uint32_t* host_address, + uint32_t dword_count, + uint64_t data_hash) { auto it = shaders_.find(data_hash); if (it != shaders_.end()) { // Shader has been previously loaded. @@ -774,12 +847,64 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, D3D12Shader* shader = new D3D12Shader(shader_type, data_hash, host_address, dword_count); shaders_.emplace(data_hash, shader); + if (!cvars::dump_shaders.empty()) { + shader->DumpUcodeBinary(cvars::dump_shaders); + } return shader; } -Shader::HostVertexShaderType PipelineCache::GetHostVertexShaderTypeIfValid() - const { +bool PipelineCache::GetCurrentShaderModifications( + DxbcShaderTranslator::Modification& vertex_shader_modification_out, + DxbcShaderTranslator::Modification& pixel_shader_modification_out) const { + Shader::HostVertexShaderType host_vertex_shader_type = + GetCurrentHostVertexShaderTypeIfValid(); + if (host_vertex_shader_type == Shader::HostVertexShaderType(-1)) { + return false; + } + vertex_shader_modification_out = DxbcShaderTranslator::Modification( + shader_translator_->GetDefaultModification(xenos::ShaderType::kVertex, + host_vertex_shader_type)); + DxbcShaderTranslator::Modification pixel_shader_modification( + 
shader_translator_->GetDefaultModification(xenos::ShaderType::kPixel)); + if (!edram_rov_used_) { + const auto& regs = register_file_; + using DepthStencilMode = + DxbcShaderTranslator::Modification::DepthStencilMode; + if ((depth_float24_conversion_ == + flags::DepthFloat24Conversion::kOnOutputTruncating || + depth_float24_conversion_ == + flags::DepthFloat24Conversion::kOnOutputRounding) && + regs.Get().z_enable && + regs.Get().depth_format == + xenos::DepthRenderTargetFormat::kD24FS8) { + pixel_shader_modification.depth_stencil_mode = + depth_float24_conversion_ == + flags::DepthFloat24Conversion::kOnOutputTruncating + ? DepthStencilMode::kFloat24Truncating + : DepthStencilMode::kFloat24Rounding; + } else { + // Hint to enable early depth/stencil writing if possible - whether it + // will actually take effect depends on the shader itself, it's not known + // before translation. + auto rb_colorcontrol = regs.Get(); + if ((!rb_colorcontrol.alpha_test_enable || + rb_colorcontrol.alpha_func == xenos::CompareFunction::kAlways) && + !rb_colorcontrol.alpha_to_mask_enable) { + pixel_shader_modification.depth_stencil_mode = + DepthStencilMode::kEarlyHint; + } else { + pixel_shader_modification.depth_stencil_mode = + DepthStencilMode::kNoModifiers; + } + } + } + pixel_shader_modification_out = pixel_shader_modification; + return true; +} + +Shader::HostVertexShaderType +PipelineCache::GetCurrentHostVertexShaderTypeIfValid() const { // If the values this functions returns are changed, INVALIDATE THE SHADER // STORAGE (increase kVersion for BOTH shaders and pipelines)! 
The exception // is when the function originally returned "unsupported", but started to @@ -855,8 +980,8 @@ Shader::HostVertexShaderType PipelineCache::GetHostVertexShaderTypeIfValid() } bool PipelineCache::EnsureShadersTranslated( - D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, - Shader::HostVertexShaderType host_vertex_shader_type) { + D3D12Shader::D3D12Translation* vertex_shader, + D3D12Shader::D3D12Translation* pixel_shader) { const auto& regs = register_file_; auto sq_program_cntl = regs.Get(); @@ -869,18 +994,19 @@ bool PipelineCache::EnsureShadersTranslated( if (!vertex_shader->is_translated()) { if (!TranslateShader(*shader_translator_, *vertex_shader, sq_program_cntl, - dxbc_converter_, dxc_utils_, dxc_compiler_, - host_vertex_shader_type)) { + dxbc_converter_, dxc_utils_, dxc_compiler_)) { XELOGE("Failed to translate the vertex shader!"); return false; } - if (shader_storage_file_) { + if (shader_storage_file_ && vertex_shader->shader().ucode_storage_index() != + shader_storage_index_) { + vertex_shader->shader().set_ucode_storage_index(shader_storage_index_); assert_not_null(storage_write_thread_); shader_storage_file_flush_needed_ = true; { std::lock_guard lock(storage_write_request_lock_); storage_write_shader_queue_.push_back( - std::make_pair(vertex_shader, sq_program_cntl)); + std::make_pair(&vertex_shader->shader(), sq_program_cntl)); } storage_write_request_cond_.notify_all(); } @@ -892,13 +1018,15 @@ bool PipelineCache::EnsureShadersTranslated( XELOGE("Failed to translate the pixel shader!"); return false; } - if (shader_storage_file_) { + if (shader_storage_file_ && + pixel_shader->shader().ucode_storage_index() != shader_storage_index_) { + pixel_shader->shader().set_ucode_storage_index(shader_storage_index_); assert_not_null(storage_write_thread_); shader_storage_file_flush_needed_ = true; { std::lock_guard lock(storage_write_request_lock_); storage_write_shader_queue_.push_back( - std::make_pair(pixel_shader, sq_program_cntl)); + 
std::make_pair(&pixel_shader->shader(), sq_program_cntl)); } storage_write_request_cond_.notify_all(); } @@ -908,9 +1036,9 @@ bool PipelineCache::EnsureShadersTranslated( } bool PipelineCache::ConfigurePipeline( - D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, + D3D12Shader::D3D12Translation* vertex_shader, + D3D12Shader::D3D12Translation* pixel_shader, xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format, - bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], void** pipeline_handle_out, ID3D12RootSignature** root_signature_out) { #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES @@ -922,7 +1050,7 @@ bool PipelineCache::ConfigurePipeline( PipelineRuntimeDescription runtime_description; if (!GetCurrentStateDescription(vertex_shader, pixel_shader, primitive_type, - index_format, early_z, render_targets, + index_format, render_targets, runtime_description)) { return false; } @@ -950,9 +1078,7 @@ bool PipelineCache::ConfigurePipeline( } } - if (!EnsureShadersTranslated( - vertex_shader, pixel_shader, - Shader::HostVertexShaderType(description.host_vertex_shader_type))) { + if (!EnsureShadersTranslated(vertex_shader, pixel_shader)) { return false; } @@ -995,14 +1121,17 @@ bool PipelineCache::ConfigurePipeline( return true; } -bool PipelineCache::TranslateShader( - DxbcShaderTranslator& translator, D3D12Shader& shader, - reg::SQ_PROGRAM_CNTL cntl, IDxbcConverter* dxbc_converter, - IDxcUtils* dxc_utils, IDxcCompiler* dxc_compiler, - Shader::HostVertexShaderType host_vertex_shader_type) { +bool PipelineCache::TranslateShader(DxbcShaderTranslator& translator, + D3D12Shader::D3D12Translation& translation, + reg::SQ_PROGRAM_CNTL cntl, + IDxbcConverter* dxbc_converter, + IDxcUtils* dxc_utils, + IDxcCompiler* dxc_compiler) { + D3D12Shader& shader = static_cast(translation.shader()); + // Perform translation. // If this fails the shader will be marked as invalid and ignored later. 
- if (!translator.Translate(&shader, cntl, host_vertex_shader_type)) { + if (!translator.Translate(translation, cntl)) { XELOGE("Shader {:016X} translation failed; marking as ignored", shader.ucode_data_hash()); return false; @@ -1010,7 +1139,8 @@ bool PipelineCache::TranslateShader( const char* host_shader_type; if (shader.type() == xenos::ShaderType::kVertex) { - switch (shader.host_vertex_shader_type()) { + DxbcShaderTranslator::Modification modification(translation.modification()); + switch (modification.host_vertex_shader_type) { case Shader::HostVertexShaderType::kLineDomainCPIndexed: host_shader_type = "control-point-indexed line domain"; break; @@ -1039,169 +1169,156 @@ bool PipelineCache::TranslateShader( shader.ucode_dword_count() * 4, shader.ucode_data_hash(), shader.ucode_disassembly().c_str()); - // Set up texture and sampler bindings. - uint32_t texture_binding_count; - const DxbcShaderTranslator::TextureBinding* translator_texture_bindings = - translator.GetTextureBindings(texture_binding_count); - uint32_t sampler_binding_count; - const DxbcShaderTranslator::SamplerBinding* sampler_bindings = - translator.GetSamplerBindings(sampler_binding_count); - shader.SetTexturesAndSamplers(translator_texture_bindings, - texture_binding_count, sampler_bindings, - sampler_binding_count); - assert_false(bindless_resources_used_ && - texture_binding_count + sampler_binding_count > - D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * 4); - // Get hashable texture bindings, without translator-specific info. - const D3D12Shader::TextureBinding* texture_bindings = - shader.GetTextureBindings(texture_binding_count); - size_t texture_binding_layout_bytes = - texture_binding_count * sizeof(*texture_bindings); - uint64_t texture_binding_layout_hash = 0; - if (texture_binding_count) { - texture_binding_layout_hash = - XXH64(texture_bindings, texture_binding_layout_bytes, 0); - } - uint32_t bindless_sampler_count = - bindless_resources_used_ ? 
sampler_binding_count : 0; - uint64_t bindless_sampler_layout_hash = 0; - if (bindless_sampler_count) { - XXH64_state_t hash_state; - XXH64_reset(&hash_state, 0); - for (uint32_t i = 0; i < bindless_sampler_count; ++i) { - XXH64_update(&hash_state, &sampler_bindings[i].bindless_descriptor_index, - sizeof(sampler_bindings[i].bindless_descriptor_index)); - } - bindless_sampler_layout_hash = XXH64_digest(&hash_state); - } - // Obtain the unique IDs of binding layouts if there are any texture bindings - // or bindless samplers, for invalidation in the command processor. - size_t texture_binding_layout_uid = kLayoutUIDEmpty; - // Use sampler count for the bindful case because it's the only thing that - // must be the same for layouts to be compatible in this case - // (instruction-specified parameters are used as overrides for actual - // samplers). - static_assert( - kLayoutUIDEmpty == 0, - "Empty layout UID is assumed to be 0 because for bindful samplers, the " - "UID is their count"); - size_t sampler_binding_layout_uid = bindless_resources_used_ - ? kLayoutUIDEmpty - : size_t(sampler_binding_count); - if (texture_binding_count || bindless_sampler_count) { - std::lock_guard layouts_mutex_(layouts_mutex_); + // Set up texture and sampler binding layouts. 
+ if (shader.EnterBindingLayoutUserUIDSetup()) { + uint32_t texture_binding_count; + const D3D12Shader::TextureBinding* texture_bindings = + shader.GetTextureBindings(texture_binding_count); + uint32_t sampler_binding_count; + const D3D12Shader::SamplerBinding* sampler_bindings = + shader.GetSamplerBindings(sampler_binding_count); + assert_false(bindless_resources_used_ && + texture_binding_count + sampler_binding_count > + D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * 4); + size_t texture_binding_layout_bytes = + texture_binding_count * sizeof(*texture_bindings); + uint64_t texture_binding_layout_hash = 0; if (texture_binding_count) { - auto found_range = - texture_binding_layout_map_.equal_range(texture_binding_layout_hash); - for (auto it = found_range.first; it != found_range.second; ++it) { - if (it->second.vector_span_length == texture_binding_count && - !std::memcmp( - texture_binding_layouts_.data() + it->second.vector_span_offset, - texture_bindings, texture_binding_layout_bytes)) { - texture_binding_layout_uid = it->second.uid; - break; - } - } - if (texture_binding_layout_uid == kLayoutUIDEmpty) { - static_assert( - kLayoutUIDEmpty == 0, - "Layout UID is size + 1 because it's assumed that 0 is the UID for " - "an empty layout"); - texture_binding_layout_uid = texture_binding_layout_map_.size() + 1; - LayoutUID new_uid; - new_uid.uid = texture_binding_layout_uid; - new_uid.vector_span_offset = texture_binding_layouts_.size(); - new_uid.vector_span_length = texture_binding_count; - texture_binding_layouts_.resize(new_uid.vector_span_offset + - texture_binding_count); - std::memcpy( - texture_binding_layouts_.data() + new_uid.vector_span_offset, - texture_bindings, texture_binding_layout_bytes); - texture_binding_layout_map_.emplace(texture_binding_layout_hash, - new_uid); - } + texture_binding_layout_hash = + XXH64(texture_bindings, texture_binding_layout_bytes, 0); } + uint32_t bindless_sampler_count = + bindless_resources_used_ ? 
sampler_binding_count : 0; + uint64_t bindless_sampler_layout_hash = 0; if (bindless_sampler_count) { - auto found_range = - bindless_sampler_layout_map_.equal_range(sampler_binding_layout_uid); - for (auto it = found_range.first; it != found_range.second; ++it) { - if (it->second.vector_span_length != bindless_sampler_count) { - continue; - } - sampler_binding_layout_uid = it->second.uid; - const uint32_t* vector_bindless_sampler_layout = - bindless_sampler_layouts_.data() + it->second.vector_span_offset; - for (uint32_t i = 0; i < bindless_sampler_count; ++i) { - if (vector_bindless_sampler_layout[i] != - sampler_bindings[i].bindless_descriptor_index) { - sampler_binding_layout_uid = kLayoutUIDEmpty; + XXH64_state_t hash_state; + XXH64_reset(&hash_state, 0); + for (uint32_t i = 0; i < bindless_sampler_count; ++i) { + XXH64_update(&hash_state, + &sampler_bindings[i].bindless_descriptor_index, + sizeof(sampler_bindings[i].bindless_descriptor_index)); + } + bindless_sampler_layout_hash = XXH64_digest(&hash_state); + } + // Obtain the unique IDs of binding layouts if there are any texture + // bindings or bindless samplers, for invalidation in the command processor. + size_t texture_binding_layout_uid = kLayoutUIDEmpty; + // Use sampler count for the bindful case because it's the only thing that + // must be the same for layouts to be compatible in this case + // (instruction-specified parameters are used as overrides for actual + // samplers). + static_assert( + kLayoutUIDEmpty == 0, + "Empty layout UID is assumed to be 0 because for bindful samplers, the " + "UID is their count"); + size_t sampler_binding_layout_uid = bindless_resources_used_ + ? 
kLayoutUIDEmpty + : size_t(sampler_binding_count); + if (texture_binding_count || bindless_sampler_count) { + std::lock_guard layouts_mutex_(layouts_mutex_); + if (texture_binding_count) { + auto found_range = texture_binding_layout_map_.equal_range( + texture_binding_layout_hash); + for (auto it = found_range.first; it != found_range.second; ++it) { + if (it->second.vector_span_length == texture_binding_count && + !std::memcmp(texture_binding_layouts_.data() + + it->second.vector_span_offset, + texture_bindings, texture_binding_layout_bytes)) { + texture_binding_layout_uid = it->second.uid; break; } } - if (sampler_binding_layout_uid != kLayoutUIDEmpty) { - break; + if (texture_binding_layout_uid == kLayoutUIDEmpty) { + static_assert( + kLayoutUIDEmpty == 0, + "Layout UID is size + 1 because it's assumed that 0 is the UID " + "for an empty layout"); + texture_binding_layout_uid = texture_binding_layout_map_.size() + 1; + LayoutUID new_uid; + new_uid.uid = texture_binding_layout_uid; + new_uid.vector_span_offset = texture_binding_layouts_.size(); + new_uid.vector_span_length = texture_binding_count; + texture_binding_layouts_.resize(new_uid.vector_span_offset + + texture_binding_count); + std::memcpy( + texture_binding_layouts_.data() + new_uid.vector_span_offset, + texture_bindings, texture_binding_layout_bytes); + texture_binding_layout_map_.emplace(texture_binding_layout_hash, + new_uid); } } - if (sampler_binding_layout_uid == kLayoutUIDEmpty) { - sampler_binding_layout_uid = bindless_sampler_layout_map_.size(); - LayoutUID new_uid; - static_assert( - kLayoutUIDEmpty == 0, - "Layout UID is size + 1 because it's assumed that 0 is the UID for " - "an empty layout"); - new_uid.uid = sampler_binding_layout_uid + 1; - new_uid.vector_span_offset = bindless_sampler_layouts_.size(); - new_uid.vector_span_length = sampler_binding_count; - bindless_sampler_layouts_.resize(new_uid.vector_span_offset + - sampler_binding_count); - uint32_t* vector_bindless_sampler_layout = 
- bindless_sampler_layouts_.data() + new_uid.vector_span_offset; - for (uint32_t i = 0; i < bindless_sampler_count; ++i) { - vector_bindless_sampler_layout[i] = - sampler_bindings[i].bindless_descriptor_index; + if (bindless_sampler_count) { + auto found_range = bindless_sampler_layout_map_.equal_range( + sampler_binding_layout_uid); + for (auto it = found_range.first; it != found_range.second; ++it) { + if (it->second.vector_span_length != bindless_sampler_count) { + continue; + } + sampler_binding_layout_uid = it->second.uid; + const uint32_t* vector_bindless_sampler_layout = + bindless_sampler_layouts_.data() + it->second.vector_span_offset; + for (uint32_t i = 0; i < bindless_sampler_count; ++i) { + if (vector_bindless_sampler_layout[i] != + sampler_bindings[i].bindless_descriptor_index) { + sampler_binding_layout_uid = kLayoutUIDEmpty; + break; + } + } + if (sampler_binding_layout_uid != kLayoutUIDEmpty) { + break; + } + } + if (sampler_binding_layout_uid == kLayoutUIDEmpty) { + sampler_binding_layout_uid = bindless_sampler_layout_map_.size(); + LayoutUID new_uid; + static_assert( + kLayoutUIDEmpty == 0, + "Layout UID is size + 1 because it's assumed that 0 is the UID " + "for an empty layout"); + new_uid.uid = sampler_binding_layout_uid + 1; + new_uid.vector_span_offset = bindless_sampler_layouts_.size(); + new_uid.vector_span_length = sampler_binding_count; + bindless_sampler_layouts_.resize(new_uid.vector_span_offset + + sampler_binding_count); + uint32_t* vector_bindless_sampler_layout = + bindless_sampler_layouts_.data() + new_uid.vector_span_offset; + for (uint32_t i = 0; i < bindless_sampler_count; ++i) { + vector_bindless_sampler_layout[i] = + sampler_bindings[i].bindless_descriptor_index; + } + bindless_sampler_layout_map_.emplace(bindless_sampler_layout_hash, + new_uid); } - bindless_sampler_layout_map_.emplace(bindless_sampler_layout_hash, - new_uid); } } - } - shader.SetTextureBindingLayoutUserUID(texture_binding_layout_uid); - 
shader.SetSamplerBindingLayoutUserUID(sampler_binding_layout_uid); - - // Create a version of the shader with early depth/stencil forced by Xenia - // itself when it's safe to do so or when EARLY_Z_ENABLE is set in - // RB_DEPTHCONTROL. - if (shader.type() == xenos::ShaderType::kPixel && !edram_rov_used_ && - !shader.writes_depth()) { - shader.SetForcedEarlyZShaderObject( - std::move(DxbcShaderTranslator::ForceEarlyDepthStencil( - shader.translated_binary().data()))); + shader.SetTextureBindingLayoutUserUID(texture_binding_layout_uid); + shader.SetSamplerBindingLayoutUserUID(sampler_binding_layout_uid); } // Disassemble the shader for dumping. auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); if (cvars::d3d12_dxbc_disasm_dxilconv) { - shader.DisassembleDxbc(provider, cvars::d3d12_dxbc_disasm, dxbc_converter, - dxc_utils, dxc_compiler); + translation.DisassembleDxbcAndDxil(provider, cvars::d3d12_dxbc_disasm, + dxbc_converter, dxc_utils, dxc_compiler); } else { - shader.DisassembleDxbc(provider, cvars::d3d12_dxbc_disasm); + translation.DisassembleDxbcAndDxil(provider, cvars::d3d12_dxbc_disasm); } // Dump shader files if desired. if (!cvars::dump_shaders.empty()) { - shader.Dump(cvars::dump_shaders, - (shader.type() == xenos::ShaderType::kPixel) - ? (edram_rov_used_ ? "d3d12_rov" : "d3d12_rtv") - : "d3d12"); + translation.Dump(cvars::dump_shaders, + (shader.type() == xenos::ShaderType::kPixel) + ? (edram_rov_used_ ? 
"d3d12_rov" : "d3d12_rtv") + : "d3d12"); } - return shader.is_valid(); + return translation.is_valid(); } bool PipelineCache::GetCurrentStateDescription( - D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, + D3D12Shader::D3D12Translation* vertex_shader, + D3D12Shader::D3D12Translation* pixel_shader, xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format, - bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], PipelineRuntimeDescription& runtime_description_out) { PipelineDescription& description_out = runtime_description_out.description; @@ -1212,19 +1329,30 @@ bool PipelineCache::GetCurrentStateDescription( // Initialize all unused fields to zero for comparison/hashing. std::memset(&runtime_description_out, 0, sizeof(runtime_description_out)); + bool tessellated = + DxbcShaderTranslator::Modification(vertex_shader->modification()) + .host_vertex_shader_type != Shader::HostVertexShaderType::kVertex; + // Root signature. - runtime_description_out.root_signature = - command_processor_.GetRootSignature(vertex_shader, pixel_shader); + runtime_description_out.root_signature = command_processor_.GetRootSignature( + static_cast(&vertex_shader->shader()), + pixel_shader ? static_cast(&pixel_shader->shader()) + : nullptr, + tessellated); if (runtime_description_out.root_signature == nullptr) { return false; } // Shaders. 
runtime_description_out.vertex_shader = vertex_shader; - description_out.vertex_shader_hash = vertex_shader->ucode_data_hash(); + description_out.vertex_shader_hash = + vertex_shader->shader().ucode_data_hash(); + description_out.vertex_shader_modification = vertex_shader->modification(); if (pixel_shader) { runtime_description_out.pixel_shader = pixel_shader; - description_out.pixel_shader_hash = pixel_shader->ucode_data_hash(); + description_out.pixel_shader_hash = + pixel_shader->shader().ucode_data_hash(); + description_out.pixel_shader_modification = pixel_shader->modification(); } // Index buffer strip cut value. @@ -1239,13 +1367,10 @@ bool PipelineCache::GetCurrentStateDescription( } // Host vertex shader type and primitive topology. - Shader::HostVertexShaderType host_vertex_shader_type = - GetHostVertexShaderTypeIfValid(); - if (host_vertex_shader_type == Shader::HostVertexShaderType(-1)) { - return false; - } - description_out.host_vertex_shader_type = host_vertex_shader_type; - if (host_vertex_shader_type == Shader::HostVertexShaderType::kVertex) { + if (tessellated) { + description_out.primitive_topology_type_or_tessellation_mode = + uint32_t(regs.Get().tess_mode); + } else { switch (primitive_type) { case xenos::PrimitiveType::kPointList: description_out.primitive_topology_type_or_tessellation_mode = @@ -1280,14 +1405,10 @@ bool PipelineCache::GetCurrentStateDescription( description_out.geometry_shader = PipelineGeometryShader::kNone; break; } - } else { - description_out.primitive_topology_type_or_tessellation_mode = - uint32_t(regs.Get().tess_mode); } - bool primitive_polygonal = xenos::IsPrimitivePolygonal( - host_vertex_shader_type != Shader::HostVertexShaderType::kVertex, - primitive_type); + bool primitive_polygonal = + xenos::IsPrimitivePolygonal(tessellated, primitive_type); // Rasterizer state. 
// Because Direct3D 12 doesn't support per-side fill mode and depth bias, the @@ -1386,8 +1507,7 @@ bool PipelineCache::GetCurrentStateDescription( description_out.depth_bias_slope_scaled = poly_offset_scale * (1.0f / 16.0f); } - if (cvars::d3d12_tessellation_wireframe && - host_vertex_shader_type != Shader::HostVertexShaderType::kVertex) { + if (tessellated && cvars::d3d12_tessellation_wireframe) { description_out.fill_mode_wireframe = 1; } description_out.depth_clip = !regs.Get().clip_disable; @@ -1453,13 +1573,11 @@ bool PipelineCache::GetCurrentStateDescription( } else { description_out.depth_func = xenos::CompareFunction::kAlways; } - if (early_z) { - description_out.force_early_z = 1; - } // Render targets and blending state. 32 because of 0x1F mask, for safety // (all unknown to zero). - uint32_t color_mask = command_processor_.GetCurrentColorMask(pixel_shader); + uint32_t color_mask = command_processor_.GetCurrentColorMask( + pixel_shader ? &pixel_shader->shader() : nullptr); static const PipelineBlendFactor kBlendFactorMap[32] = { /* 0 */ PipelineBlendFactor::kZero, /* 1 */ PipelineBlendFactor::kOne, @@ -1550,11 +1668,11 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( if (runtime_description.pixel_shader != nullptr) { XELOGGPU("Creating graphics pipeline with VS {:016X}, PS {:016X}", - runtime_description.vertex_shader->ucode_data_hash(), - runtime_description.pixel_shader->ucode_data_hash()); + runtime_description.vertex_shader->shader().ucode_data_hash(), + runtime_description.pixel_shader->shader().ucode_data_hash()); } else { XELOGGPU("Creating graphics pipeline with VS {:016X}", - runtime_description.vertex_shader->ucode_data_hash()); + runtime_description.vertex_shader->shader().ucode_data_hash()); } D3D12_GRAPHICS_PIPELINE_STATE_DESC state_desc; @@ -1580,21 +1698,14 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( // Primitive topology, vertex, hull, domain and geometry shaders. 
if (!runtime_description.vertex_shader->is_translated()) { XELOGE("Vertex shader {:016X} not translated", - runtime_description.vertex_shader->ucode_data_hash()); + runtime_description.vertex_shader->shader().ucode_data_hash()); assert_always(); return nullptr; } Shader::HostVertexShaderType host_vertex_shader_type = - description.host_vertex_shader_type; - if (runtime_description.vertex_shader->host_vertex_shader_type() != - host_vertex_shader_type) { - XELOGE( - "Vertex shader {:016X} translated into the wrong host shader " - "type", - runtime_description.vertex_shader->ucode_data_hash()); - assert_always(); - return nullptr; - } + DxbcShaderTranslator::Modification( + runtime_description.vertex_shader->modification()) + .host_vertex_shader_type; if (host_vertex_shader_type == Shader::HostVertexShaderType::kVertex) { state_desc.VS.pShaderBytecode = runtime_description.vertex_shader->translated_binary().data(); @@ -1704,24 +1815,34 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( if (runtime_description.pixel_shader != nullptr) { if (!runtime_description.pixel_shader->is_translated()) { XELOGE("Pixel shader {:016X} not translated", - runtime_description.pixel_shader->ucode_data_hash()); + runtime_description.pixel_shader->shader().ucode_data_hash()); assert_always(); return nullptr; } - const auto& forced_early_z_shader = - runtime_description.pixel_shader->GetForcedEarlyZShaderObject(); - if (description.force_early_z && forced_early_z_shader.size() != 0) { - state_desc.PS.pShaderBytecode = forced_early_z_shader.data(); - state_desc.PS.BytecodeLength = forced_early_z_shader.size(); - } else { - state_desc.PS.pShaderBytecode = - runtime_description.pixel_shader->translated_binary().data(); - state_desc.PS.BytecodeLength = - runtime_description.pixel_shader->translated_binary().size(); - } + state_desc.PS.pShaderBytecode = + runtime_description.pixel_shader->translated_binary().data(); + state_desc.PS.BytecodeLength = + 
runtime_description.pixel_shader->translated_binary().size(); } else if (edram_rov_used_) { state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data(); state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size(); + } else { + if ((description.depth_func != xenos::CompareFunction::kAlways || + description.depth_write) && + description.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) { + switch (depth_float24_conversion_) { + case flags::DepthFloat24Conversion::kOnOutputTruncating: + state_desc.PS.pShaderBytecode = float24_truncate_ps; + state_desc.PS.BytecodeLength = sizeof(float24_truncate_ps); + break; + case flags::DepthFloat24Conversion::kOnOutputRounding: + state_desc.PS.pShaderBytecode = float24_round_ps; + state_desc.PS.BytecodeLength = sizeof(float24_round_ps); + break; + default: + break; + } + } } // Rasterizer state. @@ -1810,9 +1931,6 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( state_desc.DSVFormat = RenderTargetCache::GetDepthDXGIFormat(description.depth_format); } - // TODO(Triang3l): EARLY_Z_ENABLE (needs to be enabled in shaders, but alpha - // test is dynamic - should be enabled anyway if there's no alpha test, - // discarding and depth output). // Render targets and blending. 
state_desc.BlendState.IndependentBlendEnable = TRUE; @@ -1874,22 +1992,24 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( IID_PPV_ARGS(&state)))) { if (runtime_description.pixel_shader != nullptr) { XELOGE("Failed to create graphics pipeline with VS {:016X}, PS {:016X}", - runtime_description.vertex_shader->ucode_data_hash(), - runtime_description.pixel_shader->ucode_data_hash()); + runtime_description.vertex_shader->shader().ucode_data_hash(), + runtime_description.pixel_shader->shader().ucode_data_hash()); } else { XELOGE("Failed to create graphics pipeline with VS {:016X}", - runtime_description.vertex_shader->ucode_data_hash()); + runtime_description.vertex_shader->shader().ucode_data_hash()); } return nullptr; } std::wstring name; if (runtime_description.pixel_shader != nullptr) { - name = fmt::format(L"VS {:016X}, PS {:016X}", - runtime_description.vertex_shader->ucode_data_hash(), - runtime_description.pixel_shader->ucode_data_hash()); + name = fmt::format( + L"VS {:016X}, PS {:016X}", + runtime_description.vertex_shader->shader().ucode_data_hash(), + runtime_description.pixel_shader->shader().ucode_data_hash()); } else { - name = fmt::format(L"VS {:016X}", - runtime_description.vertex_shader->ucode_data_hash()); + name = fmt::format( + L"VS {:016X}", + runtime_description.vertex_shader->shader().ucode_data_hash()); } state->SetName(name.c_str()); return state; @@ -1954,7 +2074,6 @@ void PipelineCache::StorageWriteThread() { shader_header.ucode_data_hash = shader->ucode_data_hash(); shader_header.ucode_dword_count = shader->ucode_dword_count(); shader_header.type = shader->type(); - shader_header.host_vertex_shader_type = shader->host_vertex_shader_type(); shader_header.sq_program_cntl = shader_pair.second; assert_not_null(shader_storage_file_); fwrite(&shader_header, sizeof(shader_header), 1, shader_storage_file_); diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index 8159416d0..d09d373b8 100644 --- 
a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -27,6 +27,7 @@ #include "xenia/gpu/d3d12/d3d12_shader.h" #include "xenia/gpu/d3d12/render_target_cache.h" #include "xenia/gpu/dxbc_shader_translator.h" +#include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/xenos.h" #include "xenia/ui/d3d12/d3d12_api.h" @@ -43,36 +44,39 @@ class PipelineCache { PipelineCache(D3D12CommandProcessor& command_processor, const RegisterFile& register_file, bool bindless_resources_used, - bool edram_rov_used, uint32_t resolution_scale); + bool edram_rov_used, + flags::DepthFloat24Conversion depth_float24_conversion, + uint32_t resolution_scale); ~PipelineCache(); bool Initialize(); void Shutdown(); void ClearCache(bool shutting_down = false); - void InitializeShaderStorage(const std::filesystem::path& storage_root, + void InitializeShaderStorage(const std::filesystem::path& cache_root, uint32_t title_id, bool blocking); void ShutdownShaderStorage(); void EndSubmission(); bool IsCreatingPipelines(); - D3D12Shader* LoadShader(xenos::ShaderType shader_type, uint32_t guest_address, + D3D12Shader* LoadShader(xenos::ShaderType shader_type, const uint32_t* host_address, uint32_t dword_count); - // Returns the host vertex shader type for the current draw if it's valid and - // supported, or Shader::HostVertexShaderType(-1) if not. - Shader::HostVertexShaderType GetHostVertexShaderTypeIfValid() const; + // Retrieves the shader modifications for the current state, and returns + // whether they are valid. + bool GetCurrentShaderModifications( + DxbcShaderTranslator::Modification& vertex_shader_modification_out, + DxbcShaderTranslator::Modification& pixel_shader_modification_out) const; // Translates shaders if needed, also making shader info up to date. 
- bool EnsureShadersTranslated( - D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, - Shader::HostVertexShaderType host_vertex_shader_type); + bool EnsureShadersTranslated(D3D12Shader::D3D12Translation* vertex_shader, + D3D12Shader::D3D12Translation* pixel_shader); bool ConfigurePipeline( - D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, + D3D12Shader::D3D12Translation* vertex_shader, + D3D12Shader::D3D12Translation* pixel_shader, xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format, - bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], void** pipeline_handle_out, ID3D12RootSignature** root_signature_out); @@ -86,13 +90,12 @@ class PipelineCache { XEPACKEDSTRUCT(ShaderStoredHeader, { uint64_t ucode_data_hash; - uint32_t ucode_dword_count : 16; + uint32_t ucode_dword_count : 31; xenos::ShaderType type : 1; - Shader::HostVertexShaderType host_vertex_shader_type : 3; reg::SQ_PROGRAM_CNTL sq_program_cntl; - static constexpr uint32_t kVersion = 0x20200405; + static constexpr uint32_t kVersion = 0x20201129; }); // Update PipelineDescription::kVersion if any of the Pipeline* enums are @@ -170,28 +173,28 @@ class PipelineCache { uint64_t vertex_shader_hash; // 0 if drawing without a pixel shader. uint64_t pixel_shader_hash; + uint32_t vertex_shader_modification; + uint32_t pixel_shader_modification; int32_t depth_bias; float depth_bias_slope_scaled; PipelineStripCutIndex strip_cut_index : 2; // 2 - Shader::HostVertexShaderType host_vertex_shader_type : 3; // 5 // PipelinePrimitiveTopologyType for a vertex shader. // xenos::TessellationMode for a domain shader. - uint32_t primitive_topology_type_or_tessellation_mode : 2; // 7 + uint32_t primitive_topology_type_or_tessellation_mode : 2; // 4 // Zero for non-kVertex host_vertex_shader_type. 
- PipelineGeometryShader geometry_shader : 2; // 9 - uint32_t fill_mode_wireframe : 1; // 10 - PipelineCullMode cull_mode : 2; // 12 - uint32_t front_counter_clockwise : 1; // 13 - uint32_t depth_clip : 1; // 14 - uint32_t rov_msaa : 1; // 15 - xenos::DepthRenderTargetFormat depth_format : 1; // 16 - xenos::CompareFunction depth_func : 3; // 19 - uint32_t depth_write : 1; // 20 - uint32_t stencil_enable : 1; // 21 - uint32_t stencil_read_mask : 8; // 29 - uint32_t force_early_z : 1; // 30 + PipelineGeometryShader geometry_shader : 2; // 6 + uint32_t fill_mode_wireframe : 1; // 7 + PipelineCullMode cull_mode : 2; // 9 + uint32_t front_counter_clockwise : 1; // 10 + uint32_t depth_clip : 1; // 11 + uint32_t rov_msaa : 1; // 12 + xenos::DepthRenderTargetFormat depth_format : 1; // 13 + xenos::CompareFunction depth_func : 3; // 16 + uint32_t depth_write : 1; // 17 + uint32_t stencil_enable : 1; // 18 + uint32_t stencil_read_mask : 8; // 26 uint32_t stencil_write_mask : 8; // 8 xenos::StencilOp stencil_front_fail_op : 3; // 11 @@ -205,7 +208,7 @@ class PipelineCache { PipelineRenderTarget render_targets[4]; - static constexpr uint32_t kVersion = 0x20200405; + static constexpr uint32_t kVersion = 0x20201202; }); XEPACKEDSTRUCT(PipelineStoredDescription, { @@ -215,24 +218,31 @@ class PipelineCache { struct PipelineRuntimeDescription { ID3D12RootSignature* root_signature; - D3D12Shader* vertex_shader; - D3D12Shader* pixel_shader; + D3D12Shader::D3D12Translation* vertex_shader; + D3D12Shader::D3D12Translation* pixel_shader; PipelineDescription description; }; + // Returns the host vertex shader type for the current draw if it's valid and + // supported, or Shader::HostVertexShaderType(-1) if not. + Shader::HostVertexShaderType GetCurrentHostVertexShaderTypeIfValid() const; + + D3D12Shader* LoadShader(xenos::ShaderType shader_type, + const uint32_t* host_address, uint32_t dword_count, + uint64_t data_hash); + // Can be called from multiple threads. 
- bool TranslateShader(DxbcShaderTranslator& translator, D3D12Shader& shader, + bool TranslateShader(DxbcShaderTranslator& translator, + D3D12Shader::D3D12Translation& translation, reg::SQ_PROGRAM_CNTL cntl, IDxbcConverter* dxbc_converter = nullptr, IDxcUtils* dxc_utils = nullptr, - IDxcCompiler* dxc_compiler = nullptr, - Shader::HostVertexShaderType host_vertex_shader_type = - Shader::HostVertexShaderType::kVertex); + IDxcCompiler* dxc_compiler = nullptr); bool GetCurrentStateDescription( - D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, + D3D12Shader::D3D12Translation* vertex_shader, + D3D12Shader::D3D12Translation* pixel_shader, xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format, - bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], PipelineRuntimeDescription& runtime_description_out); @@ -243,6 +253,8 @@ class PipelineCache { const RegisterFile& register_file_; bool bindless_resources_used_; bool edram_rov_used_; + // 20e4 depth conversion mode to use for non-ROV output. + flags::DepthFloat24Conversion depth_float24_conversion_; uint32_t resolution_scale_; // Reusable shader translator. @@ -300,11 +312,14 @@ class PipelineCache { Pipeline* current_pipeline_ = nullptr; // Currently open shader storage path. - std::filesystem::path shader_storage_root_; + std::filesystem::path shader_storage_cache_root_; uint32_t shader_storage_title_id_ = 0; // Shader storage output stream, for preload in the next emulator runs. FILE* shader_storage_file_ = nullptr; + // For only writing shaders to the currently open storage once, incremented + // when switching the storage. + uint32_t shader_storage_index_ = 0; bool shader_storage_file_flush_needed_ = false; // Pipeline storage output stream, for preload in the next emulator runs. 
diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index 66ef2ba9f..8669d58a3 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -40,11 +40,13 @@ namespace d3d12 { #include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_32bpp_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_64bpp_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_7e3_cs.h" +#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_unorm_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_store_color_32bpp_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_store_color_64bpp_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_store_color_7e3_cs.h" +#include "xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_unorm_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/resolve_clear_32bpp_2xres_cs.h" @@ -87,6 +89,12 @@ const RenderTargetCache::EdramLoadStoreModeInfo {edram_load_depth_float_cs, sizeof(edram_load_depth_float_cs), L"EDRAM Load Float Depth", edram_store_depth_float_cs, sizeof(edram_store_depth_float_cs), L"EDRAM Store Float Depth"}, + {edram_load_depth_float24and32_cs, + sizeof(edram_load_depth_float24and32_cs), + L"EDRAM Load 24-bit & 32-bit Float Depth", + edram_store_depth_float24and32_cs, + sizeof(edram_store_depth_float24and32_cs), + L"EDRAM Store 24-bit & 32-bit Float Depth"}, }; const std::pair @@ -126,6 +134,8 @@ RenderTargetCache::RenderTargetCache(D3D12CommandProcessor& command_processor, RenderTargetCache::~RenderTargetCache() { Shutdown(); } bool RenderTargetCache::Initialize(const TextureCache& texture_cache) { + depth_float24_conversion_ = flags::GetDepthFloat24Conversion(); + // 
EDRAM buffer size depends on this. resolution_scale_2x_ = texture_cache.IsResolutionScale2X(); assert_false(resolution_scale_2x_ && !edram_rov_used_); @@ -420,7 +430,8 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) { return false; } resolve_clear_64bpp_pipeline_->SetName(L"Resolve Clear 64bpp"); - if (!edram_rov_used_) { + if (!edram_rov_used_ && + depth_float24_conversion_ == flags::DepthFloat24Conversion::kOnCopy) { assert_false(resolution_scale_2x_); resolve_clear_depth_24_32_pipeline_ = ui::d3d12::util::CreateComputePipeline( @@ -434,7 +445,7 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) { Shutdown(); return false; } - resolve_clear_64bpp_pipeline_->SetName( + resolve_clear_depth_24_32_pipeline_->SetName( L"Resolve Clear 24-bit & 32-bit Depth"); } @@ -1266,10 +1277,12 @@ bool RenderTargetCache::Resolve(const Memory& memory, if (clear_depth) { // Also clear the host 32-bit floating-point depth used for loaing and // storing 24-bit floating-point depth at full precision. 
- bool clear_float32_depth = - !edram_rov_used_ && xenos::DepthRenderTargetFormat( - resolve_info.depth_edram_info.format) == - xenos::DepthRenderTargetFormat::kD24FS8; + bool clear_float32_depth = !edram_rov_used_ && + depth_float24_conversion_ == + flags::DepthFloat24Conversion::kOnCopy && + xenos::DepthRenderTargetFormat( + resolve_info.depth_edram_info.format) == + xenos::DepthRenderTargetFormat::kD24FS8; draw_util::ResolveClearShaderConstants depth_clear_constants; resolve_info.GetDepthClearShaderConstants(clear_float32_depth, depth_clear_constants); @@ -1558,7 +1571,8 @@ void RenderTargetCache::RestoreEdramSnapshot(const void* snapshot) { uint32_t RenderTargetCache::GetEdramBufferSize() const { uint32_t size = xenos::kEdramSizeBytes; - if (!edram_rov_used_) { + if (!edram_rov_used_ && + depth_float24_conversion_ == flags::DepthFloat24Conversion::kOnCopy) { // Two 10 MB pages, one containing color and integer depth data, another // with 32-bit float depth when 20e4 depth is used to allow for multipass // drawing without precision loss in case of EDRAM store/load. @@ -1831,12 +1845,15 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget( } RenderTargetCache::EdramLoadStoreMode RenderTargetCache::GetLoadStoreMode( - bool is_depth, uint32_t format) { + bool is_depth, uint32_t format) const { if (is_depth) { - return xenos::DepthRenderTargetFormat(format) == - xenos::DepthRenderTargetFormat::kD24FS8 - ? EdramLoadStoreMode::kDepthFloat - : EdramLoadStoreMode::kDepthUnorm; + if (xenos::DepthRenderTargetFormat(format) == + xenos::DepthRenderTargetFormat::kD24FS8) { + return depth_float24_conversion_ == flags::DepthFloat24Conversion::kOnCopy + ? 
EdramLoadStoreMode::kDepthFloat24And32 + : EdramLoadStoreMode::kDepthFloat; + } + return EdramLoadStoreMode::kDepthUnorm; } xenos::ColorRenderTargetFormat color_format = xenos::ColorRenderTargetFormat(format); diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index 5069b3cb7..2f71c13c8 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -18,6 +18,7 @@ #include "xenia/gpu/d3d12/d3d12_shared_memory.h" #include "xenia/gpu/d3d12/texture_cache.h" #include "xenia/gpu/draw_util.h" +#include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/trace_writer.h" #include "xenia/gpu/xenos.h" @@ -259,6 +260,10 @@ class RenderTargetCache { void Shutdown(); void ClearCache(); + flags::DepthFloat24Conversion depth_float24_conversion() const { + return depth_float24_conversion_; + } + void CompletedSubmissionUpdated(); void BeginSubmission(); void EndFrame(); @@ -318,6 +323,7 @@ class RenderTargetCache { kColor7e3, kDepthUnorm, kDepthFloat, + kDepthFloat24And32, kCount }; @@ -424,7 +430,7 @@ class RenderTargetCache { uint32_t instance); #endif - static EdramLoadStoreMode GetLoadStoreMode(bool is_depth, uint32_t format); + EdramLoadStoreMode GetLoadStoreMode(bool is_depth, uint32_t format) const; // Must be in a frame to call. Stores the dirty areas of the currently bound // render targets and marks them as clean. @@ -442,6 +448,9 @@ class RenderTargetCache { bool bindless_resources_used_; bool edram_rov_used_; + // 20e4 depth conversion mode to use for non-ROV output. + flags::DepthFloat24Conversion depth_float24_conversion_; + // Whether 1 guest pixel is rendered as 2x2 host pixels (currently only // supported with ROV). 
bool resolution_scale_2x_ = false; diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.cso new file mode 100644 index 000000000..c389242a1 Binary files /dev/null and b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.cso differ diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.h b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.h new file mode 100644 index 000000000..34f44b18c --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.h @@ -0,0 +1,296 @@ +// generated from `xb buildhlsl` +// source: edram_load_depth_float24and32.cs.hlsl +const uint8_t edram_load_depth_float24and32_cs[] = { + 0x44, 0x58, 0x42, 0x43, 0xF3, 0xA3, 0xA4, 0x14, 0x0A, 0x50, 0x56, 0x49, + 0x5D, 0x09, 0x6C, 0xBF, 0x33, 0xC9, 0xC1, 0x9A, 0x01, 0x00, 0x00, 0x00, + 0xAC, 0x0D, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x0C, 0x03, 0x00, 0x00, 0x1C, 0x03, 0x00, 0x00, 0x2C, 0x03, 0x00, 0x00, + 0x10, 0x0D, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xD0, 0x02, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0x53, 0x43, 0x00, 0x05, 0x00, 0x00, + 0xA8, 0x02, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xB4, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xCF, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0xE8, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x6C, 0x6F, 0x61, + 0x64, 0x5F, 0x73, 0x74, 0x6F, 0x72, 0x65, 0x5F, 0x73, 0x6F, 0x75, 0x72, + 0x63, 0x65, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, + 0x6C, 0x6F, 0x61, 0x64, 0x5F, 0x73, 0x74, 0x6F, 0x72, 0x65, 0x5F, 0x64, + 0x65, 0x73, 0x74, 0x00, 0x58, 0x65, 0x45, 0x64, 0x72, 0x61, 0x6D, 0x4C, + 0x6F, 0x61, 0x64, 0x53, 0x74, 0x6F, 0x72, 0x65, 0x43, 0x6F, 0x6E, 0x73, + 0x74, 0x61, 0x6E, 0x74, 0x73, 0x00, 0xAB, 0xAB, 0xE8, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x1C, 0x01, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, + 0x30, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0x4E, 0x02, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x69, 0x02, 0x00, 0x00, + 0x0C, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, + 0x83, 0x02, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 
0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, + 0x5F, 0x72, 0x74, 0x5F, 0x63, 0x6F, 0x6C, 0x6F, 0x72, 0x5F, 0x64, 0x65, + 0x70, 0x74, 0x68, 0x5F, 0x6F, 0x66, 0x66, 0x73, 0x65, 0x74, 0x00, 0x64, + 0x77, 0x6F, 0x72, 0x64, 0x00, 0xAB, 0xAB, 0xAB, 0x00, 0x00, 0x13, 0x00, + 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x02, 0x00, 0x00, 0x78, 0x65, 0x5F, 0x65, + 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x72, 0x74, 0x5F, 0x63, 0x6F, 0x6C, 0x6F, + 0x72, 0x5F, 0x64, 0x65, 0x70, 0x74, 0x68, 0x5F, 0x70, 0x69, 0x74, 0x63, + 0x68, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x72, + 0x74, 0x5F, 0x73, 0x74, 0x65, 0x6E, 0x63, 0x69, 0x6C, 0x5F, 0x6F, 0x66, + 0x66, 0x73, 0x65, 0x74, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, + 0x6D, 0x5F, 0x72, 0x74, 0x5F, 0x73, 0x74, 0x65, 0x6E, 0x63, 0x69, 0x6C, + 0x5F, 0x70, 0x69, 0x74, 0x63, 0x68, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, + 0x72, 0x61, 0x6D, 0x5F, 0x62, 0x61, 0x73, 0x65, 0x5F, 0x73, 0x61, 0x6D, + 0x70, 0x6C, 0x65, 0x73, 0x5F, 0x32, 0x78, 0x5F, 0x64, 0x65, 0x70, 0x74, + 0x68, 0x5F, 0x70, 0x69, 0x74, 0x63, 0x68, 0x00, 0x4D, 0x69, 0x63, 0x72, + 0x6F, 0x73, 0x6F, 0x66, 0x74, 0x20, 0x28, 0x52, 0x29, 0x20, 0x48, 0x4C, + 0x53, 0x4C, 0x20, 0x53, 0x68, 0x61, 0x64, 0x65, 0x72, 0x20, 0x43, 0x6F, + 0x6D, 0x70, 0x69, 0x6C, 0x65, 0x72, 0x20, 0x31, 0x30, 0x2E, 0x31, 0x00, + 0x49, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x4F, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x53, 0x48, 0x45, 0x58, + 0xDC, 0x09, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x77, 0x02, 0x00, 0x00, + 0x6A, 0x08, 0x00, 0x01, 0x59, 0x00, 0x00, 0x07, 0x46, 0x8E, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x06, + 0x46, 0x7E, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x00, 0x00, 0x06, + 0x46, 0xEE, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x02, + 0x32, 0x10, 0x02, 0x00, 0x5F, 0x00, 0x00, 0x02, 0x32, 0x20, 0x02, 0x00, + 0x5F, 0x00, 0x00, 0x02, 0x32, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x02, + 0x07, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x06, + 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0C, + 0x62, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x80, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, + 0xFF, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x04, 0x03, + 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x06, + 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x06, + 0x82, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, + 0xA2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x56, 0x0D, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xFF, 0xFF, 0xFF, + 0x1E, 0x00, 0x00, 0x07, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x01, + 0x55, 0x00, 0x00, 0x09, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x08, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x10, 0x02, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x06, + 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x10, 0x02, 0x00, 0x26, 0x00, 0x00, 0x07, + 0x00, 0xD0, 0x00, 0x00, 0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, 0x40, 0x01, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x09, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0B, + 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, + 0x0A, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, 0x22, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, + 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x7E, 0x20, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x00, 0x00, 0xA0, 0x00, 0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x46, 0x7E, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x05, 0xF2, 0x00, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0xF5, 0xFF, 0xFF, 0xFF, 0xF5, 0xFF, 0xFF, 0xFF, 0xF5, 0xFF, 0xFF, 0xFF, + 0xF5, 0xFF, 0xFF, 0xFF, 0x37, 0x00, 0x00, 0x0C, 0xF2, 0x00, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 
0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x15, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B, 0xF2, 0x00, 0x10, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, 0x41, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x37, 0x00, 0x00, 0x09, 0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0xFF, 0xFF, 0x0F, 0x00, 0xFF, 0xFF, 0x0F, 0x00, + 0xFF, 0xFF, 0x0F, 0x00, 0xFF, 0xFF, 0x0F, 0x00, 0x37, 0x00, 0x00, 0x09, + 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x38, + 0x00, 0x00, 0x00, 0x38, 0x29, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 
0x03, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, + 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x37, 0x00, 0x00, 0x0C, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, + 0x41, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, + 0xFF, 0xFF, 0xFF, 0x7F, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, + 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, 0x8C, 0x00, 0x00, 0x14, + 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, + 0x00, 0x00, 0x80, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 
0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B, + 0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, + 0x41, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x07, + 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4F, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, + 0x00, 0x00, 0x80, 0x38, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, + 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, 0x37, 0x00, 0x00, 0x09, + 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x09, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x02, 0x00, 0x02, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x0A, 0x32, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x56, 0x05, 0x02, 0x00, 0xD6, 0x85, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x86, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0xA6, 0x00, 0x00, 0x08, 0xF2, 0xE0, 0x21, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0x00, 0x8C, 0x00, 0x00, 0x14, 0xE2, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x56, 0x0E, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x07, 0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xE6, 0x0A, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08, + 0x12, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, + 0x94, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1F, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, +}; diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.txt b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.txt new file mode 100644 index 000000000..4ad3f4288 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.txt @@ -0,0 +1,117 @@ +// +// Generated by Microsoft (R) HLSL Shader Compiler 10.1 +// +// +// Buffer Definitions: +// +// cbuffer XeEdramLoadStoreConstants +// { +// +// uint xe_edram_rt_color_depth_offset;// Offset: 0 Size: 4 +// uint xe_edram_rt_color_depth_pitch;// Offset: 4 Size: 4 +// uint xe_edram_rt_stencil_offset; // Offset: 8 Size: 4 +// uint xe_edram_rt_stencil_pitch; // Offset: 12 Size: 4 +// uint xe_edram_base_samples_2x_depth_pitch;// Offset: 16 Size: 4 +// +// } +// +// +// Resource Bindings: +// +// Name Type Format Dim ID HLSL Bind Count +// ------------------------------ ---------- ------- ----------- ------- -------------- ------ +// xe_edram_load_store_source texture byte r/o T0 t0 1 +// xe_edram_load_store_dest UAV byte r/w U0 u0 1 +// XeEdramLoadStoreConstants cbuffer NA NA CB0 cb0 1 +// +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// no Input +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// no Output +cs_5_1 +dcl_globalFlags refactoringAllowed +dcl_constantbuffer CB0[0:0][2], immediateIndexed, space=0 +dcl_resource_raw T0[0:0], space=0 +dcl_uav_raw U0[0:0], 
space=0 +dcl_input vThreadGroupID.xy +dcl_input vThreadIDInGroup.xy +dcl_input vThreadID.xy +dcl_temps 7 +dcl_thread_group 20, 16, 1 +ishl r0.x, vThreadIDInGroup.x, l(2) +and r0.yz, CB0[0][1].xxxx, l(0, 0x00008000, 2047, 0) +if_nz r0.y + ult r0.y, vThreadIDInGroup.x, l(10) + uge r0.w, vThreadIDInGroup.x, l(10) + and r0.yw, r0.yyyw, l(0, 40, 0, -40) + iadd r0.y, r0.w, r0.y + iadd r0.x, r0.y, r0.x +endif +ushr r0.y, CB0[0][1].x, l(16) +imad r0.y, vThreadGroupID.y, r0.y, r0.z +iadd r0.y, r0.y, vThreadGroupID.x +imul null, r0.z, vThreadIDInGroup.y, l(320) +imad r0.y, r0.y, l(5120), r0.z +ishl r0.x, r0.x, l(2) +iadd r0.x, r0.x, r0.y +ubfe r0.y, l(1), l(13), CB0[0][1].x +ishl r0.y, r0.y, l(1) +ishl r0.x, r0.x, r0.y +ld_raw r1.xyzw, r0.x, T0[0].xyzw +ushr r2.xyzw, r1.xyzw, l(8, 8, 8, 8) +iadd r0.x, r0.x, l(0x00a00000) +ld_raw r0.xyzw, r0.x, T0[0].xyzw +ubfe r3.xyzw, l(20, 20, 20, 20), l(8, 8, 8, 8), r1.xyzw +ushr r4.xyzw, r2.xyzw, l(20, 20, 20, 20) +firstbit_hi r5.xyzw, r3.xyzw +iadd r5.xyzw, r5.xyzw, l(-11, -11, -11, -11) +movc r5.xyzw, r3.xyzw, r5.xyzw, l(21,21,21,21) +iadd r6.xyzw, -r5.xyzw, l(1, 1, 1, 1) +movc r6.xyzw, r4.xyzw, r4.xyzw, r6.xyzw +ishl r5.xyzw, r3.xyzw, r5.xyzw +and r5.xyzw, r5.xyzw, l(0x000fffff, 0x000fffff, 0x000fffff, 0x000fffff) +movc r3.xyzw, r4.xyzw, r3.xyzw, r5.xyzw +ishl r4.xyzw, r6.xyzw, l(23, 23, 23, 23) +iadd r4.xyzw, r4.xyzw, l(0x38000000, 0x38000000, 0x38000000, 0x38000000) +ishl r3.xyzw, r3.xyzw, l(3, 3, 3, 3) +iadd r3.xyzw, r4.xyzw, r3.xyzw +movc r3.xyzw, r2.xyzw, r3.xyzw, l(0,0,0,0) +iadd r4.xyzw, r0.xyzw, -r3.xyzw +uge r5.xyzw, l(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff), r0.xyzw +and r0.xyzw, r0.xyzw, r5.xyzw +umin r0.xyzw, r0.xyzw, l(0x3ffffff8, 0x3ffffff8, 0x3ffffff8, 0x3ffffff8) +bfi r5.xyzw, l(23, 23, 23, 23), l(0, 0, 0, 0), r0.xyzw, l(0x00800000, 0x00800000, 0x00800000, 0x00800000) +ushr r6.xyzw, r0.xyzw, l(23, 23, 23, 23) +iadd r6.xyzw, -r6.xyzw, l(113, 113, 113, 113) +umin r6.xyzw, r6.xyzw, l(24, 24, 24, 24) +ushr r5.xyzw, 
r5.xyzw, r6.xyzw +ult r6.xyzw, r0.xyzw, l(0x38800000, 0x38800000, 0x38800000, 0x38800000) +iadd r0.xyzw, r0.xyzw, l(0xc8000000, 0xc8000000, 0xc8000000, 0xc8000000) +movc r0.xyzw, r6.xyzw, r5.xyzw, r0.xyzw +iadd r5.xyzw, r0.xyzw, l(3, 3, 3, 3) +ubfe r0.xyzw, l(1, 1, 1, 1), l(3, 3, 3, 3), r0.xyzw +iadd r0.xyzw, r0.xyzw, r5.xyzw +ubfe r0.xyzw, l(24, 24, 24, 24), l(3, 3, 3, 3), r0.xyzw +ieq r0.xyzw, r2.xyzw, r0.xyzw +and r0.xyzw, r0.xyzw, l(1, 1, 1, 1) +imad r0.xyzw, r4.xyzw, r0.xyzw, r3.xyzw +ishl r2.xy, vThreadID.xxxx, l(4, 2, 0, 0) +imad r2.xy, vThreadID.yyyy, CB0[0][0].ywyy, r2.xyxx +iadd r2.xy, r2.xyxx, CB0[0][0].xzxx +store_raw U0[0].xyzw, r2.x, r0.xyzw +and r0.x, r1.x, l(255) +bfi r0.yzw, l(0, 8, 8, 8), l(0, 8, 16, 24), r1.yyzw, l(0, 0, 0, 0) +iadd r0.xy, r0.zwzz, r0.xyxx +iadd r0.x, r0.y, r0.x +store_raw U0[0].x, r2.y, r0.x +ret +// Approximately 67 instruction slots used diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.cso index c389242a1..01be358b0 100644 Binary files a/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.cso and b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.cso differ diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.h b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.h index c7e561ddc..54c523930 100644 --- a/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.h +++ b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.h @@ -1,11 +1,11 @@ // generated from `xb buildhlsl` // source: edram_load_depth_float.cs.hlsl const uint8_t edram_load_depth_float_cs[] = { - 0x44, 0x58, 0x42, 0x43, 0xF3, 0xA3, 0xA4, 0x14, 0x0A, 0x50, 0x56, 0x49, - 0x5D, 0x09, 0x6C, 0xBF, 0x33, 0xC9, 0xC1, 0x9A, 0x01, 0x00, 0x00, 0x00, - 0xAC, 0x0D, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x44, 0x58, 0x42, 0x43, 0x17, 0xEE, 0x03, 0x06, 0xD3, 0x6E, 0x58, 0x75, + 0x66, 0x3B, 0x5B, 0x87, 0x2F, 0xF9, 
0x44, 0x9E, 0x01, 0x00, 0x00, 0x00, + 0x64, 0x0A, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x0C, 0x03, 0x00, 0x00, 0x1C, 0x03, 0x00, 0x00, 0x2C, 0x03, 0x00, 0x00, - 0x10, 0x0D, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xD0, 0x02, 0x00, 0x00, + 0xC8, 0x09, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xD0, 0x02, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0x53, 0x43, 0x00, 0x05, 0x00, 0x00, 0xA8, 0x02, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00, @@ -69,7 +69,7 @@ const uint8_t edram_load_depth_float_cs[] = { 0x49, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x4F, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x53, 0x48, 0x45, 0x58, - 0xDC, 0x09, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x77, 0x02, 0x00, 0x00, + 0x94, 0x06, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0xA5, 0x01, 0x00, 0x00, 0x6A, 0x08, 0x00, 0x01, 0x59, 0x00, 0x00, 0x07, 0x46, 0x8E, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x06, @@ -126,168 +126,98 @@ const uint8_t edram_load_depth_float_cs[] = { 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x7E, 0x20, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, - 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x09, + 0x32, 
0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x02, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x0A, + 0x32, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x56, 0x05, 0x02, 0x00, + 0xD6, 0x85, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x46, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x86, 0x80, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, - 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, - 0x00, 0x00, 0xA0, 0x00, 0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x46, 0x7E, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x02, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x14, 0x00, 
0x00, 0x00, 0x87, 0x00, 0x00, 0x05, 0xF2, 0x00, 0x10, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0xF5, 0xFF, 0xFF, 0xFF, 0xF5, 0xFF, 0xFF, 0xFF, 0xF5, 0xFF, 0xFF, 0xFF, - 0xF5, 0xFF, 0xFF, 0xFF, 0x37, 0x00, 0x00, 0x0C, 0xF2, 0x00, 0x10, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0x15, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x15, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B, 0xF2, 0x00, 0x10, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, 0x41, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x37, 0x00, 0x00, 0x09, 0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x29, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x05, + 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 
0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, - 0x02, 0x40, 0x00, 0x00, 0xFF, 0xFF, 0x0F, 0x00, 0xFF, 0xFF, 0x0F, 0x00, - 0xFF, 0xFF, 0x0F, 0x00, 0xFF, 0xFF, 0x0F, 0x00, 0x37, 0x00, 0x00, 0x09, - 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x0A, - 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x38, - 0x00, 0x00, 0x00, 0x38, 0x29, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, - 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x37, 0x00, 0x00, 0x0C, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x1E, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, - 0x41, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x0A, - 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, - 0xFF, 0xFF, 
0xFF, 0x7F, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x02, 0x40, 0x00, 0x00, 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, - 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, 0x8C, 0x00, 0x00, 0x14, - 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, - 0x00, 0x00, 0x80, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x02, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B, + 0x02, 0x40, 0x00, 0x00, 0xF5, 0xFF, 0xFF, 0xFF, 0xF5, 0xFF, 0xFF, 0xFF, + 0xF5, 0xFF, 0xFF, 0xFF, 0xF5, 0xFF, 0xFF, 0xFF, 0x37, 0x00, 0x00, 0x0C, + 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B, 0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, - 0x41, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, - 0x71, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x06, 0x00, 
0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x07, - 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x4F, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, - 0x00, 0x00, 0x80, 0x38, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, - 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, 0x37, 0x00, 0x00, 0x09, - 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A, - 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x18, 0x00, 
0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, - 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x23, 0x00, 0x00, 0x09, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x09, 0xF2, 0x00, 0x10, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x29, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x02, 0x00, 0x02, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x23, 0x00, 0x00, 0x0A, 0x32, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x56, 0x05, 0x02, 0x00, 0xD6, 0x85, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x86, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 
0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0xFF, 0xFF, 0x0F, 0x00, + 0xFF, 0xFF, 0x0F, 0x00, 0xFF, 0xFF, 0x0F, 0x00, 0xFF, 0xFF, 0x0F, 0x00, + 0x37, 0x00, 0x00, 0x09, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x38, + 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x38, 0x29, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x0C, 0xF2, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08, 0xF2, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x0A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x8C, 0x00, 0x00, 0x14, 0xE2, 0x00, 0x10, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x56, 0x0E, 0x10, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xE6, 0x0A, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, + 0xE6, 0x0A, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08, 0x12, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, - 0x94, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x1F, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.txt b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.txt index 4ad3f4288..bdc770e2c 100644 --- a/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.txt +++ b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.txt @@ -66,11 +66,12 @@ iadd r0.x, r0.x, r0.y ubfe r0.y, l(1), l(13), CB0[0][1].x ishl r0.y, r0.y, l(1) ishl r0.x, r0.x, r0.y -ld_raw r1.xyzw, r0.x, T0[0].xyzw -ushr r2.xyzw, r1.xyzw, l(8, 8, 8, 8) -iadd r0.x, r0.x, l(0x00a00000) ld_raw r0.xyzw, r0.x, T0[0].xyzw -ubfe r3.xyzw, l(20, 20, 20, 20), l(8, 8, 8, 8), r1.xyzw +ishl r1.xy, vThreadID.xxxx, l(4, 2, 0, 0) +imad r1.xy, vThreadID.yyyy, CB0[0][0].ywyy, r1.xyxx +iadd r1.xy, r1.xyxx, CB0[0][0].xzxx +ushr r2.xyzw, r0.xyzw, l(8, 8, 8, 8) +ubfe r3.xyzw, l(20, 20, 20, 20), l(8, 8, 8, 8), r0.xyzw ushr r4.xyzw, r2.xyzw, l(20, 20, 20, 20) firstbit_hi r5.xyzw, r3.xyzw iadd r5.xyzw, r5.xyzw, l(-11, -11, -11, -11) @@ -84,34 +85,12 @@ ishl r4.xyzw, r6.xyzw, l(23, 23, 23, 23) iadd r4.xyzw, r4.xyzw, l(0x38000000, 0x38000000, 0x38000000, 0x38000000) ishl r3.xyzw, r3.xyzw, l(3, 3, 3, 3) iadd r3.xyzw, r4.xyzw, r3.xyzw -movc r3.xyzw, r2.xyzw, r3.xyzw, l(0,0,0,0) -iadd r4.xyzw, r0.xyzw, 
-r3.xyzw -uge r5.xyzw, l(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff), r0.xyzw -and r0.xyzw, r0.xyzw, r5.xyzw -umin r0.xyzw, r0.xyzw, l(0x3ffffff8, 0x3ffffff8, 0x3ffffff8, 0x3ffffff8) -bfi r5.xyzw, l(23, 23, 23, 23), l(0, 0, 0, 0), r0.xyzw, l(0x00800000, 0x00800000, 0x00800000, 0x00800000) -ushr r6.xyzw, r0.xyzw, l(23, 23, 23, 23) -iadd r6.xyzw, -r6.xyzw, l(113, 113, 113, 113) -umin r6.xyzw, r6.xyzw, l(24, 24, 24, 24) -ushr r5.xyzw, r5.xyzw, r6.xyzw -ult r6.xyzw, r0.xyzw, l(0x38800000, 0x38800000, 0x38800000, 0x38800000) -iadd r0.xyzw, r0.xyzw, l(0xc8000000, 0xc8000000, 0xc8000000, 0xc8000000) -movc r0.xyzw, r6.xyzw, r5.xyzw, r0.xyzw -iadd r5.xyzw, r0.xyzw, l(3, 3, 3, 3) -ubfe r0.xyzw, l(1, 1, 1, 1), l(3, 3, 3, 3), r0.xyzw -iadd r0.xyzw, r0.xyzw, r5.xyzw -ubfe r0.xyzw, l(24, 24, 24, 24), l(3, 3, 3, 3), r0.xyzw -ieq r0.xyzw, r2.xyzw, r0.xyzw -and r0.xyzw, r0.xyzw, l(1, 1, 1, 1) -imad r0.xyzw, r4.xyzw, r0.xyzw, r3.xyzw -ishl r2.xy, vThreadID.xxxx, l(4, 2, 0, 0) -imad r2.xy, vThreadID.yyyy, CB0[0][0].ywyy, r2.xyxx -iadd r2.xy, r2.xyxx, CB0[0][0].xzxx -store_raw U0[0].xyzw, r2.x, r0.xyzw -and r0.x, r1.x, l(255) -bfi r0.yzw, l(0, 8, 8, 8), l(0, 8, 16, 24), r1.yyzw, l(0, 0, 0, 0) -iadd r0.xy, r0.zwzz, r0.xyxx +movc r2.xyzw, r2.xyzw, r3.xyzw, l(0,0,0,0) +store_raw U0[0].xyzw, r1.x, r2.xyzw +and r2.x, r0.x, l(255) +bfi r2.yzw, l(0, 8, 8, 8), l(0, 8, 16, 24), r0.yyzw, l(0, 0, 0, 0) +iadd r0.xy, r2.zwzz, r2.xyxx iadd r0.x, r0.y, r0.x -store_raw U0[0].x, r2.y, r0.x +store_raw U0[0].x, r1.y, r0.x ret -// Approximately 67 instruction slots used +// Approximately 46 instruction slots used diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.cso new file mode 100644 index 000000000..1dd12cb19 Binary files /dev/null and b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.cso differ diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.h 
b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.h new file mode 100644 index 000000000..c5a2d2118 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.h @@ -0,0 +1,226 @@ +// generated from `xb buildhlsl` +// source: edram_store_depth_float24and32.cs.hlsl +const uint8_t edram_store_depth_float24and32_cs[] = { + 0x44, 0x58, 0x42, 0x43, 0xC6, 0x10, 0x80, 0x14, 0x97, 0x01, 0xE4, 0x46, + 0x76, 0xF1, 0x67, 0xD3, 0xDF, 0x50, 0x25, 0xF7, 0x01, 0x00, 0x00, 0x00, + 0x64, 0x0A, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x0C, 0x03, 0x00, 0x00, 0x1C, 0x03, 0x00, 0x00, 0x2C, 0x03, 0x00, 0x00, + 0xC8, 0x09, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xD0, 0x02, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0x53, 0x43, 0x00, 0x05, 0x00, 0x00, + 0xA8, 0x02, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xB4, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xCF, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x6C, 0x6F, 0x61, + 0x64, 0x5F, 0x73, 0x74, 0x6F, 0x72, 0x65, 0x5F, 0x73, 0x6F, 0x75, 0x72, + 
0x63, 0x65, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, + 0x6C, 0x6F, 0x61, 0x64, 0x5F, 0x73, 0x74, 0x6F, 0x72, 0x65, 0x5F, 0x64, + 0x65, 0x73, 0x74, 0x00, 0x58, 0x65, 0x45, 0x64, 0x72, 0x61, 0x6D, 0x4C, + 0x6F, 0x61, 0x64, 0x53, 0x74, 0x6F, 0x72, 0x65, 0x43, 0x6F, 0x6E, 0x73, + 0x74, 0x61, 0x6E, 0x74, 0x73, 0x00, 0xAB, 0xAB, 0xE8, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x1C, 0x01, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, + 0x30, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0x4E, 0x02, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x69, 0x02, 0x00, 0x00, + 0x0C, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, + 0x83, 0x02, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, + 0x5F, 0x72, 0x74, 0x5F, 0x63, 0x6F, 0x6C, 0x6F, 0x72, 0x5F, 0x64, 0x65, + 0x70, 0x74, 0x68, 0x5F, 0x6F, 0x66, 0x66, 0x73, 0x65, 0x74, 0x00, 0x64, + 0x77, 0x6F, 0x72, 0x64, 0x00, 0xAB, 0xAB, 0xAB, 0x00, 0x00, 0x13, 0x00, + 
0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x02, 0x00, 0x00, 0x78, 0x65, 0x5F, 0x65, + 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x72, 0x74, 0x5F, 0x63, 0x6F, 0x6C, 0x6F, + 0x72, 0x5F, 0x64, 0x65, 0x70, 0x74, 0x68, 0x5F, 0x70, 0x69, 0x74, 0x63, + 0x68, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x72, + 0x74, 0x5F, 0x73, 0x74, 0x65, 0x6E, 0x63, 0x69, 0x6C, 0x5F, 0x6F, 0x66, + 0x66, 0x73, 0x65, 0x74, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, + 0x6D, 0x5F, 0x72, 0x74, 0x5F, 0x73, 0x74, 0x65, 0x6E, 0x63, 0x69, 0x6C, + 0x5F, 0x70, 0x69, 0x74, 0x63, 0x68, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, + 0x72, 0x61, 0x6D, 0x5F, 0x62, 0x61, 0x73, 0x65, 0x5F, 0x73, 0x61, 0x6D, + 0x70, 0x6C, 0x65, 0x73, 0x5F, 0x32, 0x78, 0x5F, 0x64, 0x65, 0x70, 0x74, + 0x68, 0x5F, 0x70, 0x69, 0x74, 0x63, 0x68, 0x00, 0x4D, 0x69, 0x63, 0x72, + 0x6F, 0x73, 0x6F, 0x66, 0x74, 0x20, 0x28, 0x52, 0x29, 0x20, 0x48, 0x4C, + 0x53, 0x4C, 0x20, 0x53, 0x68, 0x61, 0x64, 0x65, 0x72, 0x20, 0x43, 0x6F, + 0x6D, 0x70, 0x69, 0x6C, 0x65, 0x72, 0x20, 0x31, 0x30, 0x2E, 0x31, 0x00, + 0x49, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x4F, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x53, 0x48, 0x45, 0x58, + 0x94, 0x06, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0xA5, 0x01, 0x00, 0x00, + 0x6A, 0x08, 0x00, 0x01, 0x59, 0x00, 0x00, 0x07, 0x46, 0x8E, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x06, + 0x46, 0x7E, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x00, 0x00, 0x06, + 0x46, 0xEE, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x02, + 
0x32, 0x10, 0x02, 0x00, 0x5F, 0x00, 0x00, 0x02, 0x32, 0x20, 0x02, 0x00, + 0x5F, 0x00, 0x00, 0x02, 0x32, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x02, + 0x05, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x09, + 0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x02, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x0A, + 0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x56, 0x05, 0x02, 0x00, + 0xD6, 0x85, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x46, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0x80, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x7E, 0x20, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, + 0xFF, 0xFF, 0xFF, 0x7F, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, + 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, 0x8C, 0x00, 0x00, 0x14, + 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 
0x17, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, + 0x00, 0x00, 0x80, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B, + 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, + 0x41, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x07, + 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x4F, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, + 0x00, 0x00, 0x80, 0x38, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, + 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, 0x37, 0x00, 0x00, 0x09, + 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A, + 
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xA5, 0x00, 0x00, 0x08, + 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x06, 0x70, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xE2, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x8C, 0x00, 0x00, 0x11, + 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x06, 0x12, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, + 
0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0C, 0x62, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x06, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0xFF, 0x07, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x04, 0x03, 0x1A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x06, 0x22, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x06, 0x82, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xA2, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x56, 0x0D, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xFF, 0xFF, 0xFF, 0x1E, 0x00, 0x00, 0x07, + 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x01, 0x55, 0x00, 0x00, 0x09, + 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x80, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x08, + 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x10, 0x02, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x06, 0x22, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0A, 0x10, 0x02, 0x00, 0x26, 0x00, 0x00, 0x07, 0x00, 0xD0, 0x00, 0x00, + 0x42, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x20, 0x02, 0x00, + 
0x01, 0x40, 0x00, 0x00, 0x40, 0x01, 0x00, 0x00, 0x23, 0x00, 0x00, 0x09, + 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, + 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0B, 0x22, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, 0x0A, 0x80, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x07, 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08, + 0xF2, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0xA0, 0x00, 0xA6, 0x00, 0x00, 0x08, + 0xF2, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, + 0x94, 0x00, 0x00, 0x00, 0x2D, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 
0x13, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, +}; diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.txt b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.txt new file mode 100644 index 000000000..1a0cc82cc --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.txt @@ -0,0 +1,95 @@ +// +// Generated by Microsoft (R) HLSL Shader Compiler 10.1 +// +// +// Buffer Definitions: +// +// cbuffer XeEdramLoadStoreConstants +// { +// +// uint xe_edram_rt_color_depth_offset;// Offset: 0 Size: 4 +// uint xe_edram_rt_color_depth_pitch;// Offset: 4 Size: 4 +// uint xe_edram_rt_stencil_offset; // Offset: 8 Size: 4 +// uint xe_edram_rt_stencil_pitch; // Offset: 12 Size: 4 +// uint xe_edram_base_samples_2x_depth_pitch;// Offset: 16 Size: 4 +// +// } +// +// +// Resource Bindings: +// +// Name Type Format Dim ID HLSL Bind Count +// ------------------------------ ---------- ------- ----------- ------- -------------- ------ +// xe_edram_load_store_source texture byte r/o T0 t0 1 +// xe_edram_load_store_dest UAV byte r/w U0 u0 1 +// XeEdramLoadStoreConstants cbuffer NA NA CB0 cb0 1 +// +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// 
-------------------- ----- ------ -------- -------- ------- ------ +// no Input +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// no Output +cs_5_1 +dcl_globalFlags refactoringAllowed +dcl_constantbuffer CB0[0:0][2], immediateIndexed, space=0 +dcl_resource_raw T0[0:0], space=0 +dcl_uav_raw U0[0:0], space=0 +dcl_input vThreadGroupID.xy +dcl_input vThreadIDInGroup.xy +dcl_input vThreadID.xy +dcl_temps 5 +dcl_thread_group 20, 16, 1 +ishl r0.xy, vThreadID.xxxx, l(4, 2, 0, 0) +imad r0.xy, vThreadID.yyyy, CB0[0][0].ywyy, r0.xyxx +iadd r0.xy, r0.xyxx, CB0[0][0].xzxx +ld_raw r1.xyzw, r0.x, T0[0].xyzw +uge r2.xyzw, l(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff), r1.xyzw +and r2.xyzw, r1.xyzw, r2.xyzw +umin r2.xyzw, r2.xyzw, l(0x3ffffff8, 0x3ffffff8, 0x3ffffff8, 0x3ffffff8) +bfi r3.xyzw, l(23, 23, 23, 23), l(0, 0, 0, 0), r2.xyzw, l(0x00800000, 0x00800000, 0x00800000, 0x00800000) +ushr r4.xyzw, r2.xyzw, l(23, 23, 23, 23) +iadd r4.xyzw, -r4.xyzw, l(113, 113, 113, 113) +umin r4.xyzw, r4.xyzw, l(24, 24, 24, 24) +ushr r3.xyzw, r3.xyzw, r4.xyzw +ult r4.xyzw, r2.xyzw, l(0x38800000, 0x38800000, 0x38800000, 0x38800000) +iadd r2.xyzw, r2.xyzw, l(0xc8000000, 0xc8000000, 0xc8000000, 0xc8000000) +movc r2.xyzw, r4.xyzw, r3.xyzw, r2.xyzw +iadd r3.xyzw, r2.xyzw, l(3, 3, 3, 3) +ubfe r2.xyzw, l(1, 1, 1, 1), l(3, 3, 3, 3), r2.xyzw +iadd r2.xyzw, r2.xyzw, r3.xyzw +ushr r2.xyzw, r2.xyzw, l(3, 3, 3, 3) +ld_raw r0.x, r0.y, T0[0].xxxx +ushr r0.yzw, r0.xxxx, l(0, 8, 16, 24) +bfi r0.xyzw, l(24, 24, 24, 24), l(8, 8, 8, 8), r2.xyzw, r0.xyzw +ishl r2.x, vThreadIDInGroup.x, l(2) +and r2.yz, CB0[0][1].xxxx, l(0, 0x00008000, 2047, 0) +if_nz r2.y + ult r2.y, vThreadIDInGroup.x, l(10) + uge r2.w, vThreadIDInGroup.x, l(10) + and r2.yw, r2.yyyw, l(0, 40, 0, -40) + iadd r2.y, r2.w, r2.y + iadd r2.x, r2.y, r2.x +endif +ushr r2.y, CB0[0][1].x, l(16) +imad r2.y, vThreadGroupID.y, r2.y, r2.z +iadd r2.y, r2.y, 
vThreadGroupID.x +imul null, r2.z, vThreadIDInGroup.y, l(320) +imad r2.y, r2.y, l(5120), r2.z +ishl r2.x, r2.x, l(2) +iadd r2.x, r2.x, r2.y +ubfe r2.y, l(1), l(13), CB0[0][1].x +ishl r2.y, r2.y, l(1) +ishl r2.x, r2.x, r2.y +store_raw U0[0].xyzw, r2.x, r0.xyzw +iadd r0.x, r2.x, l(0x00a00000) +store_raw U0[0].xyzw, r0.x, r1.xyzw +ret +// Approximately 45 instruction slots used diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.cso index 1dd12cb19..b636e8d75 100644 Binary files a/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.cso and b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.cso differ diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.h b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.h index d9b5e944f..843cf9070 100644 --- a/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.h +++ b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.h @@ -1,11 +1,11 @@ // generated from `xb buildhlsl` // source: edram_store_depth_float.cs.hlsl const uint8_t edram_store_depth_float_cs[] = { - 0x44, 0x58, 0x42, 0x43, 0xC6, 0x10, 0x80, 0x14, 0x97, 0x01, 0xE4, 0x46, - 0x76, 0xF1, 0x67, 0xD3, 0xDF, 0x50, 0x25, 0xF7, 0x01, 0x00, 0x00, 0x00, - 0x64, 0x0A, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x44, 0x58, 0x42, 0x43, 0xF1, 0x72, 0x64, 0x54, 0x9D, 0xF6, 0x79, 0x48, + 0x2F, 0x8C, 0xD1, 0x59, 0x56, 0x1C, 0x90, 0x9A, 0x01, 0x00, 0x00, 0x00, + 0x28, 0x0A, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x0C, 0x03, 0x00, 0x00, 0x1C, 0x03, 0x00, 0x00, 0x2C, 0x03, 0x00, 0x00, - 0xC8, 0x09, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xD0, 0x02, 0x00, 0x00, + 0x8C, 0x09, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xD0, 0x02, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0x53, 0x43, 0x00, 0x05, 0x00, 0x00, 0xA8, 0x02, 0x00, 0x00, 0x13, 
0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00, @@ -69,7 +69,7 @@ const uint8_t edram_store_depth_float_cs[] = { 0x49, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x4F, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x53, 0x48, 0x45, 0x58, - 0x94, 0x06, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0xA5, 0x01, 0x00, 0x00, + 0x58, 0x06, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x96, 0x01, 0x00, 0x00, 0x6A, 0x08, 0x00, 0x01, 0x59, 0x00, 0x00, 0x07, 0x46, 0x8E, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x06, @@ -79,7 +79,7 @@ const uint8_t edram_store_depth_float_cs[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x02, 0x32, 0x10, 0x02, 0x00, 0x5F, 0x00, 0x00, 0x02, 0x32, 0x20, 0x02, 0x00, 0x5F, 0x00, 0x00, 0x02, 0x32, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x02, - 0x05, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x02, 0x00, 0x02, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, @@ -96,53 +96,53 @@ const uint8_t edram_store_depth_float_cs[] = { 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 
0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, 0x8C, 0x00, 0x00, 0x14, - 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B, - 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, - 0x41, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, + 0x41, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x07, - 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 
0x10, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x4F, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x4F, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, 0x37, 0x00, 0x00, 0x09, + 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A, - 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 
0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xA5, 0x00, 0x00, 0x08, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, @@ -155,64 +155,59 @@ const uint8_t edram_store_depth_float_cs[] = { 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x06, 0x12, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0C, 0x62, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x06, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x06, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0xFF, 0x07, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x1F, 0x00, 0x04, 0x03, 0x1A, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x06, 0x22, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x06, 0x22, 0x00, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x06, 0x82, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xA2, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x56, 0x0D, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x56, 0x0D, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xFF, 0xFF, 0xFF, 0x1E, 0x00, 0x00, 0x07, - 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x01, 0x55, 0x00, 0x00, 0x09, - 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x80, 0x30, 0x00, + 0x22, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x01, 0x55, 0x00, 0x00, 0x09, + 0x22, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0A, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x08, - 0x22, 0x00, 0x10, 
0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x10, 0x02, 0x00, - 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x06, 0x22, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1A, 0x10, 0x02, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x06, 0x22, 0x00, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0A, 0x10, 0x02, 0x00, 0x26, 0x00, 0x00, 0x07, 0x00, 0xD0, 0x00, 0x00, - 0x42, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x20, 0x02, 0x00, + 0x42, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, 0x40, 0x01, 0x00, 0x00, 0x23, 0x00, 0x00, 0x09, - 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, - 0x2A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, - 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0B, 0x22, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, + 0x12, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 
0x01, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0B, 0x22, 0x00, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, 0x0A, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x29, 0x00, 0x00, 0x07, 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x07, 0x22, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08, + 0x01, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08, 0xF2, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0xA0, 0x00, 0xA6, 0x00, 0x00, 0x08, - 0xF2, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, - 0x94, 0x00, 0x00, 0x00, 0x2D, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, + 0x94, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x0E, 0x00, 
0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -222,5 +217,5 @@ const uint8_t edram_store_depth_float_cs[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, }; diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.txt b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.txt index 1a0cc82cc..ef8dcdfe6 100644 --- a/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.txt +++ b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.txt @@ -45,51 +45,49 @@ dcl_uav_raw U0[0:0], space=0 dcl_input vThreadGroupID.xy dcl_input vThreadIDInGroup.xy dcl_input vThreadID.xy -dcl_temps 5 +dcl_temps 4 dcl_thread_group 20, 16, 1 ishl r0.xy, vThreadID.xxxx, l(4, 2, 0, 0) imad r0.xy, vThreadID.yyyy, CB0[0][0].ywyy, r0.xyxx iadd r0.xy, r0.xyxx, CB0[0][0].xzxx ld_raw r1.xyzw, r0.x, T0[0].xyzw uge r2.xyzw, l(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff), r1.xyzw -and r2.xyzw, r1.xyzw, r2.xyzw -umin r2.xyzw, r2.xyzw, l(0x3ffffff8, 0x3ffffff8, 0x3ffffff8, 0x3ffffff8) -bfi r3.xyzw, l(23, 23, 23, 23), l(0, 0, 0, 0), r2.xyzw, l(0x00800000, 0x00800000, 0x00800000, 0x00800000) -ushr r4.xyzw, r2.xyzw, l(23, 23, 23, 23) -iadd r4.xyzw, -r4.xyzw, l(113, 113, 113, 113) -umin r4.xyzw, r4.xyzw, l(24, 24, 24, 24) -ushr r3.xyzw, r3.xyzw, r4.xyzw -ult r4.xyzw, r2.xyzw, l(0x38800000, 0x38800000, 0x38800000, 0x38800000) -iadd r2.xyzw, r2.xyzw, l(0xc8000000, 0xc8000000, 0xc8000000, 0xc8000000) -movc r2.xyzw, r4.xyzw, 
r3.xyzw, r2.xyzw -iadd r3.xyzw, r2.xyzw, l(3, 3, 3, 3) -ubfe r2.xyzw, l(1, 1, 1, 1), l(3, 3, 3, 3), r2.xyzw -iadd r2.xyzw, r2.xyzw, r3.xyzw -ushr r2.xyzw, r2.xyzw, l(3, 3, 3, 3) +and r1.xyzw, r1.xyzw, r2.xyzw +umin r1.xyzw, r1.xyzw, l(0x3ffffff8, 0x3ffffff8, 0x3ffffff8, 0x3ffffff8) +bfi r2.xyzw, l(23, 23, 23, 23), l(0, 0, 0, 0), r1.xyzw, l(0x00800000, 0x00800000, 0x00800000, 0x00800000) +ushr r3.xyzw, r1.xyzw, l(23, 23, 23, 23) +iadd r3.xyzw, -r3.xyzw, l(113, 113, 113, 113) +umin r3.xyzw, r3.xyzw, l(24, 24, 24, 24) +ushr r2.xyzw, r2.xyzw, r3.xyzw +ult r3.xyzw, r1.xyzw, l(0x38800000, 0x38800000, 0x38800000, 0x38800000) +iadd r1.xyzw, r1.xyzw, l(0xc8000000, 0xc8000000, 0xc8000000, 0xc8000000) +movc r1.xyzw, r3.xyzw, r2.xyzw, r1.xyzw +iadd r2.xyzw, r1.xyzw, l(3, 3, 3, 3) +ubfe r1.xyzw, l(1, 1, 1, 1), l(3, 3, 3, 3), r1.xyzw +iadd r1.xyzw, r1.xyzw, r2.xyzw +ushr r1.xyzw, r1.xyzw, l(3, 3, 3, 3) ld_raw r0.x, r0.y, T0[0].xxxx ushr r0.yzw, r0.xxxx, l(0, 8, 16, 24) -bfi r0.xyzw, l(24, 24, 24, 24), l(8, 8, 8, 8), r2.xyzw, r0.xyzw -ishl r2.x, vThreadIDInGroup.x, l(2) -and r2.yz, CB0[0][1].xxxx, l(0, 0x00008000, 2047, 0) -if_nz r2.y - ult r2.y, vThreadIDInGroup.x, l(10) - uge r2.w, vThreadIDInGroup.x, l(10) - and r2.yw, r2.yyyw, l(0, 40, 0, -40) - iadd r2.y, r2.w, r2.y - iadd r2.x, r2.y, r2.x +bfi r0.xyzw, l(24, 24, 24, 24), l(8, 8, 8, 8), r1.xyzw, r0.xyzw +ishl r1.x, vThreadIDInGroup.x, l(2) +and r1.yz, CB0[0][1].xxxx, l(0, 0x00008000, 2047, 0) +if_nz r1.y + ult r1.y, vThreadIDInGroup.x, l(10) + uge r1.w, vThreadIDInGroup.x, l(10) + and r1.yw, r1.yyyw, l(0, 40, 0, -40) + iadd r1.y, r1.w, r1.y + iadd r1.x, r1.y, r1.x endif -ushr r2.y, CB0[0][1].x, l(16) -imad r2.y, vThreadGroupID.y, r2.y, r2.z -iadd r2.y, r2.y, vThreadGroupID.x -imul null, r2.z, vThreadIDInGroup.y, l(320) -imad r2.y, r2.y, l(5120), r2.z -ishl r2.x, r2.x, l(2) -iadd r2.x, r2.x, r2.y -ubfe r2.y, l(1), l(13), CB0[0][1].x -ishl r2.y, r2.y, l(1) -ishl r2.x, r2.x, r2.y -store_raw U0[0].xyzw, r2.x, r0.xyzw -iadd r0.x, 
r2.x, l(0x00a00000) -store_raw U0[0].xyzw, r0.x, r1.xyzw +ushr r1.y, CB0[0][1].x, l(16) +imad r1.y, vThreadGroupID.y, r1.y, r1.z +iadd r1.y, r1.y, vThreadGroupID.x +imul null, r1.z, vThreadIDInGroup.y, l(320) +imad r1.y, r1.y, l(5120), r1.z +ishl r1.x, r1.x, l(2) +iadd r1.x, r1.x, r1.y +ubfe r1.y, l(1), l(13), CB0[0][1].x +ishl r1.y, r1.y, l(1) +ishl r1.x, r1.x, r1.y +store_raw U0[0].xyzw, r1.x, r0.xyzw ret -// Approximately 45 instruction slots used +// Approximately 43 instruction slots used diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.cso b/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.cso new file mode 100644 index 000000000..a55fc7376 Binary files /dev/null and b/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.cso differ diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.h b/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.h new file mode 100644 index 000000000..b155ee5d0 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.h @@ -0,0 +1,156 @@ +// generated from `xb buildhlsl` +// source: float24_round.ps.hlsl +const uint8_t float24_round_ps[] = { + 0x44, 0x58, 0x42, 0x43, 0xDF, 0x71, 0xF3, 0x0A, 0x4A, 0xDB, 0xC3, 0x80, + 0x1E, 0xE4, 0x39, 0x21, 0x59, 0x07, 0x78, 0x97, 0x01, 0x00, 0x00, 0x00, + 0x18, 0x07, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, + 0xA0, 0x00, 0x00, 0x00, 0x90, 0x02, 0x00, 0x00, 0xC4, 0x02, 0x00, 0x00, + 0x7C, 0x06, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0x64, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0xFF, 0xFF, 0x00, 0x05, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x4D, 0x69, 0x63, 0x72, 0x6F, 0x73, 0x6F, 0x66, 0x74, 0x20, 0x28, 0x52, + 0x29, 0x20, 0x48, 0x4C, 0x53, 0x4C, 0x20, 0x53, 0x68, 0x61, 
0x64, 0x65, + 0x72, 0x20, 0x43, 0x6F, 0x6D, 0x70, 0x69, 0x6C, 0x65, 0x72, 0x20, 0x31, + 0x30, 0x2E, 0x31, 0x00, 0x49, 0x53, 0x47, 0x4E, 0xE8, 0x01, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0B, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 
0x00, 0x00, + 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0C, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0E, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0F, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xD9, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x0F, 0x04, 0x00, 0x00, 0x54, 0x45, 0x58, 0x43, + 0x4F, 0x4F, 0x52, 0x44, 0x00, 0x53, 0x56, 0x5F, 0x50, 0x6F, 0x73, 0x69, + 0x74, 0x69, 0x6F, 0x6E, 0x00, 0xAB, 0xAB, 0xAB, 0x4F, 0x53, 0x47, 0x4E, + 0x2C, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x0E, 0x00, 0x00, + 0x53, 0x56, 0x5F, 0x44, 0x65, 0x70, 0x74, 0x68, 0x00, 0xAB, 0xAB, 0xAB, + 0x53, 0x48, 0x45, 0x58, 0xB0, 0x03, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, + 0xEC, 0x00, 0x00, 0x00, 0x6A, 0x08, 0x00, 0x01, 0x64, 0x38, 0x00, 0x04, + 0x42, 0x10, 0x10, 0x00, 0x12, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x65, 0x00, 0x00, 0x02, 0x01, 0xC0, 0x00, 0x00, 0x68, 0x00, 0x00, 0x02, + 0x02, 0x00, 0x00, 0x00, 0x36, 0x20, 0x08, 0x05, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2A, 0x10, 0x10, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x10, 0x07, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 
0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0x7F, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x08, 0x07, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x54, 0x00, 0x08, 0x07, + 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0xF8, 0xFF, 0xFF, 0x3F, + 0x8C, 0x00, 0x10, 0x0B, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x55, 0x00, 0x20, 0x07, + 0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x20, 0x08, 0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x80, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x54, 0x00, 0x20, 0x07, + 0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x10, 0x07, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x20, 0x07, 0x42, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x80, 0x38, 0x1E, 0x00, 0x08, 0x07, + 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC8, + 0x37, 0x00, 0x08, 0x09, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 
0x00, 0x00, + 0x1E, 0x00, 0x10, 0x07, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x08, 0x09, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x08, 0x07, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x38, 0x0F, + 0x72, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x40, 0x05, + 0x82, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x40, 0x07, 0x82, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0xF5, 0xFF, 0xFF, 0xFF, 0x37, 0x00, 0x40, 0x09, + 0x82, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x08, 0x08, + 0x12, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x80, + 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x37, 0x00, 0x08, 0x09, 0x12, 0x00, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x40, 0x07, 0x82, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 
0x00, 0x00, + 0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x40, 0x07, + 0x82, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0xFF, 0xFF, 0x0F, 0x00, + 0x37, 0x00, 0x10, 0x09, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x20, 0x07, 0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x20, 0x07, 0x42, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x29, 0x00, 0x10, 0x07, + 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x10, 0x07, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x37, 0x00, 0x08, 0x08, 0x01, 0xC0, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, 0x94, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.txt b/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.txt new file mode 100644 index 000000000..c9661e6ac --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.txt @@ -0,0 +1,74 @@ +// +// Generated by Microsoft (R) HLSL Shader Compiler 10.1 +// +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// TEXCOORD 0 xyzw 0 NONE float +// TEXCOORD 1 xyzw 1 NONE float +// TEXCOORD 2 xyzw 2 NONE float +// TEXCOORD 3 xyzw 3 NONE float +// TEXCOORD 4 xyzw 4 NONE float +// TEXCOORD 5 xyzw 5 NONE float +// TEXCOORD 6 xyzw 6 NONE float +// TEXCOORD 7 xyzw 7 NONE float +// TEXCOORD 8 xyzw 8 NONE float +// TEXCOORD 9 xyzw 9 NONE float +// TEXCOORD 10 xyzw 10 NONE float +// TEXCOORD 11 xyzw 11 NONE float +// TEXCOORD 12 xyzw 12 NONE float +// TEXCOORD 13 xyzw 13 NONE float +// TEXCOORD 14 xyzw 14 NONE float +// TEXCOORD 15 xyzw 15 NONE float +// TEXCOORD 16 xyz 16 NONE float +// TEXCOORD 17 xy 17 NONE float +// SV_Position 0 xyzw 18 POS float z +// +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// SV_Depth 0 N/A oDepth DEPTH float YES +// +// Pixel Shader runs at sample frequency +// +ps_5_1 +dcl_globalFlags refactoringAllowed +dcl_input_ps_siv linear noperspective sample v18.z, position +dcl_output oDepth +dcl_temps 2 +mov_sat [precise(x)] r0.x, v18.z +uge [precise(y)] r0.y, l(0x7fffffff), r0.x +and [precise(x)] r0.x, r0.x, r0.y +umin [precise(x)] r0.x, r0.x, 
l(0x3ffffff8) +bfi [precise(y)] r0.y, l(23), l(0), r0.x, l(0x00800000) +ushr [precise(z)] r0.z, r0.x, l(23) +iadd [precise(z)] r0.z, -r0.z, l(113) +umin [precise(z)] r0.z, r0.z, l(24) +ushr [precise(y)] r0.y, r0.y, r0.z +ult [precise(z)] r0.z, r0.x, l(0x38800000) +iadd [precise(x)] r0.x, r0.x, l(0xc8000000) +movc [precise(x)] r0.x, r0.z, r0.y, r0.x +iadd [precise(y)] r0.y, r0.x, l(3) +ubfe [precise(x)] r0.x, l(1), l(3), r0.x +iadd [precise(x)] r0.x, r0.x, r0.y +ubfe [precise(xyz)] r0.xyz, l(24, 20, 4, 0), l(3, 3, 23, 0), r0.xxxx +firstbit_hi [precise(w)] r0.w, r0.y +iadd [precise(w)] r0.w, r0.w, l(-11) +movc [precise(w)] r0.w, r0.y, r0.w, l(21) +iadd [precise(x)] r1.x, -r0.w, l(1) +movc [precise(x)] r1.x, r0.z, r0.z, r1.x +ishl [precise(w)] r0.w, r0.y, r0.w +and [precise(w)] r0.w, r0.w, l(0x000fffff) +movc [precise(y)] r0.y, r0.z, r0.y, r0.w +ishl [precise(z)] r0.z, r1.x, l(23) +iadd [precise(z)] r0.z, r0.z, l(0x38000000) +ishl [precise(y)] r0.y, r0.y, l(3) +iadd [precise(y)] r0.y, r0.z, r0.y +movc [precise(x)] oDepth, r0.x, r0.y, l(0) +ret +// Approximately 30 instruction slots used diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.cso b/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.cso new file mode 100644 index 000000000..a22366f58 Binary files /dev/null and b/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.cso differ diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.h b/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.h new file mode 100644 index 000000000..b8d1d7bb7 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.h @@ -0,0 +1,100 @@ +// generated from `xb buildhlsl` +// source: float24_truncate.ps.hlsl +const uint8_t float24_truncate_ps[] = { + 0x44, 0x58, 0x42, 0x43, 0xB8, 0x51, 0x55, 0x1D, 0xF4, 0xF1, 0xC9, 0xC0, + 0x0C, 0x22, 0xD3, 0x43, 0x94, 0xDF, 0x83, 0x9D, 0x01, 0x00, 0x00, 0x00, + 0x7C, 0x04, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, + 0xA0, 0x00, 0x00, 0x00, 
0x90, 0x02, 0x00, 0x00, 0xCC, 0x02, 0x00, 0x00, + 0xE0, 0x03, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0x64, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0xFF, 0xFF, 0x00, 0x05, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x4D, 0x69, 0x63, 0x72, 0x6F, 0x73, 0x6F, 0x66, 0x74, 0x20, 0x28, 0x52, + 0x29, 0x20, 0x48, 0x4C, 0x53, 0x4C, 0x20, 0x53, 0x68, 0x61, 0x64, 0x65, + 0x72, 0x20, 0x43, 0x6F, 0x6D, 0x70, 0x69, 0x6C, 0x65, 0x72, 0x20, 0x31, + 0x30, 0x2E, 0x31, 0x00, 0x49, 0x53, 0x47, 0x4E, 0xE8, 0x01, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 
0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0B, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0C, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0E, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0F, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xD9, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x0F, 0x04, 0x00, 0x00, 0x54, 0x45, 0x58, 0x43, + 0x4F, 0x4F, 0x52, 0x44, 0x00, 0x53, 0x56, 0x5F, 0x50, 0x6F, 0x73, 0x69, + 0x74, 0x69, 0x6F, 0x6E, 0x00, 0xAB, 0xAB, 0xAB, 0x4F, 0x53, 0x47, 0x4E, + 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 
0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x0E, 0x00, 0x00, + 0x53, 0x56, 0x5F, 0x44, 0x65, 0x70, 0x74, 0x68, 0x4C, 0x65, 0x73, 0x73, + 0x45, 0x71, 0x75, 0x61, 0x6C, 0x00, 0xAB, 0xAB, 0x53, 0x48, 0x45, 0x58, + 0x0C, 0x01, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x6A, 0x08, 0x00, 0x01, 0x64, 0x38, 0x00, 0x04, 0x42, 0x10, 0x10, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x02, + 0x01, 0x70, 0x02, 0x00, 0x68, 0x00, 0x00, 0x02, 0x01, 0x00, 0x00, 0x00, + 0x36, 0x20, 0x08, 0x05, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2A, 0x10, 0x10, 0x00, 0x12, 0x00, 0x00, 0x00, 0x50, 0x00, 0x10, 0x07, + 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x80, 0x2E, + 0x1F, 0x00, 0x04, 0x03, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x8A, 0x00, 0x10, 0x09, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x10, 0x08, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x80, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x24, 0x00, 0x10, 0x07, + 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x8C, 0x00, 0x08, 0x0A, 0x01, 0x70, 0x02, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x01, 0x36, 0x00, 0x08, 0x04, + 0x01, 0x70, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x00, 0x01, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, + 0x94, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.txt b/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.txt new file mode 100644 index 000000000..dd969f04d --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.txt @@ -0,0 +1,55 @@ +// +// Generated by Microsoft (R) HLSL Shader Compiler 10.1 +// +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// TEXCOORD 0 xyzw 0 NONE float +// TEXCOORD 1 xyzw 1 NONE float +// TEXCOORD 2 xyzw 2 NONE float +// TEXCOORD 3 xyzw 3 NONE float +// TEXCOORD 4 xyzw 4 NONE float +// TEXCOORD 5 xyzw 5 NONE float +// TEXCOORD 6 xyzw 6 NONE float +// TEXCOORD 7 xyzw 7 NONE float +// TEXCOORD 8 xyzw 8 NONE float +// TEXCOORD 9 xyzw 9 NONE float +// TEXCOORD 10 xyzw 10 NONE float +// TEXCOORD 11 xyzw 11 NONE float +// TEXCOORD 12 xyzw 12 NONE float +// TEXCOORD 13 xyzw 13 NONE float +// TEXCOORD 14 xyzw 14 NONE float +// TEXCOORD 15 xyzw 15 NONE float +// TEXCOORD 16 xyz 16 NONE float +// TEXCOORD 17 xy 17 NONE float +// SV_Position 0 xyzw 18 POS float z +// +// +// Output 
signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// SV_DepthLessEqual 0 N/A oDepthLE DEPTHLE float YES +// +// Pixel Shader runs at sample frequency +// +ps_5_1 +dcl_globalFlags refactoringAllowed +dcl_input_ps_siv linear noperspective sample v18.z, position +dcl_output oDepthLE +dcl_temps 1 +mov_sat [precise(x)] r0.x, v18.z +uge [precise(y)] r0.y, r0.x, l(0x2e800000) +if_nz r0.y + ubfe [precise(y)] r0.y, l(8), l(23), r0.x + iadd [precise(y)] r0.y, -r0.y, l(116) + imax [precise(y)] r0.y, r0.y, l(3) + bfi [precise(x)] oDepthLE, r0.y, l(0), l(0), r0.x +else + mov [precise(x)] oDepthLE, l(0) +endif +ret +// Approximately 11 instruction slots used diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl index bc02b4623..ef72713a3 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl @@ -7,22 +7,14 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_thread_id : SV_DispatchThreadID) { uint2 tile_sample_index = xe_group_thread_id.xy; tile_sample_index.x *= 4u; - uint edram_offset = XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index); - uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset); - uint4 depth24 = depth24_stencil >> 8u; - uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset); - // Depth. If the stored 32-bit depth converted to 24-bit is the same as the - // stored 24-bit depth, load the 32-bit value because it has more precision - // (and multipass rendering is possible), if it's not, convert the 24-bit - // depth because it was overwritten by aliasing. 
- uint4 depth24to32 = XeFloat20e4To32(depth24); - uint4 depth = depth24to32 + (depth32 - depth24to32) * - uint4(XeFloat32To20e4(depth32) == depth24); + uint4 samples = xe_edram_load_store_source.Load4( + XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index)); + // Depth (exact conversion ensured during drawing). uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; - xe_edram_load_store_dest.Store4(rt_offset, depth); + xe_edram_load_store_dest.Store4(rt_offset, XeFloat20e4To32(samples >> 8u)); // Stencil. - uint4 stencil = (depth24_stencil & 0xFFu) << uint4(0u, 8u, 16u, 24u); + uint4 stencil = (samples & 0xFFu) << uint4(0u, 8u, 16u, 24u); stencil.xy |= stencil.zw; stencil.x |= stencil.y; rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float24and32.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float24and32.cs.hlsl new file mode 100644 index 000000000..bc02b4623 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float24and32.cs.hlsl @@ -0,0 +1,31 @@ +#include "edram_load_store.hlsli" +#include "pixel_formats.hlsli" + +[numthreads(20, 16, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 4u; + uint edram_offset = XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index); + uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset); + uint4 depth24 = depth24_stencil >> 8u; + uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset); + // Depth. 
If the stored 32-bit depth converted to 24-bit is the same as the + // stored 24-bit depth, load the 32-bit value because it has more precision + // (and multipass rendering is possible), if it's not, convert the 24-bit + // depth because it was overwritten by aliasing. + uint4 depth24to32 = XeFloat20e4To32(depth24); + uint4 depth = depth24to32 + (depth32 - depth24to32) * + uint4(XeFloat32To20e4(depth32) == depth24); + uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; + xe_edram_load_store_dest.Store4(rt_offset, depth); + // Stencil. + uint4 stencil = (depth24_stencil & 0xFFu) << uint4(0u, 8u, 16u, 24u); + stencil.xy |= stencil.zw; + stencil.x |= stencil.y; + rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + + xe_edram_rt_stencil_offset; + xe_edram_load_store_dest.Store(rt_offset, stencil.x); +} diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl index ac7626721..d0123c69f 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl @@ -5,21 +5,18 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_group_thread_id : SV_GroupThreadID, uint3 xe_thread_id : SV_DispatchThreadID) { - // Depth. + // Depth (exact conversion ensured during drawing). uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; - uint4 depth32 = xe_edram_load_store_source.Load4(rt_offset); - uint4 depth24_stencil = XeFloat32To20e4(depth32) << 8u; + uint4 samples = + XeFloat32To20e4(xe_edram_load_store_source.Load4(rt_offset)) << 8u; // Stencil. 
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + xe_edram_rt_stencil_offset; - depth24_stencil |= (xe_edram_load_store_source.Load(rt_offset).xxxx >> - uint4(0u, 8u, 16u, 24u)) & 0xFFu; + samples |= (xe_edram_load_store_source.Load(rt_offset).xxxx >> + uint4(0u, 8u, 16u, 24u)) & 0xFFu; uint2 tile_sample_index = xe_group_thread_id.xy; tile_sample_index.x *= 4u; - uint edram_offset = XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index); - // Store 24-bit depth for aliasing and checking if 32-bit depth is up to date. - xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil); - // Store 32-bit depth so precision isn't lost when doing multipass rendering. - xe_edram_load_store_dest.Store4(10485760u + edram_offset, depth32); + xe_edram_load_store_dest.Store4( + XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index), samples); } diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float24and32.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float24and32.cs.hlsl new file mode 100644 index 000000000..ac7626721 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float24and32.cs.hlsl @@ -0,0 +1,25 @@ +#include "edram_load_store.hlsli" +#include "pixel_formats.hlsli" + +[numthreads(20, 16, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + // Depth. + uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; + uint4 depth32 = xe_edram_load_store_source.Load4(rt_offset); + uint4 depth24_stencil = XeFloat32To20e4(depth32) << 8u; + // Stencil. 
+ rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + + xe_edram_rt_stencil_offset; + depth24_stencil |= (xe_edram_load_store_source.Load(rt_offset).xxxx >> + uint4(0u, 8u, 16u, 24u)) & 0xFFu; + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 4u; + uint edram_offset = XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index); + // Store 24-bit depth for aliasing and checking if 32-bit depth is up to date. + xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil); + // Store 32-bit depth so precision isn't lost when doing multipass rendering. + xe_edram_load_store_dest.Store4(10485760u + edram_offset, depth32); +} diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl index d5e782bbb..093f533af 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl @@ -7,8 +7,7 @@ void main(uint3 xe_group_id : SV_GroupID, // Depth. uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; - uint4 samples = - (xe_edram_load_store_source.Load4(rt_offset) & 0xFFFFFFu) << 8u; + uint4 samples = xe_edram_load_store_source.Load4(rt_offset) << 8u; // Stencil. rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + xe_edram_rt_stencil_offset; diff --git a/src/xenia/gpu/d3d12/shaders/float24_round.ps.hlsl b/src/xenia/gpu/d3d12/shaders/float24_round.ps.hlsl new file mode 100644 index 000000000..346b21b4f --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/float24_round.ps.hlsl @@ -0,0 +1,13 @@ +#include "pixel_formats.hlsli" +#include "xenos_draw.hlsli" + +struct XePSInput { + XeVertexPrePS pre_ps; + sample float4 position : SV_Position; +}; + +precise float main(XePSInput xe_input) : SV_Depth { + // Input Z may be outside the viewport range (it's clamped after the shader). 
+ return asfloat( + XeFloat20e4To32(XeFloat32To20e4(asuint(saturate(xe_input.position.z))))); +} diff --git a/src/xenia/gpu/d3d12/shaders/float24_truncate.ps.hlsl b/src/xenia/gpu/d3d12/shaders/float24_truncate.ps.hlsl new file mode 100644 index 000000000..83a5d08d9 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/float24_truncate.ps.hlsl @@ -0,0 +1,38 @@ +#include "pixel_formats.hlsli" +#include "xenos_draw.hlsli" + +struct XePSInput { + XeVertexPrePS pre_ps; + sample float4 position : SV_Position; +}; + +precise float main(XePSInput xe_input) : SV_DepthLessEqual { + // Simplified conversion, always less than or equal to the original value - + // just drop the lower bits. + // The float32 exponent bias is 127. + // After saturating, the exponent range is -127...0. + // The smallest normalized 20e4 exponent is -14 - should drop 3 mantissa bits + // at -14 or above. + // The smallest denormalized 20e4 number is -34 - should drop 23 mantissa bits + // at -34. + // Anything smaller than 2^-34 becomes 0. + // Input Z may be outside the viewport range (it's clamped after the shader). + precise uint depth = asuint(saturate(xe_input.position.z)); + // Check if the number is representable as a float24 after truncation - the + // exponent is at least -34. + if (depth >= 0x2E800000u) { + // Extract the biased float32 exponent: + // 113+ at exponent -14+. + // 93 at exponent -34. + uint exponent = (depth >> 23u) & 0xFFu; + // Convert exponent to the shift amount. + // 116 - 113 = 3. + // 116 - 93 = 23. + uint shift = asuint(max(116 - asint(exponent), 3)); + depth = depth >> shift << shift; + } else { + // The number is not representable as float24 after truncation - zero. 
+ depth = 0u; + } + return asfloat(depth); +} diff --git a/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli b/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli index 1e7f5e319..e3654211d 100644 --- a/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli +++ b/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli @@ -495,6 +495,16 @@ void XeR11G11B10SNormToRGBA16(uint4 packed_texels, out uint4 out_01, // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2). // We also can't clamp the stored value to 1 as load->store->load must be exact. +uint XeFloat32To20e4(uint f32u32) { + // Keep only positive (high bit set means negative for both float and int) and + // saturate to the maximum representable value near 2 (also dropping NaNs). + f32u32 = min((f32u32 <= 0x7FFFFFFFu) ? f32u32 : 0u, 0x3FFFFFF8u); + uint denormalized = + ((f32u32 & 0x7FFFFFu) | 0x800000u) >> min(113u - (f32u32 >> 23u), 24u); + uint f24u32 = (f32u32 < 0x38800000u) ? denormalized : (f32u32 + 0xC8000000u); + return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu; +} + uint4 XeFloat32To20e4(uint4 f32u32) { // Keep only positive (high bit set means negative for both float and int) and // saturate to the maximum representable value near 2 (also dropping NaNs). @@ -505,6 +515,21 @@ uint4 XeFloat32To20e4(uint4 f32u32) { return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu; } +uint XeFloat20e4To32(uint f24u32) { + uint mantissa = f24u32 & 0xFFFFFu; + uint exponent = f24u32 >> 20u; + // Normalize the values for the denormalized components. + // Exponent = 1; + // do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x100000) == 0); + bool is_denormalized = exponent == 0u; + uint mantissa_lzcnt = 20u - firstbithigh(mantissa); + exponent = is_denormalized ? (1u - mantissa_lzcnt) : exponent; + mantissa = + is_denormalized ? ((mantissa << mantissa_lzcnt) & 0xFFFFFu) : mantissa; + // Combine into 32-bit float bits and clear zeros. + return (f24u32 != 0u) ? 
(((exponent + 112u) << 23u) | (mantissa << 3u)) : 0u; +} + uint4 XeFloat20e4To32(uint4 f24u32) { uint4 mantissa = f24u32 & 0xFFFFFu; uint4 exponent = f24u32 >> 20u; diff --git a/src/xenia/gpu/d3d12/shaders/primitive_point_list.gs.hlsl b/src/xenia/gpu/d3d12/shaders/primitive_point_list.gs.hlsl index 33d5a5c48..ab165504a 100644 --- a/src/xenia/gpu/d3d12/shaders/primitive_point_list.gs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/primitive_point_list.gs.hlsl @@ -10,9 +10,9 @@ void main(point XeVertexPreGS xe_in[1], } XeVertexPostGS xe_out; - xe_out.interpolators = xe_in[0].post_gs.interpolators; - xe_out.point_params.z = xe_in[0].post_gs.point_params.z; - xe_out.clip_space_zw = xe_in[0].post_gs.clip_space_zw; + xe_out.pre_ps.interpolators = xe_in[0].post_gs.pre_ps.interpolators; + xe_out.pre_ps.point_params.z = xe_in[0].post_gs.pre_ps.point_params.z; + xe_out.pre_ps.clip_space_zw = xe_in[0].post_gs.pre_ps.clip_space_zw; xe_out.position.zw = xe_in[0].post_gs.position.zw; xe_out.clip_distance_0123 = xe_in[0].post_gs.clip_distance_0123; xe_out.clip_distance_45 = xe_in[0].post_gs.clip_distance_45; @@ -20,26 +20,27 @@ void main(point XeVertexPreGS xe_in[1], // Shader header writes -1.0f to point_size by default, so any positive value // means that it was overwritten by the translated vertex shader. float2 point_size = - (xe_in[0].post_gs.point_params.z > 0.0f ? xe_in[0].post_gs.point_params.zz - : xe_point_size); + xe_in[0].post_gs.pre_ps.point_params.z > 0.0f + ? xe_in[0].post_gs.pre_ps.point_params.zz + : xe_point_size; point_size = clamp(point_size, xe_point_size_min_max.xx, xe_point_size_min_max.yy) * xe_point_screen_to_ndc * xe_in[0].post_gs.position.w; - xe_out.point_params.xy = float2(0.0, 0.0); + xe_out.pre_ps.point_params.xy = float2(0.0, 0.0); // TODO(Triang3l): On Vulkan, sign of Y needs to inverted because of // upper-left origin. // TODO(Triang3l): Investigate the true signs of point sprites. 
xe_out.position.xy = xe_in[0].post_gs.position.xy + float2(-point_size.x, point_size.y); xe_stream.Append(xe_out); - xe_out.point_params.xy = float2(0.0, 1.0); + xe_out.pre_ps.point_params.xy = float2(0.0, 1.0); xe_out.position.xy = xe_in[0].post_gs.position.xy - point_size; xe_stream.Append(xe_out); - xe_out.point_params.xy = float2(1.0, 0.0); + xe_out.pre_ps.point_params.xy = float2(1.0, 0.0); xe_out.position.xy = xe_in[0].post_gs.position.xy + point_size; xe_stream.Append(xe_out); - xe_out.point_params.xy = float2(1.0, 1.0); + xe_out.pre_ps.point_params.xy = float2(1.0, 1.0); xe_out.position.xy = xe_in[0].post_gs.position.xy + float2(point_size.x, -point_size.y); xe_stream.Append(xe_out); diff --git a/src/xenia/gpu/d3d12/shaders/primitive_rectangle_list.gs.hlsl b/src/xenia/gpu/d3d12/shaders/primitive_rectangle_list.gs.hlsl index 8411e54c2..45b7b05e5 100644 --- a/src/xenia/gpu/d3d12/shaders/primitive_rectangle_list.gs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/primitive_rectangle_list.gs.hlsl @@ -80,16 +80,19 @@ void main(triangle XeVertexPreGS xe_in[3], v3_signs = float3(1.0f, 1.0f, -1.0f); } [unroll] for (int i = 0; i < 16; ++i) { - xe_out.interpolators[i] = v3_signs.x * xe_in[0].post_gs.interpolators[i] + - v3_signs.y * xe_in[1].post_gs.interpolators[i] + - v3_signs.z * xe_in[2].post_gs.interpolators[i]; + xe_out.pre_ps.interpolators[i] = + v3_signs.x * xe_in[0].post_gs.pre_ps.interpolators[i] + + v3_signs.y * xe_in[1].post_gs.pre_ps.interpolators[i] + + v3_signs.z * xe_in[2].post_gs.pre_ps.interpolators[i]; } - xe_out.point_params = v3_signs.x * xe_in[0].post_gs.point_params + - v3_signs.y * xe_in[1].post_gs.point_params + - v3_signs.z * xe_in[2].post_gs.point_params; - xe_out.clip_space_zw = v3_signs.x * xe_in[0].post_gs.clip_space_zw + - v3_signs.y * xe_in[1].post_gs.clip_space_zw + - v3_signs.z * xe_in[2].post_gs.clip_space_zw; + xe_out.pre_ps.point_params = + v3_signs.x * xe_in[0].post_gs.pre_ps.point_params + + v3_signs.y * 
xe_in[1].post_gs.pre_ps.point_params + + v3_signs.z * xe_in[2].post_gs.pre_ps.point_params; + xe_out.pre_ps.clip_space_zw = + v3_signs.x * xe_in[0].post_gs.pre_ps.clip_space_zw + + v3_signs.y * xe_in[1].post_gs.pre_ps.clip_space_zw + + v3_signs.z * xe_in[2].post_gs.pre_ps.clip_space_zw; xe_out.position = v3_signs.x * xe_in[0].post_gs.position + v3_signs.y * xe_in[1].post_gs.position + v3_signs.z * xe_in[2].post_gs.position; diff --git a/src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli b/src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli index a7e841eeb..98c5f26ed 100644 --- a/src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli +++ b/src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli @@ -63,10 +63,14 @@ struct XeHSControlPointOutput { float index : XEVERTEXID; }; -struct XeVertexPostGS { +struct XeVertexPrePS { float4 interpolators[16] : TEXCOORD0; float3 point_params : TEXCOORD16; float2 clip_space_zw : TEXCOORD17; +}; + +struct XeVertexPostGS { + XeVertexPrePS pre_ps; // Precise needed to preserve NaN - guest primitives may be converted to more // than 1 triangle, so need to kill them entirely manually in GS if any vertex // is NaN. 
diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 98cc90615..a9d9fff92 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -114,6 +114,7 @@ int32_t FloatToD3D11Fixed16p8(float f32) { void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x, float pixel_size_y, bool origin_bottom_left, float x_max, float y_max, bool allow_reverse_z, + bool convert_z_to_float24, ViewportInfo& viewport_info_out) { assert_true(pixel_size_x >= 1.0f); assert_true(pixel_size_y >= 1.0f); @@ -270,6 +271,17 @@ void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x, ndc_scale_z = -ndc_scale_z; ndc_offset_z = 1.0f - ndc_offset_z; } + if (convert_z_to_float24 && regs.Get().z_enable && + regs.Get().depth_format == + xenos::DepthRenderTargetFormat::kD24FS8) { + // Need to adjust the bounds that the resulting depth values will be clamped + // to after the pixel shader. Preferring adding some error to interpolated Z + // instead if conversion can't be done exactly, without modifying clipping + // bounds by adjusting Z in vertex shaders, as that may cause polygons + // placed explicitly at Z = 0 or Z = W to be clipped. 
+ viewport_z_min = xenos::Float20e4To32(xenos::Float32To20e4(viewport_z_min)); + viewport_z_max = xenos::Float20e4To32(xenos::Float32To20e4(viewport_z_max)); + } viewport_info_out.left = viewport_left; viewport_info_out.top = viewport_top; diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index 1a9798aeb..c47640a20 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -53,6 +53,7 @@ struct ViewportInfo { void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x, float pixel_size_y, bool origin_bottom_left, float x_max, float y_max, bool allow_reverse_z, + bool convert_z_to_float24, ViewportInfo& viewport_info_out); struct Scissor { diff --git a/src/xenia/gpu/dxbc_shader.cc b/src/xenia/gpu/dxbc_shader.cc new file mode 100644 index 000000000..144308d57 --- /dev/null +++ b/src/xenia/gpu/dxbc_shader.cc @@ -0,0 +1,27 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2020 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/gpu/dxbc_shader.h" + +#include + +namespace xe { +namespace gpu { + +DxbcShader::DxbcShader(xenos::ShaderType shader_type, uint64_t data_hash, + const uint32_t* dword_ptr, uint32_t dword_count) + : Shader(shader_type, data_hash, dword_ptr, dword_count) {} + +Shader::Translation* DxbcShader::CreateTranslationInstance( + uint32_t modification) { + return new DxbcTranslation(*this, modification); +} + +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/dxbc_shader.h b/src/xenia/gpu/dxbc_shader.h new file mode 100644 index 000000000..49439a2a6 --- /dev/null +++ b/src/xenia/gpu/dxbc_shader.h @@ -0,0 +1,83 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2020 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_DXBC_SHADER_H_ +#define XENIA_GPU_DXBC_SHADER_H_ + +#include + +#include "xenia/gpu/dxbc_shader_translator.h" +#include "xenia/gpu/shader.h" +#include "xenia/gpu/xenos.h" + +namespace xe { +namespace gpu { + +class DxbcShader : public Shader { + public: + class DxbcTranslation : public Translation { + public: + DxbcTranslation(DxbcShader& shader, uint32_t modification) + : Translation(shader, modification) {} + }; + + DxbcShader(xenos::ShaderType shader_type, uint64_t data_hash, + const uint32_t* dword_ptr, uint32_t dword_count); + + static constexpr uint32_t kMaxTextureBindingIndexBits = + DxbcShaderTranslator::kMaxTextureBindingIndexBits; + static constexpr uint32_t kMaxTextureBindings = + DxbcShaderTranslator::kMaxTextureBindings; + struct TextureBinding { + uint32_t bindless_descriptor_index; + uint32_t fetch_constant; + // Stacked and 3D are separate TextureBindings, even for bindless for null + // descriptor handling simplicity. + xenos::FetchOpDimension dimension; + bool is_signed; + }; + // Safe to hash and compare with memcmp for layout hashing. 
+ const TextureBinding* GetTextureBindings(uint32_t& count_out) const { + count_out = uint32_t(texture_bindings_.size()); + return texture_bindings_.data(); + } + const uint32_t GetUsedTextureMask() const { return used_texture_mask_; } + + static constexpr uint32_t kMaxSamplerBindingIndexBits = + DxbcShaderTranslator::kMaxSamplerBindingIndexBits; + static constexpr uint32_t kMaxSamplerBindings = + DxbcShaderTranslator::kMaxSamplerBindings; + struct SamplerBinding { + uint32_t bindless_descriptor_index; + uint32_t fetch_constant; + xenos::TextureFilter mag_filter; + xenos::TextureFilter min_filter; + xenos::TextureFilter mip_filter; + xenos::AnisoFilter aniso_filter; + }; + const SamplerBinding* GetSamplerBindings(uint32_t& count_out) const { + count_out = uint32_t(sampler_bindings_.size()); + return sampler_bindings_.data(); + } + + protected: + Translation* CreateTranslationInstance(uint32_t modification) override; + + private: + friend class DxbcShaderTranslator; + + std::vector texture_bindings_; + std::vector sampler_bindings_; + uint32_t used_texture_mask_ = 0; +}; + +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_DXBC_SHADER_H_ diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index a08cafd5e..865fbd77e 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -19,6 +19,7 @@ #include "xenia/base/assert.h" #include "xenia/base/cvar.h" #include "xenia/base/math.h" +#include "xenia/gpu/dxbc_shader.h" DEFINE_bool(dxbc_switch, true, "Use switch rather than if for flow control. Turning this off or " @@ -76,64 +77,31 @@ DxbcShaderTranslator::DxbcShaderTranslator(uint32_t vendor_id, } DxbcShaderTranslator::~DxbcShaderTranslator() = default; -std::vector DxbcShaderTranslator::ForceEarlyDepthStencil( - const uint8_t* shader) { - const uint32_t* old_shader = reinterpret_cast(shader); - - // To return something anyway even if patching fails. 
- std::vector new_shader; - uint32_t shader_size_bytes = old_shader[6]; - new_shader.resize(shader_size_bytes); - std::memcpy(new_shader.data(), shader, shader_size_bytes); - - // Find the SHEX chunk. - uint32_t chunk_count = old_shader[7]; - for (uint32_t i = 0; i < chunk_count; ++i) { - uint32_t chunk_offset_bytes = old_shader[8 + i]; - const uint32_t* chunk = old_shader + chunk_offset_bytes / sizeof(uint32_t); - if (chunk[0] != 'XEHS') { - continue; - } - // Find dcl_globalFlags and patch it. - uint32_t code_size_dwords = chunk[3]; - chunk += 4; - for (uint32_t j = 0; j < code_size_dwords;) { - uint32_t opcode_token = chunk[j]; - uint32_t opcode = DECODE_D3D10_SB_OPCODE_TYPE(opcode_token); - if (opcode == D3D10_SB_OPCODE_DCL_GLOBAL_FLAGS) { - opcode_token |= D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL; - std::memcpy(new_shader.data() + - (chunk_offset_bytes + (4 + j) * sizeof(uint32_t)), - &opcode_token, sizeof(uint32_t)); - // Recalculate the checksum since the shader was modified. 
- CalculateDXBCChecksum( - reinterpret_cast(new_shader.data()), - shader_size_bytes, - reinterpret_cast(new_shader.data() + - sizeof(uint32_t))); - break; - } - if (opcode == D3D10_SB_OPCODE_CUSTOMDATA) { - j += chunk[j + 1]; - } else { - j += DECODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(opcode_token); - } - } - break; - } - - return std::move(new_shader); -} - std::vector DxbcShaderTranslator::CreateDepthOnlyPixelShader() { - Reset(); + Reset(xenos::ShaderType::kPixel); is_depth_only_pixel_shader_ = true; StartTranslation(); return std::move(CompleteTranslation()); } -void DxbcShaderTranslator::Reset() { - ShaderTranslator::Reset(); +uint32_t DxbcShaderTranslator::GetDefaultModification( + xenos::ShaderType shader_type, + Shader::HostVertexShaderType host_vertex_shader_type) const { + Modification shader_modification; + switch (shader_type) { + case xenos::ShaderType::kVertex: + shader_modification.host_vertex_shader_type = host_vertex_shader_type; + break; + case xenos::ShaderType::kPixel: + shader_modification.depth_stencil_mode = + Modification::DepthStencilMode::kNoModifiers; + break; + } + return shader_modification.value; +} + +void DxbcShaderTranslator::Reset(xenos::ShaderType shader_type) { + ShaderTranslator::Reset(shader_type); shader_code_.clear(); @@ -152,7 +120,7 @@ void DxbcShaderTranslator::Reset() { in_domain_location_used_ = 0; in_primitive_id_used_ = false; in_control_point_index_used_ = false; - in_position_xy_used_ = false; + in_position_used_ = 0; in_front_face_used_ = false; system_temp_count_current_ = 0; @@ -457,7 +425,9 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() { // Remember that x# are only accessible via mov load or store - use a // temporary variable if need to do any computations! 
- switch (host_vertex_shader_type()) { + Shader::HostVertexShaderType host_vertex_shader_type = + GetDxbcShaderModification().host_vertex_shader_type; + switch (host_vertex_shader_type) { case Shader::HostVertexShaderType::kVertex: StartVertexShader_LoadVertexIndex(); break; @@ -618,7 +588,7 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() { default: // TODO(Triang3l): Support line and non-adaptive quad patches. - assert_unhandled_case(host_vertex_shader_type()); + assert_unhandled_case(host_vertex_shader_type); EmitTranslationError( "Unsupported host vertex shader type in StartVertexOrDomainShader"); break; @@ -720,7 +690,7 @@ void DxbcShaderTranslator::StartPixelShader() { // faceness as X sign bit. Using Z as scratch register now. if (edram_rov_used_) { // Get XY address of the current host pixel as float. - in_position_xy_used_ = true; + in_position_used_ |= 0b0011; DxbcOpRoundZ(DxbcDest::R(param_gen_temp, 0b0011), DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition))); // Revert resolution scale - after truncating, so if the pixel position @@ -744,7 +714,7 @@ void DxbcShaderTranslator::StartPixelShader() { } else { // Get XY address of the current SSAA sample by converting // SV_Position.xy to an integer. - in_position_xy_used_ = true; + in_position_used_ |= 0b0011; DxbcOpFToU(DxbcDest::R(param_gen_temp, 0b0011), DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition))); // Undo SSAA that is used instead of MSAA - since it's used as a @@ -870,7 +840,7 @@ void DxbcShaderTranslator::StartPixelShader() { void DxbcShaderTranslator::StartTranslation() { // Allocate global system temporary registers that may also be used in the // epilogue. 
- if (IsDxbcVertexOrDomainShader()) { + if (is_vertex_shader()) { system_temp_position_ = PushSystemTemp(0b1111); system_temp_point_size_edge_flag_kill_vertex_ = PushSystemTemp(0b0100); // Set the point size to a negative value to tell the geometry shader that @@ -879,20 +849,21 @@ void DxbcShaderTranslator::StartTranslation() { DxbcOpMov( DxbcDest::R(system_temp_point_size_edge_flag_kill_vertex_, 0b0001), DxbcSrc::LF(-1.0f)); - } else if (IsDxbcPixelShader()) { + } else if (is_pixel_shader()) { if (edram_rov_used_) { // Will be initialized unconditionally. system_temp_rov_params_ = PushSystemTemp(); - if (ROV_IsDepthStencilEarly() || writes_depth()) { - // If the shader doesn't write to oDepth, each component will be written - // to if depth/stencil is enabled and the respective sample is covered - - // so need to initialize now because the first writes will be - // conditional. If the shader writes to oDepth, this is oDepth of the - // shader, written by the guest code, so initialize because assumptions - // can't be made about the integrity of the guest code. - system_temp_rov_depth_stencil_ = - PushSystemTemp(writes_depth() ? 0b0001 : 0b1111); - } + } + if (IsDepthStencilSystemTempUsed()) { + // If the shader doesn't write to oDepth, and ROV is used, each + // component will be written to if depth/stencil is enabled and the + // respective sample is covered - so need to initialize now because the + // first writes will be conditional. + // If the shader writes to oDepth, this is oDepth of the shader, written + // by the guest code, so initialize because assumptions can't be made + // about the integrity of the guest code. + system_temp_depth_stencil_ = + PushSystemTemp(writes_depth() ? 
0b0001 : 0b1111); } for (uint32_t i = 0; i < 4; ++i) { if (writes_color_target(i)) { @@ -942,7 +913,7 @@ void DxbcShaderTranslator::StartTranslation() { // Zero general-purpose registers to prevent crashes when the game // references them after only initializing them conditionally. - for (uint32_t i = IsDxbcPixelShader() ? xenos::kMaxInterpolators : 0; + for (uint32_t i = is_pixel_shader() ? xenos::kMaxInterpolators : 0; i < register_count(); ++i) { DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, i) : DxbcDest::R(i), @@ -951,9 +922,9 @@ void DxbcShaderTranslator::StartTranslation() { } // Write stage-specific prologue. - if (IsDxbcVertexOrDomainShader()) { + if (is_vertex_shader()) { StartVertexOrDomainShader(); - } else if (IsDxbcPixelShader()) { + } else if (is_pixel_shader()) { StartPixelShader(); } @@ -1168,31 +1139,31 @@ void DxbcShaderTranslator::CompleteShaderCode() { } // Write stage-specific epilogue. - if (IsDxbcVertexOrDomainShader()) { + if (is_vertex_shader()) { CompleteVertexOrDomainShader(); - } else if (IsDxbcPixelShader()) { + } else if (is_pixel_shader()) { CompletePixelShader(); } // Return from `main`. DxbcOpRet(); - if (IsDxbcVertexOrDomainShader()) { + if (is_vertex_shader()) { // Release system_temp_position_ and // system_temp_point_size_edge_flag_kill_vertex_. PopSystemTemp(2); - } else if (IsDxbcPixelShader()) { + } else if (is_pixel_shader()) { // Release system_temps_color_. for (int32_t i = 3; i >= 0; --i) { if (writes_color_target(i)) { PopSystemTemp(); } } + if (IsDepthStencilSystemTempUsed()) { + // Release system_temp_depth_stencil_. + PopSystemTemp(); + } if (edram_rov_used_) { - if (ROV_IsDepthStencilEarly() || writes_depth()) { - // Release system_temp_rov_depth_stencil_. - PopSystemTemp(); - } // Release system_temp_rov_params_. 
PopSystemTemp(); } @@ -1303,6 +1274,44 @@ std::vector DxbcShaderTranslator::CompleteTranslation() { return shader_object_bytes; } +void DxbcShaderTranslator::PostTranslation( + Shader::Translation& translation, bool setup_shader_post_translation_info) { + if (setup_shader_post_translation_info) { + DxbcShader* dxbc_shader = dynamic_cast(&translation.shader()); + if (dxbc_shader) { + dxbc_shader->texture_bindings_.clear(); + dxbc_shader->texture_bindings_.reserve(texture_bindings_.size()); + dxbc_shader->used_texture_mask_ = 0; + for (const TextureBinding& translator_binding : texture_bindings_) { + DxbcShader::TextureBinding& shader_binding = + dxbc_shader->texture_bindings_.emplace_back(); + // For a stable hash. + std::memset(&shader_binding, 0, sizeof(shader_binding)); + shader_binding.bindless_descriptor_index = + translator_binding.bindless_descriptor_index; + shader_binding.fetch_constant = translator_binding.fetch_constant; + shader_binding.dimension = translator_binding.dimension; + shader_binding.is_signed = translator_binding.is_signed; + dxbc_shader->used_texture_mask_ |= 1u + << translator_binding.fetch_constant; + } + dxbc_shader->sampler_bindings_.clear(); + dxbc_shader->sampler_bindings_.reserve(sampler_bindings_.size()); + for (const SamplerBinding& translator_binding : sampler_bindings_) { + DxbcShader::SamplerBinding& shader_binding = + dxbc_shader->sampler_bindings_.emplace_back(); + shader_binding.bindless_descriptor_index = + translator_binding.bindless_descriptor_index; + shader_binding.fetch_constant = translator_binding.fetch_constant; + shader_binding.mag_filter = translator_binding.mag_filter; + shader_binding.min_filter = translator_binding.min_filter; + shader_binding.mip_filter = translator_binding.mip_filter; + shader_binding.aniso_filter = translator_binding.aniso_filter; + } + } + } +} + void DxbcShaderTranslator::EmitInstructionDisassembly() { if (!emit_source_map_) { return; @@ -1527,19 +1536,20 @@ void 
DxbcShaderTranslator::StoreResult(const InstructionResult& result, } break; case InstructionStorageTarget::kDepth: - // Writes X to scalar oDepth or to X of system_temp_rov_depth_stencil_, no + // Writes X to scalar oDepth or to X of system_temp_depth_stencil_, no // additional swizzling needed. assert_true(used_write_mask == 0b0001); assert_true(writes_depth()); - if (edram_rov_used_) { - dest = DxbcDest::R(system_temp_rov_depth_stencil_); + if (IsDepthStencilSystemTempUsed()) { + dest = DxbcDest::R(system_temp_depth_stencil_); } else { dest = DxbcDest::ODepth(); } - // Depth outside [0, 1] is not safe for use with the ROV code. Though 20e4 - // float depth can store values below 2, it's a very unusual case. - // Direct3D 10+ SV_Depth, however, can accept any values, including - // specials, when the depth buffer is floating-point. + // Depth outside [0, 1] is not safe for use with the ROV code and with + // 20e4-as-32 conversion. Though 20e4 float depth can store values between + // 1 and 2, it's a very unusual case. Direct3D 10+ SV_Depth, however, can + // accept any values, including specials, when the depth buffer is + // floating-point; but depth is clamped to the viewport bounds anyway. is_clamped = true; break; } @@ -2094,7 +2104,7 @@ void DxbcShaderTranslator::WriteResourceDefinitions() { // ds_5_1 shader_object_.push_back(0x44530501u); } else { - assert_true(IsDxbcPixelShader()); + assert_true(is_pixel_shader()); // ps_5_1 shader_object_.push_back(0xFFFF0501u); } @@ -2765,7 +2775,7 @@ void DxbcShaderTranslator::WriteInputSignature() { control_point_index.semantic_name = semantic_offset; } semantic_offset += AppendString(shader_object_, "XEVERTEXID"); - } else if (IsDxbcPixelShader()) { + } else if (is_pixel_shader()) { // Written dynamically, so assume it's always used if it can be written to // any interpolator register. 
bool param_gen_used = !is_depth_only_pixel_shader_ && register_count() != 0; @@ -2843,7 +2853,7 @@ void DxbcShaderTranslator::WriteInputSignature() { position.component_type = DxbcSignatureRegisterComponentType::kFloat32; position.register_index = uint32_t(InOutRegister::kPSInPosition); position.mask = 0b1111; - position.always_reads_mask = in_position_xy_used_ ? 0b0011 : 0b0000; + position.always_reads_mask = in_position_used_; } // Is front face (SV_IsFrontFace). @@ -2927,7 +2937,9 @@ void DxbcShaderTranslator::WritePatchConstantSignature() { DxbcName tess_factor_edge_system_value = DxbcName::kUndefined; uint32_t tess_factor_inside_count = 0; DxbcName tess_factor_inside_system_value = DxbcName::kUndefined; - switch (host_vertex_shader_type()) { + Shader::HostVertexShaderType host_vertex_shader_type = + GetDxbcShaderModification().host_vertex_shader_type; + switch (host_vertex_shader_type) { case Shader::HostVertexShaderType::kTriangleDomainCPIndexed: case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed: tess_factor_edge_count = 3; @@ -2944,7 +2956,7 @@ void DxbcShaderTranslator::WritePatchConstantSignature() { break; default: // TODO(Triang3l): Support line patches. - assert_unhandled_case(host_vertex_shader_type()); + assert_unhandled_case(host_vertex_shader_type); EmitTranslationError( "Unsupported host vertex shader type in WritePatchConstantSignature"); } @@ -3033,7 +3045,7 @@ void DxbcShaderTranslator::WriteOutputSignature() { constexpr size_t kParameterDwords = sizeof(DxbcSignatureParameter) / sizeof(uint32_t); - if (IsDxbcVertexOrDomainShader()) { + if (is_vertex_shader()) { // Intepolators (TEXCOORD#). 
size_t interpolator_position = shader_object_.size(); shader_object_.resize(shader_object_.size() + @@ -3195,7 +3207,7 @@ void DxbcShaderTranslator::WriteOutputSignature() { cull_distance.semantic_name = semantic_offset; } semantic_offset += AppendString(shader_object_, "SV_CullDistance"); - } else if (IsDxbcPixelShader()) { + } else if (is_pixel_shader()) { if (!edram_rov_used_) { // Color render targets (SV_Target#). size_t target_position = SIZE_MAX; @@ -3217,9 +3229,11 @@ void DxbcShaderTranslator::WriteOutputSignature() { } } - // Depth (SV_Depth). + // Depth (SV_Depth or SV_DepthLessEqual). + Modification::DepthStencilMode depth_stencil_mode = + GetDxbcShaderModification().depth_stencil_mode; size_t depth_position = SIZE_MAX; - if (writes_depth()) { + if (writes_depth() || DSV_IsWritingFloat24Depth()) { depth_position = shader_object_.size(); shader_object_.resize(shader_object_.size() + kParameterDwords); ++parameter_count; @@ -3253,7 +3267,15 @@ void DxbcShaderTranslator::WriteOutputSignature() { depth_position); depth.semantic_name = semantic_offset; } - semantic_offset += AppendString(shader_object_, "SV_Depth"); + const char* depth_semantic_name; + if (!writes_depth() && + GetDxbcShaderModification().depth_stencil_mode == + Modification::DepthStencilMode::kFloat24Truncating) { + depth_semantic_name = "SV_DepthLessEqual"; + } else { + depth_semantic_name = "SV_Depth"; + } + semantic_offset += AppendString(shader_object_, depth_semantic_name); } } } @@ -3276,7 +3298,7 @@ void DxbcShaderTranslator::WriteShaderCode() { } else if (IsDxbcDomainShader()) { shader_type = D3D11_SB_DOMAIN_SHADER; } else { - assert_true(IsDxbcPixelShader()); + assert_true(is_pixel_shader()); shader_type = D3D10_SB_PIXEL_SHADER; } shader_object_.push_back( @@ -3296,12 +3318,14 @@ void DxbcShaderTranslator::WriteShaderCode() { // Inputs/outputs have 1D-indexed operands with a component mask and a // register index. 
+ Modification shader_modification = GetDxbcShaderModification(); + if (IsDxbcDomainShader()) { // Not using control point data since Xenos only has a vertex shader acting // as both vertex shader and domain shader. stat_.c_control_points = 3; stat_.tessellator_domain = DxbcTessellatorDomain::kTriangle; - switch (host_vertex_shader_type()) { + switch (shader_modification.host_vertex_shader_type) { case Shader::HostVertexShaderType::kTriangleDomainCPIndexed: case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed: stat_.c_control_points = 3; @@ -3314,7 +3338,7 @@ void DxbcShaderTranslator::WriteShaderCode() { break; default: // TODO(Triang3l): Support line patches. - assert_unhandled_case(host_vertex_shader_type()); + assert_unhandled_case(shader_modification.host_vertex_shader_type); EmitTranslationError( "Unsupported host vertex shader type in WriteShaderCode"); } @@ -3330,11 +3354,17 @@ void DxbcShaderTranslator::WriteShaderCode() { } // Don't allow refactoring when converting to native code to maintain position - // invariance (needed even in pixel shaders for oDepth invariance). Also this - // dcl will be modified by ForceEarlyDepthStencil. - shader_object_.push_back( + // invariance (needed even in pixel shaders for oDepth invariance). + uint32_t global_flags_opcode = ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_GLOBAL_FLAGS) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1); + if (is_pixel_shader() && + GetDxbcShaderModification().depth_stencil_mode == + Modification::DepthStencilMode::kEarlyHint && + !edram_rov_used_ && CanWriteZEarly()) { + global_flags_opcode |= D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL; + } + shader_object_.push_back(global_flags_opcode); // Constant buffers, from most frequenly accessed to least frequently accessed // (the order is a hint to the driver according to the DXBC header). 
@@ -3560,7 +3590,7 @@ void DxbcShaderTranslator::WriteShaderCode() { } // Inputs and outputs. - if (IsDxbcVertexOrDomainShader()) { + if (is_vertex_shader()) { if (IsDxbcDomainShader()) { if (in_domain_location_used_) { // Domain location input. @@ -3584,7 +3614,7 @@ void DxbcShaderTranslator::WriteShaderCode() { if (in_control_point_index_used_) { // Control point indices as float input. uint32_t control_point_array_size; - switch (host_vertex_shader_type()) { + switch (shader_modification.host_vertex_shader_type) { case Shader::HostVertexShaderType::kTriangleDomainCPIndexed: control_point_array_size = 3; break; @@ -3593,7 +3623,7 @@ void DxbcShaderTranslator::WriteShaderCode() { break; default: // TODO(Triang3l): Support line patches. - assert_unhandled_case(host_vertex_shader_type()); + assert_unhandled_case(shader_modification.host_vertex_shader_type); EmitTranslationError( "Unsupported host vertex shader type in " "StartVertexOrDomainShader"); @@ -3683,7 +3713,8 @@ void DxbcShaderTranslator::WriteShaderCode() { uint32_t(InOutRegister::kVSDSOutClipDistance45AndCullDistance)); shader_object_.push_back(ENCODE_D3D10_SB_NAME(D3D10_SB_NAME_CULL_DISTANCE)); ++stat_.dcl_count; - } else if (IsDxbcPixelShader()) { + } else if (is_pixel_shader()) { + bool is_writing_float24_depth = DSV_IsWritingFloat24Depth(); // Interpolator input. if (!is_depth_only_pixel_shader_) { uint32_t interpolator_count = @@ -3725,16 +3756,26 @@ void DxbcShaderTranslator::WriteShaderCode() { shader_object_.push_back(uint32_t(InOutRegister::kPSInClipSpaceZW)); ++stat_.dcl_count; } - if (in_position_xy_used_) { - // Position input (only XY needed for ps_param_gen, and the ROV depth code - // calculates the depth from clip space Z and W). + if (in_position_used_) { + // Position input (XY needed for ps_param_gen, Z needed for non-ROV + // float24 conversion; the ROV depth code calculates the depth from the + // clip space Z and W with pull-mode per-sample interpolation instead). 
+ // At the cost of possibility of MSAA with pixel-rate shading, need + // per-sample depth - otherwise intersections cannot be antialiased, and + // with SV_DepthLessEqual, per-sample (or centroid, but this isn't + // applicable here) position is mandatory. However, with depth output, on + // the guest, there's only one depth value for the whole pixel. + D3D10_SB_INTERPOLATION_MODE position_interpolation_mode = + is_writing_float24_depth && !writes_depth() + ? D3D10_SB_INTERPOLATION_LINEAR_NOPERSPECTIVE_SAMPLE + : D3D10_SB_INTERPOLATION_LINEAR_NOPERSPECTIVE; shader_object_.push_back( ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_INPUT_PS_SIV) | ENCODE_D3D10_SB_INPUT_INTERPOLATION_MODE( - D3D10_SB_INTERPOLATION_LINEAR_NOPERSPECTIVE) | + position_interpolation_mode) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(4)); - shader_object_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_INPUT, 0b0011, 1)); + shader_object_.push_back(EncodeVectorMaskedOperand( + D3D10_SB_OPERAND_TYPE_INPUT, in_position_used_, 1)); shader_object_.push_back(uint32_t(InOutRegister::kPSInPosition)); shader_object_.push_back(ENCODE_D3D10_SB_NAME(D3D10_SB_NAME_POSITION)); ++stat_.dcl_count; @@ -3778,12 +3819,19 @@ void DxbcShaderTranslator::WriteShaderCode() { } } // Depth output. 
- if (writes_depth()) { + if (is_writing_float24_depth || writes_depth()) { + D3D10_SB_OPERAND_TYPE depth_operand_type; + if (!writes_depth() && + GetDxbcShaderModification().depth_stencil_mode == + Modification::DepthStencilMode::kFloat24Truncating) { + depth_operand_type = D3D11_SB_OPERAND_TYPE_OUTPUT_DEPTH_LESS_EQUAL; + } else { + depth_operand_type = D3D10_SB_OPERAND_TYPE_OUTPUT_DEPTH; + } shader_object_.push_back( ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_OUTPUT) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(2)); - shader_object_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_OUTPUT_DEPTH, 0)); + shader_object_.push_back(EncodeScalarOperand(depth_operand_type, 0)); ++stat_.dcl_count; } } diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 9edc40b56..2ca52e7f5 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -102,6 +102,51 @@ class DxbcShaderTranslator : public ShaderTranslator { bool edram_rov_used, bool force_emit_source_map = false); ~DxbcShaderTranslator() override; + union Modification { + // If anything in this structure is changed in a way not compatible with + // the previous layout, invalidate the pipeline storages by increasing this + // version number (0xYYYYMMDD)! + static constexpr uint32_t kVersion = 0x20201203; + + enum class DepthStencilMode : uint32_t { + kNoModifiers, + // [earlydepthstencil] - enable if alpha test and alpha to coverage are + // disabled; ignored if anything in the shader blocks early Z writing + // (which is not known before translation, so this will be set anyway). 
+ kEarlyHint, + // Converting the depth to the closest 32-bit float representable exactly + // as a 20e4 float, to support invariance in cases when the guest + // reuploads a previously resolved depth buffer to the EDRAM, rounding + // towards zero (which contradicts the rounding used by the Direct3D 9 + // reference rasterizer, but allows SV_DepthLessEqual to be used to allow + // slightly coarse early Z culling; also truncating regardless of whether + // the shader writes depth and thus always uses SV_Depth, for + // consistency). MSAA is limited - depth must be per-sample + // (SV_DepthLessEqual also explicitly requires sample or centroid position + // interpolation), thus the sampler has to run at sample frequency even if + // the device supports stencil loading and thus true non-ROV MSAA via + // SV_StencilRef. + // Fixed-function viewport depth bounds must be snapped to float24 for + // clamping purposes. + kFloat24Truncating, + // Similar to kFloat24Truncating, but rounding to the nearest even, + // however, always using SV_Depth rather than SV_DepthLessEqual because + // rounding up results in a bigger value. Same viewport usage rules apply. + kFloat24Rounding, + }; + + struct { + // VS - pipeline stage and input configuration. + Shader::HostVertexShaderType host_vertex_shader_type + : Shader::kHostVertexShaderTypeBitCount; + // PS, non-ROV - depth / stencil output mode. + DepthStencilMode depth_stencil_mode : 2; + }; + uint32_t value = 0; + + Modification(uint32_t modification_value = 0) : value(modification_value) {} + }; + // Constant buffer bindings in space 0. enum class CbufferRegister { kSystemConstants, @@ -238,15 +283,15 @@ class DxbcShaderTranslator : public ShaderTranslator { // EDRAM address calculation. uint32_t sample_count_log2[2]; float alpha_test_reference; + // If alpha to mask is disabled, the entire alpha_to_mask value must be 0. + // If alpha to mask is enabled, bits 0:7 are sample offsets, and bit 8 must + // be 1. 
uint32_t alpha_to_mask; float color_exp_bias[4]; uint32_t color_output_map[4]; - // If alpha to mask is disabled, the entire alpha_to_mask value must be 0. - // If alpha to mask is enabled, bits 0:7 are sample offsets, and bit 8 must - // be 1. uint32_t edram_resolution_square_scale; uint32_t edram_pitch_tiles; union { @@ -358,12 +403,6 @@ class DxbcShaderTranslator : public ShaderTranslator { bool is_signed; std::string name; }; - // The first binding returned is at t[SRVMainRegister::kBindfulTexturesStart] - // of space SRVSpace::kMain. - const TextureBinding* GetTextureBindings(uint32_t& count_out) const { - count_out = uint32_t(texture_bindings_.size()); - return texture_bindings_.data(); - } // Arbitrary limit - there can't be more than 2048 in a shader-visible // descriptor heap, though some older hardware (tier 1 resource binding - @@ -385,16 +424,6 @@ class DxbcShaderTranslator : public ShaderTranslator { xenos::AnisoFilter aniso_filter; std::string name; }; - const SamplerBinding* GetSamplerBindings(uint32_t& count_out) const { - count_out = uint32_t(sampler_bindings_.size()); - return sampler_bindings_.data(); - } - - // Returns the number of texture SRV and sampler offsets that need to be - // passed via a constant buffer to the shader. - uint32_t GetBindlessResourceCount() const { - return uint32_t(texture_bindings_.size() + sampler_bindings_.size()); - } // Unordered access view bindings in space 0. enum class UAVRegister { @@ -402,10 +431,6 @@ class DxbcShaderTranslator : public ShaderTranslator { kEdram, }; - // Creates a copy of the shader with early depth/stencil testing forced, - // overriding that alpha testing is used in the shader. - static std::vector ForceEarlyDepthStencil(const uint8_t* shader); - // Returns the format with internal flags for passing via the // edram_rt_format_flags system constant. 
static constexpr uint32_t ROV_AddColorFormatFlags( @@ -440,16 +465,22 @@ class DxbcShaderTranslator : public ShaderTranslator { float& clamp_alpha_high, uint32_t& keep_mask_low, uint32_t& keep_mask_high); + uint32_t GetDefaultModification( + xenos::ShaderType shader_type, + Shader::HostVertexShaderType host_vertex_shader_type = + Shader::HostVertexShaderType::kVertex) const override; + // Creates a special pixel shader without color outputs - this resets the // state of the translator. std::vector CreateDepthOnlyPixelShader(); protected: - void Reset() override; + void Reset(xenos::ShaderType shader_type) override; void StartTranslation() override; - std::vector CompleteTranslation() override; + void PostTranslation(Shader::Translation& translation, + bool setup_shader_post_translation_info) override; void ProcessLabel(uint32_t cf_index) override; @@ -650,6 +681,7 @@ class DxbcShaderTranslator : public ShaderTranslator { kInputDomainPoint = 28, kUnorderedAccessView = 30, kInputCoverageMask = 35, + kOutputDepthLessEqual = 39, }; // D3D10_SB_OPERAND_INDEX_DIMENSION @@ -689,6 +721,7 @@ class DxbcShaderTranslator : public ShaderTranslator { return DxbcOperandDimension::kNoData; case DxbcOperandType::kInputPrimitiveID: case DxbcOperandType::kOutputDepth: + case DxbcOperandType::kOutputDepthLessEqual: return DxbcOperandDimension::kScalar; case DxbcOperandType::kInputCoverageMask: return dest_in_dcl ? 
DxbcOperandDimension::kScalar @@ -860,6 +893,9 @@ class DxbcShaderTranslator : public ShaderTranslator { return DxbcDest(DxbcOperandType::kUnorderedAccessView, write_mask, index_1d, index_2d); } + static DxbcDest ODepthLE() { + return DxbcDest(DxbcOperandType::kOutputDepthLessEqual, 0b0001); + } uint32_t GetMask() const { switch (GetDimension()) { @@ -2145,21 +2181,19 @@ class DxbcShaderTranslator : public ShaderTranslator { (index_representation_1 << 25) | (index_representation_2 << 28); } - // Use these instead of is_vertex_shader/is_pixel_shader because they don't - // take is_depth_only_pixel_shader_ into account. - inline bool IsDxbcVertexOrDomainShader() const { - return !is_depth_only_pixel_shader_ && is_vertex_shader(); + Modification GetDxbcShaderModification() const { + return Modification(modification()); } - inline bool IsDxbcVertexShader() const { - return IsDxbcVertexOrDomainShader() && - host_vertex_shader_type() == Shader::HostVertexShaderType::kVertex; + + bool IsDxbcVertexShader() const { + return is_vertex_shader() && + GetDxbcShaderModification().host_vertex_shader_type == + Shader::HostVertexShaderType::kVertex; } - inline bool IsDxbcDomainShader() const { - return IsDxbcVertexOrDomainShader() && - host_vertex_shader_type() != Shader::HostVertexShaderType::kVertex; - } - inline bool IsDxbcPixelShader() const { - return is_depth_only_pixel_shader_ || is_pixel_shader(); + bool IsDxbcDomainShader() const { + return is_vertex_shader() && + GetDxbcShaderModification().host_vertex_shader_type != + Shader::HostVertexShaderType::kVertex; } // Whether to use switch-case rather than if (pc >= label) for control flow. 
@@ -2181,10 +2215,37 @@ class DxbcShaderTranslator : public ShaderTranslator { uint32_t piece_temp_component, uint32_t accumulator_temp, uint32_t accumulator_temp_component); + // Converts the depth value externally clamped to the representable [0, 2) + // range to 20e4 floating point, with zeros in bits 24:31, rounding to the + // nearest even. Source and destination may be the same, temporary must be + // different than both. + void PreClampedDepthTo20e4(uint32_t d24_temp, uint32_t d24_temp_component, + uint32_t d32_temp, uint32_t d32_temp_component, + uint32_t temp_temp, uint32_t temp_temp_component); + bool IsDepthStencilSystemTempUsed() const { + // See system_temp_depth_stencil_ documentation for explanation of cases. + if (edram_rov_used_) { + return writes_depth() || ROV_IsDepthStencilEarly(); + } + return writes_depth() && DSV_IsWritingFloat24Depth(); + } + // Whether the current non-ROV pixel shader should convert the depth to 20e4. + bool DSV_IsWritingFloat24Depth() const { + if (edram_rov_used_) { + return false; + } + Modification::DepthStencilMode depth_stencil_mode = + GetDxbcShaderModification().depth_stencil_mode; + return depth_stencil_mode == + Modification::DepthStencilMode::kFloat24Truncating || + depth_stencil_mode == + Modification::DepthStencilMode::kFloat24Rounding; + } // Whether it's possible and worth skipping running the translated shader for // 2x2 quads. bool ROV_IsDepthStencilEarly() const { - return !is_depth_only_pixel_shader_ && !writes_depth(); + return !is_depth_only_pixel_shader_ && !writes_depth() && + memexport_stream_constants().empty(); } // Converts the depth value to 24-bit (storing the result in bits 0:23 and // zeros in 24:31, not creating room for stencil - since this may be involved @@ -2197,8 +2258,8 @@ class DxbcShaderTranslator : public ShaderTranslator { // Does all the depth/stencil-related things, including or not including // writing based on whether it's late, or on whether it's safe to do it early. 
// Updates system_temp_rov_params_ result and coverage if allowed and safe, - // updates system_temp_rov_depth_stencil_, and if early and the coverage is - // empty for all pixels in the 2x2 quad and safe to return early (stencil is + // updates system_temp_depth_stencil_, and if early and the coverage is empty + // for all pixels in the 2x2 quad and safe to return early (stencil is // unchanged or known that it's safe not to await kills/alphatest/AtoC), // returns from the shader. void ROV_DepthStencilTest(); @@ -2248,6 +2309,7 @@ class DxbcShaderTranslator : public ShaderTranslator { // Discards the SSAA sample if it's masked out by alpha to coverage. void CompletePixelShader_WriteToRTVs_AlphaToMask(); void CompletePixelShader_WriteToRTVs(); + void CompletePixelShader_DSV_DepthTo24Bit(); // Masks the sample away from system_temp_rov_params_.x if it's not covered. // threshold_offset and temp.temp_component can be the same if needed. void CompletePixelShader_ROV_AlphaToMaskSample( @@ -2333,6 +2395,11 @@ class DxbcShaderTranslator : public ShaderTranslator { xenos::TextureFilter min_filter, xenos::TextureFilter mip_filter, xenos::AnisoFilter aniso_filter); + // Returns the number of texture SRV and sampler offsets that need to be + // passed via a constant buffer to the shader. + uint32_t GetBindlessResourceCount() const { + return uint32_t(texture_bindings_.size() + sampler_bindings_.size()); + } // Marks fetch constants as used by the DXBC shader and returns DxbcSrc // for the words 01 (pair 0), 23 (pair 1) or 45 (pair 2) of the texture fetch // constant. @@ -2364,7 +2431,7 @@ class DxbcShaderTranslator : public ShaderTranslator { static uint32_t AppendString(std::vector& dest, const char* source); // Returns the length of a string as if it was appended to a DWORD stream, in // bytes. 
- static inline uint32_t GetStringLength(const char* source) { + static uint32_t GetStringLength(const char* source) { return uint32_t(xe::align(std::strlen(source) + 1, sizeof(uint32_t))); } @@ -2479,8 +2546,8 @@ class DxbcShaderTranslator : public ShaderTranslator { bool in_primitive_id_used_; // Whether InOutRegister::kDSInControlPointIndex has been used in the shader. bool in_control_point_index_used_; - // Whether the XY of the pixel position has been used in the pixel shader. - bool in_position_xy_used_; + // Mask of the pixel/sample position actually used in the pixel shader. + uint32_t in_position_used_; // Whether the faceness has been used in the pixel shader. bool in_front_face_used_; @@ -2518,15 +2585,14 @@ class DxbcShaderTranslator : public ShaderTranslator { // W - Base-relative resolution-scaled EDRAM offset for 64bpp color data, in // dwords. uint32_t system_temp_rov_params_; - // ROV only - new depth/stencil data. 4 VGPRs when not writing to oDepth, 1 - // VGPR when writing to oDepth. Not used in the depth-only pixel shader (or, - // more formally, if neither early depth-stencil nor oDepth are used) because - // it always calculates and writes in the same place. - // When not writing to oDepth: New per-sample depth/stencil values, generated - // during early depth/stencil test (actual writing checks coverage bits). - // When writing to oDepth: X also used to hold the depth written by the - // shader, later used as a temporary during depth/stencil testing. - uint32_t system_temp_rov_depth_stencil_; + // Two purposes: + // - When writing to oDepth, and either using ROV or converting the depth to + // float24: X also used to hold the depth written by the shader, + // later used as a temporary during depth/stencil testing. + // - Otherwise, when using ROV output with ROV_IsDepthStencilEarly being true: + // New per-sample depth/stencil values, generated during early depth/stencil + // test (actual writing checks coverage bits). 
+ uint32_t system_temp_depth_stencil_; // Up to 4 color outputs in pixel shaders (because of exponent bias, alpha // test and remapping, and also for ROV writing). uint32_t system_temps_color_[4]; @@ -2587,6 +2653,8 @@ class DxbcShaderTranslator : public ShaderTranslator { uint32_t srv_index_bindless_textures_3d_; uint32_t srv_index_bindless_textures_cube_; + // The first binding is at t[SRVMainRegister::kBindfulTexturesStart] of space + // SRVSpace::kMain. std::vector texture_bindings_; std::unordered_map texture_bindings_for_bindful_srv_indices_; diff --git a/src/xenia/gpu/dxbc_shader_translator_fetch.cc b/src/xenia/gpu/dxbc_shader_translator_fetch.cc index 76eed4d10..b4813b381 100644 --- a/src/xenia/gpu/dxbc_shader_translator_fetch.cc +++ b/src/xenia/gpu/dxbc_shader_translator_fetch.cc @@ -677,7 +677,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( // Whether to use gradients (implicit or explicit) for LOD calculation. bool use_computed_lod = instr.attributes.use_computed_lod && - (IsDxbcPixelShader() || instr.attributes.use_register_gradients); + (is_pixel_shader() || instr.attributes.use_register_gradients); if (instr.opcode == FetchOpcode::kGetTextureComputedLod && (!use_computed_lod || instr.attributes.use_register_gradients)) { assert_always(); diff --git a/src/xenia/gpu/dxbc_shader_translator_memexport.cc b/src/xenia/gpu/dxbc_shader_translator_memexport.cc index d20cb11bf..5f3d47bc0 100644 --- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc +++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc @@ -106,7 +106,7 @@ void DxbcShaderTranslator::ExportToMemory() { kSysConst_Flags_Vec) .Select(kSysConst_Flags_Comp), DxbcSrc::LU(kSysFlag_SharedMemoryIsUAV)); - if (IsDxbcPixelShader()) { + if (is_pixel_shader()) { // Disable memexport in pixel shaders with supersampling since VPOS is // ambiguous. 
if (edram_rov_used_) { diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc index 24963008f..f3b964ae2 100644 --- a/src/xenia/gpu/dxbc_shader_translator_om.cc +++ b/src/xenia/gpu/dxbc_shader_translator_om.cc @@ -167,7 +167,7 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { // bigger) to integer to system_temp_rov_params_.zw. // system_temp_rov_params_.z = X host pixel position as uint // system_temp_rov_params_.w = Y host pixel position as uint - in_position_xy_used_ = true; + in_position_used_ |= 0b0011; DxbcOpFToU(DxbcDest::R(system_temp_rov_params_, 0b1100), DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition), 0b01000000)); // Revert the resolution scale to convert the position to guest pixels. @@ -315,7 +315,7 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { // Add host pixel offsets. // system_temp_rov_params_.y = scaled 32bpp depth/stencil address // system_temp_rov_params_.z = scaled 32bpp color offset if needed - in_position_xy_used_ = true; + in_position_used_ |= 0b0011; for (uint32_t i = 0; i < 2; ++i) { // Convert a position component to integer. DxbcOpFToU(DxbcDest::R(system_temp_rov_params_, 0b0001), @@ -417,23 +417,50 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { // With early depth/stencil, depth/stencil writing may be deferred to the // end of the shader to prevent writing in case something (like alpha test, // which is dynamic GPU state) discards the pixel. So, write directly to the - // persistent register, system_temp_rov_depth_stencil_, instead of a local + // persistent register, system_temp_depth_stencil_, instead of a local // temporary register. DxbcDest sample_depth_stencil_dest( - depth_stencil_early - ? DxbcDest::R(system_temp_rov_depth_stencil_, 1 << i) - : temp_x_dest); + depth_stencil_early ? DxbcDest::R(system_temp_depth_stencil_, 1 << i) + : temp_x_dest); DxbcSrc sample_depth_stencil_src( - depth_stencil_early - ? 
DxbcSrc::R(system_temp_rov_depth_stencil_).Select(i) - : temp_x_src); + depth_stencil_early ? DxbcSrc::R(system_temp_depth_stencil_).Select(i) + : temp_x_src); if (!i) { if (writes_depth()) { + // Clamp oDepth to the lower viewport depth bound (depth clamp happens + // after the pixel shader in the pipeline, at least on Direct3D 11 and + // Vulkan, thus applies to the shader's depth output too). + system_constants_used_ |= 1ull << kSysConst_EdramDepthRange_Index; + DxbcOpMax(DxbcDest::R(system_temp_depth_stencil_, 0b0001), + DxbcSrc::R(system_temp_depth_stencil_, DxbcSrc::kXXXX), + DxbcSrc::CB(cbuffer_index_system_constants_, + uint32_t(CbufferRegister::kSystemConstants), + kSysConst_EdramDepthRange_Vec) + .Select(kSysConst_EdramDepthRangeOffset_Comp)); + // Calculate the upper Z range bound to temp.x for clamping after + // biasing. + // temp.x = viewport maximum depth + system_constants_used_ |= 1ull << kSysConst_EdramDepthRange_Index; + DxbcOpAdd(temp_x_dest, + DxbcSrc::CB(cbuffer_index_system_constants_, + uint32_t(CbufferRegister::kSystemConstants), + kSysConst_EdramDepthRange_Vec) + .Select(kSysConst_EdramDepthRangeOffset_Comp), + DxbcSrc::CB(cbuffer_index_system_constants_, + uint32_t(CbufferRegister::kSystemConstants), + kSysConst_EdramDepthRange_Vec) + .Select(kSysConst_EdramDepthRangeScale_Comp)); + // Clamp oDepth to the upper viewport depth bound (already not above 1, + // but saturate for total safety). + // temp.x = free + DxbcOpMin(DxbcDest::R(system_temp_depth_stencil_, 0b0001), + DxbcSrc::R(system_temp_depth_stencil_, DxbcSrc::kXXXX), + temp_x_src, true); // Convert the shader-generated depth to 24-bit, using temp.x as // temporary. - ROV_DepthTo24Bit(system_temp_rov_depth_stencil_, 0, - system_temp_rov_depth_stencil_, 0, temp, 0); + ROV_DepthTo24Bit(system_temp_depth_stencil_, 0, + system_temp_depth_stencil_, 0, temp, 0); } else { // Load the first sample's Z*W and W to temp.xy - need this regardless // of coverage for polygon offset. 
@@ -529,14 +556,14 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { } // Get if the current sample is covered to temp.w. - // temp.x = first sample's viewport space Z or 24-bit oDepth + // temp.x = first sample's viewport space Z if not writing to oDepth // temp.y = polygon offset if not writing to oDepth // temp.z = viewport maximum depth if not writing to oDepth // temp.w = coverage of the current sample DxbcOpAnd(temp_w_dest, DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kXXXX), DxbcSrc::LU(1 << i)); // Check if the current sample is covered. Release 1 VGPR. - // temp.x = first sample's viewport space Z or 24-bit oDepth + // temp.x = first sample's viewport space Z if not writing to oDepth // temp.y = polygon offset if not writing to oDepth // temp.z = viewport maximum depth if not writing to oDepth // temp.w = free @@ -546,7 +573,7 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { // Copy the 24-bit depth common to all samples to sample_depth_stencil. // temp.x = shader-generated 24-bit depth DxbcOpMov(sample_depth_stencil_dest, - DxbcSrc::R(system_temp_rov_depth_stencil_, DxbcSrc::kXXXX)); + DxbcSrc::R(system_temp_depth_stencil_, DxbcSrc::kXXXX)); } else { if (i) { // Sample's depth precalculated for sample 0 (for slope-scaled depth @@ -1720,7 +1747,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs_AlphaToMask() { // Convert SSAA sample position to integer to temp.xy (not caring about the // resolution scale because it's not supported anywhere on the RTV output // path). 
- in_position_xy_used_ = true; + in_position_used_ |= 0b0011; DxbcOpFToU(DxbcDest::R(temp, 0b0011), DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition))); @@ -1913,6 +1940,139 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() { PopSystemTemp(2); } +void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() { + if (!DSV_IsWritingFloat24Depth()) { + return; + } + + uint32_t temp; + if (writes_depth()) { + // The depth is already written to system_temp_depth_stencil_.x and clamped + // to 0...1 with NaNs dropped (saturating in StoreResult); yzw are free. + temp = system_temp_depth_stencil_; + } else { + // Need a temporary variable; copy the sample's depth input to it and + // saturate it (in Direct3D 11, depth is clamped to the viewport bounds + // after the pixel shader, and SV_Position.z contains the unclamped depth, + // which may be outside the viewport's depth range if it's biased); though + // it will be clamped to the viewport bounds anyway, but to be able to make + // the assumption of it being clamped while working with the bit + // representation. + temp = PushSystemTemp(); + in_position_used_ |= 0b0100; + DxbcOpMov( + DxbcDest::R(temp, 0b0001), + DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition), DxbcSrc::kZZZZ), + true); + } + + DxbcDest temp_x_dest(DxbcDest::R(temp, 0b0001)); + DxbcSrc temp_x_src(DxbcSrc::R(temp, DxbcSrc::kXXXX)); + DxbcDest temp_y_dest(DxbcDest::R(temp, 0b0010)); + DxbcSrc temp_y_src(DxbcSrc::R(temp, DxbcSrc::kYYYY)); + + if (GetDxbcShaderModification().depth_stencil_mode == + Modification::DepthStencilMode::kFloat24Truncating) { + // Simplified conversion, always less than or equal to the original value - + // just drop the lower bits. + // The float32 exponent bias is 127. + // After saturating, the exponent range is -127...0. + // The smallest normalized 20e4 exponent is -14 - should drop 3 mantissa + // bits at -14 or above. 
+ // The smallest denormalized 20e4 number is -34 - should drop 23 mantissa + // bits at -34. + // Anything smaller than 2^-34 becomes 0. + DxbcDest truncate_dest(writes_depth() ? DxbcDest::ODepth() + : DxbcDest::ODepthLE()); + // Check if the number is representable as a float24 after truncation - the + // exponent is at least -34. + DxbcOpUGE(temp_y_dest, temp_x_src, DxbcSrc::LU(0x2E800000)); + DxbcOpIf(true, temp_y_src); + { + // Extract the biased float32 exponent to temp.y. + // temp.y = 113+ at exponent -14+. + // temp.y = 93 at exponent -34. + DxbcOpUBFE(temp_y_dest, DxbcSrc::LU(8), DxbcSrc::LU(23), temp_x_src); + // Convert exponent to the unclamped number of bits to truncate. + // 116 - 113 = 3. + // 116 - 93 = 23. + // temp.y = 3+ at exponent -14+. + // temp.y = 23 at exponent -34. + DxbcOpIAdd(temp_y_dest, DxbcSrc::LI(116), -temp_y_src); + // Clamp the truncated bit count to drop 3 bits of any normal number. + // Exponents below -34 are handled separately. + // temp.y = 3 at exponent -14. + // temp.y = 23 at exponent -34. + DxbcOpIMax(temp_y_dest, temp_y_src, DxbcSrc::LI(3)); + // Truncate the mantissa - fill the low bits with zeros. + DxbcOpBFI(truncate_dest, temp_y_src, DxbcSrc::LU(0), DxbcSrc::LU(0), + temp_x_src); + } + // The number is not representable as float24 after truncation - zero. + DxbcOpElse(); + DxbcOpMov(truncate_dest, DxbcSrc::LF(0.0f)); + // Close the non-zero result check. + DxbcOpEndIf(); + } else { + // Properly convert to 20e4, with rounding to the nearest even. + PreClampedDepthTo20e4(temp, 0, temp, 0, temp, 1); + // Convert back to float32. + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + // Unpack the exponent to temp.y. + DxbcOpUShR(temp_y_dest, temp_x_src, DxbcSrc::LU(20)); + // Unpack the mantissa to temp.x. + DxbcOpAnd(temp_x_dest, temp_x_src, DxbcSrc::LU(0xFFFFF)); + // Check if the number is denormalized. 
+ DxbcOpIf(false, temp_y_src); + { + // Check if the number is non-zero (if the mantissa isn't zero - the + // exponent is known to be zero at this point). + DxbcOpIf(true, temp_x_src); + { + // Normalize the mantissa. + // Note that HLSL firstbithigh(x) is compiled to DXBC like: + // `x ? 31 - firstbit_hi(x) : -1` + // (returns the index from the LSB, not the MSB, but -1 for zero too). + // temp.y = firstbit_hi(mantissa) + DxbcOpFirstBitHi(temp_y_dest, temp_x_src); + // temp.y = 20 - firstbithigh(mantissa) + // Or: + // temp.y = 20 - (31 - firstbit_hi(mantissa)) + DxbcOpIAdd(temp_y_dest, temp_y_src, DxbcSrc::LI(20 - 31)); + // mantissa = mantissa << (20 - firstbithigh(mantissa)) + // AND 0xFFFFF not needed after this - BFI will do it. + DxbcOpIShL(temp_x_dest, temp_x_src, temp_y_src); + // Get the normalized exponent. + // exponent = 1 - (20 - firstbithigh(mantissa)) + DxbcOpIAdd(temp_y_dest, DxbcSrc::LI(1), -temp_y_src); + } + // The number is zero. + DxbcOpElse(); + { + // Set the unbiased exponent to -112 for zero - 112 will be added later, + // resulting in zero float32. + DxbcOpMov(temp_y_dest, DxbcSrc::LI(-112)); + } + // Close the non-zero check. + DxbcOpEndIf(); + } + // Close the denormal check. + DxbcOpEndIf(); + // Bias the exponent and move it to the correct location in float32 to + // temp.y. + DxbcOpIMAd(temp_y_dest, temp_y_src, DxbcSrc::LI(1 << 23), + DxbcSrc::LI(112 << 23)); + // Combine the mantissa and the exponent into the result. + DxbcOpBFI(DxbcDest::ODepth(), DxbcSrc::LU(20), DxbcSrc::LU(3), temp_x_src, + temp_y_src); + } + + if (!writes_depth()) { + // Release temp. + PopSystemTemp(); + } +} + void DxbcShaderTranslator::CompletePixelShader_ROV_AlphaToMaskSample( uint32_t sample_index, float threshold_base, DxbcSrc threshold_offset, float threshold_offset_scale, uint32_t temp, uint32_t temp_component) { @@ -1957,7 +2117,7 @@ void DxbcShaderTranslator::CompletePixelShader_ROV_AlphaToMask() { // floating-point. 
With resolution scaling, still using host pixels, to // preserve the idea of dithering. // temp.x = alpha to coverage offset as float 0.0...3.0. - in_position_xy_used_ = true; + in_position_used_ |= 0b0011; DxbcOpFToU(DxbcDest::R(temp, 0b0011), DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition))); DxbcOpAnd(DxbcDest::R(temp, 0b0010), DxbcSrc::R(temp, DxbcSrc::kYYYY), @@ -2067,7 +2227,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { DxbcOpStoreUAVTyped( DxbcDest::U(uav_index_edram_, uint32_t(UAVRegister::kEdram)), DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kYYYY), 1, - DxbcSrc::R(system_temp_rov_depth_stencil_).Select(i)); + DxbcSrc::R(system_temp_depth_stencil_).Select(i)); } // Close the write check. DxbcOpEndIf(); @@ -3059,15 +3219,16 @@ void DxbcShaderTranslator::CompletePixelShader() { CompletePixelShader_WriteToROV(); } else { CompletePixelShader_WriteToRTVs(); + CompletePixelShader_DSV_DepthTo24Bit(); } } -void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp, - uint32_t d24_temp_component, - uint32_t d32_temp, - uint32_t d32_temp_component, - uint32_t temp_temp, - uint32_t temp_temp_component) { +void DxbcShaderTranslator::PreClampedDepthTo20e4(uint32_t d24_temp, + uint32_t d24_temp_component, + uint32_t d32_temp, + uint32_t d32_temp_component, + uint32_t temp_temp, + uint32_t temp_temp_component) { assert_true(temp_temp != d24_temp || temp_temp_component != d24_temp_component); assert_true(temp_temp != d32_temp || @@ -3079,68 +3240,83 @@ void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp, DxbcDest temp_dest(DxbcDest::R(temp_temp, 1 << temp_temp_component)); DxbcSrc temp_src(DxbcSrc::R(temp_temp).Select(temp_temp_component)); + // CFloat24 from d3dref9.dll. + // Assuming the depth is already clamped to [0, 2) (in all places, the depth + // is written with the saturate flag set). + + // Check if the number is too small to be represented as normalized 20e4. 
+ // temp = f32 < 2^-14 + DxbcOpULT(temp_dest, d32_src, DxbcSrc::LU(0x38800000)); + // Handle denormalized numbers separately. + DxbcOpIf(true, temp_src); + { + // temp = f32 >> 23 + DxbcOpUShR(temp_dest, d32_src, DxbcSrc::LU(23)); + // temp = 113 - (f32 >> 23) + DxbcOpIAdd(temp_dest, DxbcSrc::LI(113), -temp_src); + // Don't allow the shift to overflow, since in DXBC the lower 5 bits of the + // shift amount are used (otherwise 0 becomes 8). + // temp = min(113 - (f32 >> 23), 24) + DxbcOpUMin(temp_dest, temp_src, DxbcSrc::LU(24)); + // biased_f32 = (f32 & 0x7FFFFF) | 0x800000 + DxbcOpBFI(d24_dest, DxbcSrc::LU(9), DxbcSrc::LU(23), DxbcSrc::LU(1), + d32_src); + // biased_f32 = ((f32 & 0x7FFFFF) | 0x800000) >> min(113 - (f32 >> 23), 24) + DxbcOpUShR(d24_dest, d24_src, temp_src); + } + // Not denormalized? + DxbcOpElse(); + { + // Bias the exponent. + // biased_f32 = f32 + (-112 << 23) + // (left shift of a negative value is undefined behavior) + DxbcOpIAdd(d24_dest, d32_src, DxbcSrc::LU(0xC8000000u)); + } + // Close the denormal check. + DxbcOpEndIf(); + // Build the 20e4 number. + // temp = (biased_f32 >> 3) & 1 + DxbcOpUBFE(temp_dest, DxbcSrc::LU(1), DxbcSrc::LU(3), d24_src); + // f24 = biased_f32 + 3 + DxbcOpIAdd(d24_dest, d24_src, DxbcSrc::LU(3)); + // f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1) + DxbcOpIAdd(d24_dest, d24_src, temp_src); + // f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF + DxbcOpUBFE(d24_dest, DxbcSrc::LU(24), DxbcSrc::LU(3), d24_src); +} + +void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp, + uint32_t d24_temp_component, + uint32_t d32_temp, + uint32_t d32_temp_component, + uint32_t temp_temp, + uint32_t temp_temp_component) { + assert_true(temp_temp != d32_temp || + temp_temp_component != d32_temp_component); + // Source and destination may be the same. 
+ system_constants_used_ |= 1ull << kSysConst_Flags_Index; - DxbcOpAnd(temp_dest, + DxbcOpAnd(DxbcDest::R(temp_temp, 1 << temp_temp_component), DxbcSrc::CB(cbuffer_index_system_constants_, uint32_t(CbufferRegister::kSystemConstants), kSysConst_Flags_Vec) .Select(kSysConst_Flags_Comp), DxbcSrc::LU(kSysFlag_ROVDepthFloat24)); // Convert according to the format. - DxbcOpIf(true, temp_src); + DxbcOpIf(true, DxbcSrc::R(temp_temp).Select(temp_temp_component)); { - // 20e4 conversion, using 1 VGPR. - // CFloat24 from d3dref9.dll. - // Assuming the depth is already clamped to [0, 2) (in all places, the depth - // is written with the saturate flag set). - - // Check if the number is too small to be represented as normalized 20e4. - // temp = f32 < 2^-14 - DxbcOpULT(temp_dest, d32_src, DxbcSrc::LU(0x38800000)); - // Handle denormalized numbers separately. - DxbcOpIf(true, temp_src); - { - // temp = f32 >> 23 - DxbcOpUShR(temp_dest, d32_src, DxbcSrc::LU(23)); - // temp = 113 - (f32 >> 23) - DxbcOpIAdd(temp_dest, DxbcSrc::LI(113), -temp_src); - // Don't allow the shift to overflow, since in DXBC the lower 5 bits of - // the shift amount are used (otherwise 0 becomes 8). - // temp = min(113 - (f32 >> 23), 24) - DxbcOpUMin(temp_dest, temp_src, DxbcSrc::LU(24)); - // biased_f32 = (f32 & 0x7FFFFF) | 0x800000 - DxbcOpBFI(d24_dest, DxbcSrc::LU(9), DxbcSrc::LU(23), DxbcSrc::LU(1), - d32_src); - // biased_f32 = - // ((f32 & 0x7FFFFF) | 0x800000) >> min(113 - (f32 >> 23), 24) - DxbcOpUShR(d24_dest, d24_src, temp_src); - } - // Not denormalized? - DxbcOpElse(); - { - // Bias the exponent. - // biased_f32 = f32 + (-112 << 23) - // (left shift of a negative value is undefined behavior) - DxbcOpIAdd(d24_dest, d32_src, DxbcSrc::LU(0xC8000000u)); - } - // Close the denormal check. - DxbcOpEndIf(); - // Build the 20e4 number. 
- // temp = (biased_f32 >> 3) & 1 - DxbcOpUBFE(temp_dest, DxbcSrc::LU(1), DxbcSrc::LU(3), d24_src); - // f24 = biased_f32 + 3 - DxbcOpIAdd(d24_dest, d24_src, DxbcSrc::LU(3)); - // f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1) - DxbcOpIAdd(d24_dest, d24_src, temp_src); - // f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF - DxbcOpUBFE(d24_dest, DxbcSrc::LU(24), DxbcSrc::LU(3), d24_src); + // 20e4 conversion. + PreClampedDepthTo20e4(d24_temp, d24_temp_component, d32_temp, + d32_temp_component, temp_temp, temp_temp_component); } DxbcOpElse(); { // Unorm24 conversion. - + DxbcDest d24_dest(DxbcDest::R(d24_temp, 1 << d24_temp_component)); + DxbcSrc d24_src(DxbcSrc::R(d24_temp).Select(d24_temp_component)); // Multiply by float(0xFFFFFF). - DxbcOpMul(d24_dest, d32_src, DxbcSrc::LF(16777215.0f)); + DxbcOpMul(d24_dest, DxbcSrc::R(d32_temp).Select(d32_temp_component), + DxbcSrc::LF(16777215.0f)); // Round to the nearest even integer. This seems to be the correct way: // rounding towards zero gives 0xFF instead of 0x100 in clear shaders in, // for instance, Halo 3, but other clear shaders in it are also broken if diff --git a/src/xenia/gpu/gpu_flags.cc b/src/xenia/gpu/gpu_flags.cc index 5f73fd3c2..07eff0bc8 100644 --- a/src/xenia/gpu/gpu_flags.cc +++ b/src/xenia/gpu/gpu_flags.cc @@ -40,9 +40,63 @@ DEFINE_bool( "be fully covered when MSAA is used with fullscreen passes.", "GPU"); +DEFINE_string( + depth_float24_conversion, "", + "Method for converting 32-bit Z values to 20e4 floating point when using " + "host depth buffers without native 20e4 support (when not using rasterizer-" + "ordered views / fragment shader interlocks to perform depth testing " + "manually).\n" + "Use: [any, on_copy, truncate, round]\n" + " on_copy:\n" + " Do depth testing at host precision, converting when copying between " + "host depth buffers and the EDRAM buffer to support reinterpretation, " + "maintaining two copies, in both host and 20e4 formats, for reloading data " + "to 
host depth buffers when it wasn't overwritten.\n" + " + Highest performance, allows early depth test and writing.\n" + " + Host MSAA is possible with pixel-rate shading where supported.\n" + " - EDRAM > RAM > EDRAM depth buffer round trip done in certain games " + "(such as GTA IV) destroys precision irreparably, causing artifacts if " + "another rendering pass is done after the EDRAM reupload.\n" + " truncate:\n" + " Convert to 20e4 directly in pixel shaders, always rounding down.\n" + " + Good performance, conservative early depth test is possible.\n" + " + No precision loss when anything changes in the storage of the depth " + "buffer, EDRAM > RAM > EDRAM copying preserves precision.\n" + " - Rounding mode is incorrect, sometimes giving results smaller than " + "they should be - may cause inaccuracy especially in edge cases when the " + "game wants to write an exact value.\n" + " - Host MSAA is only possible at SSAA speed, with per-sample shading.\n" + " round:\n" + " Convert to 20e4 directly in pixel shaders, correctly rounding to the " + "nearest even.\n" + " + Highest accuracy.\n" + " - Significantly limited performance, early depth test is not possible.\n" + " - Host MSAA is only possible at SSAA speed, with per-sample shading.\n" + " Any other value:\n" + " Choose what is considered the most optimal (currently \"on_copy\").", + "GPU"); + DEFINE_int32(query_occlusion_fake_sample_count, 1000, "If set to -1 no sample counts are written, games may hang. Else, " "the sample count of every tile will be incremented on every " "EVENT_WRITE_ZPD by this number. 
Setting this to 0 means " "everything is reported as occluded.", "GPU"); + +namespace xe { +namespace gpu { +namespace flags { + +DepthFloat24Conversion GetDepthFloat24Conversion() { + if (cvars::depth_float24_conversion == "truncate") { + return DepthFloat24Conversion::kOnOutputTruncating; + } + if (cvars::depth_float24_conversion == "round") { + return DepthFloat24Conversion::kOnOutputRounding; + } + return DepthFloat24Conversion::kOnCopy; +} + +} // namespace flags +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/gpu_flags.h b/src/xenia/gpu/gpu_flags.h index 5ae64b76e..2405dc23c 100644 --- a/src/xenia/gpu/gpu_flags.h +++ b/src/xenia/gpu/gpu_flags.h @@ -22,6 +22,69 @@ DECLARE_bool(gpu_allow_invalid_fetch_constants); DECLARE_bool(half_pixel_offset); +DECLARE_string(depth_float24_conversion); + DECLARE_int32(query_occlusion_fake_sample_count); +namespace xe { +namespace gpu { +namespace flags { + +enum class DepthFloat24Conversion { + // Doing depth test at the host precision, converting to 20e4 to support + // reinterpretation, but keeping a separate EDRAM view containing depth values + // in the host format. When copying from the EDRAM buffer to host depth + // buffers, writing the stored host pixel if stored_f24 == to_f24(stored_host) + // (otherwise it was overwritten by something else, like clearing, or a color + // buffer; this is inexact though, and will incorrectly load pixels that were + // overwritten by something else in the EDRAM, but turned out to have the same + // value on the guest as before - an outdated host-precision value will be + // loaded in these cases instead). + // + // EDRAM > RAM, then reusing the EDRAM region for something else > EDRAM round + // trip destroys precision beyond repair. + // + // Full host early Z and MSAA with pixel-rate shading are supported. 
+ kOnCopy, + // Converting the depth to the closest host value representable exactly as a + // 20e4 float in pixel shaders, to support invariance in cases when the guest + // reuploads a previously resolved depth buffer to the EDRAM, rounding towards + // zero (which contradicts the rounding used by the Direct3D 9 reference + // rasterizer, but allows less-than-or-equal pixel shader depth output to be + // used to preserve most of early Z culling when the game is using reversed + // depth, which is the usual way of doing depth testing on the Xbox 360 and of + // utilizing the advantages of a floating-point encoding). + // + // With MSAA, pixel shaders must run at sample frequency - otherwise, if the + // depth is the same for the entire pixel, intersections of polygons cannot be + // antialiased. + // + // Important usage note: When using this mode, bounds of the fixed-function + // viewport must be converted to and back from float24 too (preferably using + // correct rounding to the nearest even, to reduce the error already caused by + // truncation rather than to amplify it). This ensures that clamping to the + // viewport bounds, which happens after the pixel shader even if it overwrites + // the resulting depth, is never done to a value not representable as float24 + // (for example, if the minimum Z is a number too small to be represented as + // float24, but not zero, it won't be possible to write what should become + // 0x000000 to the depth buffer). Note that this may add some error to the + // depth values from the rasterizer; however, modifying Z in the vertex shader + // to make interpolated depth values would cause clipping to be done to + // different bounds, which may be more undesirable, especially in cases when Z + // is explicitly set to a value like 0 or W (in such cases, the adjusted + // polygon may go outside 0...W in clip space and disappear). 
+ kOnOutputTruncating, + // Similar to kOnOutputTruncating, but rounding to the nearest even, more + // correctly, however, because the resulting depth can be bigger than the + // original host value, early depth testing can't be used at all. Same + // viewport usage rules apply. + kOnOutputRounding, +}; + +DepthFloat24Conversion GetDepthFloat24Conversion(); + +} // namespace flags +} // namespace gpu +} // namespace xe + #endif // XENIA_GPU_GPU_FLAGS_H_ diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index 04bc8024b..de327869c 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -276,8 +276,7 @@ void GraphicsSystem::ClearCaches() { } void GraphicsSystem::InitializeShaderStorage( - const std::filesystem::path& storage_root, uint32_t title_id, - bool blocking) { + const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) { if (!cvars::store_shaders) { return; } @@ -285,21 +284,18 @@ void GraphicsSystem::InitializeShaderStorage( if (command_processor_->is_paused()) { // Safe to run on any thread while the command processor is paused, no // race condition. 
- command_processor_->InitializeShaderStorage(storage_root, title_id, true); + command_processor_->InitializeShaderStorage(cache_root, title_id, true); } else { xe::threading::Fence fence; - command_processor_->CallInThread( - [this, storage_root, title_id, &fence]() { - command_processor_->InitializeShaderStorage(storage_root, title_id, - true); - fence.Signal(); - }); + command_processor_->CallInThread([this, cache_root, title_id, &fence]() { + command_processor_->InitializeShaderStorage(cache_root, title_id, true); + fence.Signal(); + }); fence.Wait(); } } else { - command_processor_->CallInThread([this, storage_root, title_id]() { - command_processor_->InitializeShaderStorage(storage_root, title_id, - false); + command_processor_->CallInThread([this, cache_root, title_id]() { + command_processor_->InitializeShaderStorage(cache_root, title_id, false); }); } } diff --git a/src/xenia/gpu/graphics_system.h b/src/xenia/gpu/graphics_system.h index 47a4d3f7b..148206af2 100644 --- a/src/xenia/gpu/graphics_system.h +++ b/src/xenia/gpu/graphics_system.h @@ -63,7 +63,7 @@ class GraphicsSystem { virtual void ClearCaches(); - void InitializeShaderStorage(const std::filesystem::path& storage_root, + void InitializeShaderStorage(const std::filesystem::path& cache_root, uint32_t title_id, bool blocking); void RequestFrameTrace(); diff --git a/src/xenia/gpu/shader.cc b/src/xenia/gpu/shader.cc index 931b728da..6df03fb81 100644 --- a/src/xenia/gpu/shader.cc +++ b/src/xenia/gpu/shader.cc @@ -31,9 +31,13 @@ Shader::Shader(xenos::ShaderType shader_type, uint64_t ucode_data_hash, xe::copy_and_swap(ucode_data_.data(), ucode_dwords, ucode_dword_count); } -Shader::~Shader() = default; +Shader::~Shader() { + for (auto it : translations_) { + delete it.second; + } +} -std::string Shader::GetTranslatedBinaryString() const { +std::string Shader::Translation::GetTranslatedBinaryString() const { std::string result; result.resize(translated_binary_.size()); 
std::memcpy(const_cast(result.data()), translated_binary_.data(), @@ -41,36 +45,24 @@ std::string Shader::GetTranslatedBinaryString() const { return result; } -std::pair Shader::Dump( +std::filesystem::path Shader::Translation::Dump( const std::filesystem::path& base_path, const char* path_prefix) { + std::filesystem::path path = base_path; // Ensure target path exists. - auto target_path = base_path; - if (!target_path.empty()) { - target_path = std::filesystem::absolute(target_path); - std::filesystem::create_directories(target_path); + if (!path.empty()) { + path = std::filesystem::absolute(path); + std::filesystem::create_directories(path); } - - auto base_name = - fmt::format("shader_{}_{:016X}", path_prefix, ucode_data_hash_); - - std::string txt_name, bin_name; - if (shader_type_ == xenos::ShaderType::kVertex) { - txt_name = base_name + ".vert"; - bin_name = base_name + ".bin.vert"; - } else { - txt_name = base_name + ".frag"; - bin_name = base_name + ".bin.frag"; - } - - std::filesystem::path txt_path, bin_path; - txt_path = base_path / txt_name; - bin_path = base_path / bin_name; - - FILE* f = filesystem::OpenFile(txt_path, "wb"); + path = path / + fmt::format( + "shader_{:016X}_{:08X}.{}.{}", shader().ucode_data_hash(), + modification(), path_prefix, + shader().type() == xenos::ShaderType::kVertex ? 
"vert" : "frag"); + FILE* f = filesystem::OpenFile(path, "wb"); if (f) { fwrite(translated_binary_.data(), 1, translated_binary_.size(), f); fprintf(f, "\n\n"); - auto ucode_disasm_ptr = ucode_disassembly().c_str(); + auto ucode_disasm_ptr = shader().ucode_disassembly().c_str(); while (*ucode_disasm_ptr) { auto line_end = std::strchr(ucode_disasm_ptr, '\n'); fprintf(f, "// "); @@ -83,14 +75,58 @@ std::pair Shader::Dump( } fclose(f); } + return std::move(path); +} - f = filesystem::OpenFile(bin_path, "wb"); +Shader::Translation* Shader::GetOrCreateTranslation(uint32_t modification, + bool* is_new) { + auto it = translations_.find(modification); + if (it != translations_.end()) { + if (is_new) { + *is_new = false; + } + return it->second; + } + Translation* translation = CreateTranslationInstance(modification); + translations_.emplace(modification, translation); + if (is_new) { + *is_new = true; + } + return translation; +} + +void Shader::DestroyTranslation(uint32_t modification) { + auto it = translations_.find(modification); + if (it == translations_.end()) { + return; + } + delete it->second; + translations_.erase(it); +} + +std::filesystem::path Shader::DumpUcodeBinary( + const std::filesystem::path& base_path) { + // Ensure target path exists. + std::filesystem::path path = base_path; + if (!path.empty()) { + path = std::filesystem::absolute(path); + std::filesystem::create_directories(path); + } + path = path / + fmt::format("shader_{:016X}.ucode.bin.{}", ucode_data_hash(), + type() == xenos::ShaderType::kVertex ? "vert" : "frag"); + + FILE* f = filesystem::OpenFile(path, "wb"); if (f) { - fwrite(ucode_data_.data(), 4, ucode_data_.size(), f); + fwrite(ucode_data().data(), 4, ucode_data().size(), f); fclose(f); } + return std::move(path); +} - return {std::move(txt_path), std::move(bin_path)}; +Shader::Translation* Shader::CreateTranslationInstance(uint32_t modification) { + // Default implementation for simple cases like ucode disassembly. 
+ return new Translation(*this, modification); } } // namespace gpu diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 23998c307..e533ba9b8 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -11,8 +11,12 @@ #define XENIA_GPU_SHADER_H_ #include +#include +#include #include #include +#include +#include #include #include "xenia/base/math.h" @@ -591,6 +595,8 @@ struct ParsedAluInstruction { class Shader { public: + // Type of the vertex shader in a D3D11-like rendering pipeline - shader + // interface depends on it, so it must be known at translation time. // If values are changed, INVALIDATE SHADER STORAGES (increase their version // constexpr) where those are stored! And check bit count where this is // packed. This is : uint32_t for simplicity of packing in bit fields. @@ -603,6 +609,8 @@ class Shader { kQuadDomainCPIndexed, kQuadDomainPatchIndexed, }; + // For packing HostVertexShaderType in bit fields. + static constexpr uint32_t kHostVertexShaderTypeBitCount = 3; struct Error { bool is_fatal = false; @@ -683,6 +691,67 @@ class Shader { } }; + class Translation { + public: + virtual ~Translation() {} + + Shader& shader() const { return shader_; } + + // Translator-specific modification bits. + uint32_t modification() const { return modification_; } + + // True if the shader was translated and prepared without error. + bool is_valid() const { return is_valid_; } + + // True if the shader has already been translated. + bool is_translated() const { return is_translated_; } + + // Errors that occurred during translation. + const std::vector& errors() const { return errors_; } + + // Translated shader binary (or text). + const std::vector& translated_binary() const { + return translated_binary_; + } + + // Gets the translated shader binary as a string. + // This is only valid if it is actually text. + std::string GetTranslatedBinaryString() const; + + // Disassembly of the translated shader from the host graphics layer. 
+ // May be empty if the host does not support disassembly. + const std::string& host_disassembly() const { return host_disassembly_; } + + // In case disassembly depends on the GPU backend, for setting it + // externally. + void set_host_disassembly(std::string disassembly) { + host_disassembly_ = std::move(disassembly); + } + + // For dumping after translation. Dumps the shader's disassembled microcode, + // translated code, and, if available, translated disassembly, to a file in + // the given path based on ucode hash. Returns the name of the written file. + std::filesystem::path Dump(const std::filesystem::path& base_path, + const char* path_prefix); + + protected: + Translation(Shader& shader, uint32_t modification) + : shader_(shader), modification_(modification) {} + + private: + friend class Shader; + friend class ShaderTranslator; + + Shader& shader_; + uint32_t modification_; + + bool is_valid_ = false; + bool is_translated_ = false; + std::vector errors_; + std::vector translated_binary_; + std::string host_disassembly_; + }; + Shader(xenos::ShaderType shader_type, uint64_t ucode_data_hash, const uint32_t* ucode_dwords, size_t ucode_dword_count); virtual ~Shader(); @@ -690,19 +759,30 @@ class Shader { // Whether the shader is identified as a vertex or pixel shader. xenos::ShaderType type() const { return shader_type_; } - // If this is a vertex shader, and it has been translated, type of the shader - // in a D3D11-like rendering pipeline - shader interface depends on in, so it - // must be known at translation time. - HostVertexShaderType host_vertex_shader_type() const { - return host_vertex_shader_type_; - } - // Microcode dwords in host endianness. 
const std::vector& ucode_data() const { return ucode_data_; } uint64_t ucode_data_hash() const { return ucode_data_hash_; } const uint32_t* ucode_dwords() const { return ucode_data_.data(); } size_t ucode_dword_count() const { return ucode_data_.size(); } + // Host translations with the specified modification bits. Not thread-safe + // with respect to translation creation/destruction. + const std::unordered_map& translations() const { + return translations_; + } + Translation* GetTranslation(uint32_t modification) const { + auto it = translations_.find(modification); + if (it != translations_.cend()) { + return it->second; + } + return nullptr; + } + Translation* GetOrCreateTranslation(uint32_t modification, + bool* is_new = nullptr); + // For shader storage loading, to remove a modification in case of translation + // failure. Not thread-safe. + void DestroyTranslation(uint32_t modification); + // All vertex bindings used in the shader. // Valid for vertex shaders only. const std::vector& vertex_bindings() const { @@ -733,73 +813,55 @@ class Shader { // True if the shader overrides the pixel depth. bool writes_depth() const { return writes_depth_; } - // True if Xenia can automatically enable early depth/stencil for the pixel - // shader when RB_DEPTHCONTROL EARLY_Z_ENABLE is not set, provided alpha - // testing and alpha to coverage are disabled. - bool implicit_early_z_allowed() const { return implicit_early_z_allowed_; } - - // True if the shader was translated and prepared without error. - bool is_valid() const { return is_valid_; } - - // True if the shader has already been translated. - bool is_translated() const { return is_translated_; } - - // Errors that occurred during translation. - const std::vector& errors() const { return errors_; } + // True if the current shader has any `kill` instructions. + bool kills_pixels() const { return kills_pixels_; } // Microcode disassembly in D3D format. 
const std::string& ucode_disassembly() const { return ucode_disassembly_; } - // Translated shader binary (or text). - const std::vector& translated_binary() const { - return translated_binary_; + // An externally managed identifier of the shader storage the microcode of the + // shader was last written to, or was loaded from, to only write the shader + // microcode to the storage once. UINT32_MAX by default. + uint32_t ucode_storage_index() const { return ucode_storage_index_; } + void set_ucode_storage_index(uint32_t storage_index) { + ucode_storage_index_ = storage_index; } - // Gets the translated shader binary as a string. - // This is only valid if it is actually text. - std::string GetTranslatedBinaryString() const; - - // Disassembly of the translated from the host graphics layer. - // May be empty if the host does not support disassembly. - const std::string& host_disassembly() const { return host_disassembly_; } - // A lot of errors that occurred during preparation of the host shader. - const std::string& host_error_log() const { return host_error_log_; } - // Host binary that can be saved and reused across runs. - // May be empty if the host does not support saving binaries. - const std::vector& host_binary() const { return host_binary_; } - - // Dumps the shader to a file in the given path based on ucode hash. - // Both the ucode binary and disassembled and translated shader will be - // written. - // Returns the filename of the shader and the binary. - std::pair Dump( - const std::filesystem::path& base_path, const char* path_prefix); + // Dumps the shader's microcode binary to a file in the given path based on + // ucode hash. Returns the name of the written file. Can be called at any + // time, doesn't require the shader to be translated. 
+ std::filesystem::path DumpUcodeBinary(const std::filesystem::path& base_path); protected: friend class ShaderTranslator; + virtual Translation* CreateTranslationInstance(uint32_t modification); + xenos::ShaderType shader_type_; - HostVertexShaderType host_vertex_shader_type_ = HostVertexShaderType::kVertex; std::vector ucode_data_; uint64_t ucode_data_hash_; + // Modification bits -> translation. + std::unordered_map translations_; + + // Whether setup of the post-translation parameters (listed below, plus those + // specific to the implementation) has been initiated, by any thread. If + // translation is performed on multiple threads, only one thread must be + // setting this up (other threads would write the same data anyway). + std::atomic_flag post_translation_info_set_up_ = ATOMIC_FLAG_INIT; + + // Initialized after the first successful translation (these don't depend on + // the host-side modification bits). + std::string ucode_disassembly_; std::vector vertex_bindings_; std::vector texture_bindings_; ConstantRegisterMap constant_register_map_ = {0}; bool writes_color_targets_[4] = {false, false, false, false}; bool writes_depth_ = false; - bool implicit_early_z_allowed_ = true; + bool kills_pixels_ = false; std::vector memexport_stream_constants_; - bool is_valid_ = false; - bool is_translated_ = false; - std::vector errors_; - - std::string ucode_disassembly_; - std::vector translated_binary_; - std::string host_disassembly_; - std::string host_error_log_; - std::vector host_binary_; + uint32_t ucode_storage_index_ = UINT32_MAX; }; } // namespace gpu diff --git a/src/xenia/gpu/shader_compiler_main.cc b/src/xenia/gpu/shader_compiler_main.cc index f5392216b..a9a744955 100644 --- a/src/xenia/gpu/shader_compiler_main.cc +++ b/src/xenia/gpu/shader_compiler_main.cc @@ -140,11 +140,15 @@ int shader_compiler_main(const std::vector& args) { Shader::HostVertexShaderType::kQuadDomainPatchIndexed; } } + uint32_t modification = + 
translator->GetDefaultModification(shader_type, host_vertex_shader_type); - translator->Translate(shader.get(), host_vertex_shader_type); + Shader::Translation* translation = + shader->GetOrCreateTranslation(modification); + translator->Translate(*translation); - const void* source_data = shader->translated_binary().data(); - size_t source_data_size = shader->translated_binary().size(); + const void* source_data = translation->translated_binary().data(); + size_t source_data_size = translation->translated_binary().size(); std::unique_ptr spirv_disasm_result; if (cvars::shader_output_type == "spirvtext") { diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index 304acf602..6d79e82c2 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -1,4 +1,3 @@ -#include "shader_translator.h" /** ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * @@ -14,6 +13,7 @@ #include #include +#include "xenia/base/assert.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" @@ -46,7 +46,9 @@ ShaderTranslator::ShaderTranslator() = default; ShaderTranslator::~ShaderTranslator() = default; -void ShaderTranslator::Reset() { +void ShaderTranslator::Reset(xenos::ShaderType shader_type) { + shader_type_ = shader_type; + modification_ = GetDefaultModification(shader_type); errors_.clear(); ucode_disasm_buffer_.Reset(); ucode_disasm_line_number_ = 0; @@ -64,37 +66,37 @@ void ShaderTranslator::Reset() { writes_color_targets_[i] = false; } writes_depth_ = false; - implicit_early_z_allowed_ = true; + kills_pixels_ = false; memexport_alloc_count_ = 0; memexport_eA_written_ = 0; std::memset(&memexport_eM_written_, 0, sizeof(memexport_eM_written_)); memexport_stream_constants_.clear(); } -bool ShaderTranslator::Translate( - Shader* shader, reg::SQ_PROGRAM_CNTL cntl, - Shader::HostVertexShaderType host_vertex_shader_type) { - Reset(); - uint32_t 
cntl_num_reg = shader->type() == xenos::ShaderType::kVertex +bool ShaderTranslator::Translate(Shader::Translation& translation, + reg::SQ_PROGRAM_CNTL cntl) { + xenos::ShaderType shader_type = translation.shader().type(); + Reset(shader_type); + uint32_t cntl_num_reg = shader_type == xenos::ShaderType::kVertex ? cntl.vs_num_reg : cntl.ps_num_reg; register_count_ = (cntl_num_reg & 0x80) ? 0 : (cntl_num_reg + 1); - return TranslateInternal(shader, host_vertex_shader_type); + return TranslateInternal(translation); } -bool ShaderTranslator::Translate( - Shader* shader, Shader::HostVertexShaderType host_vertex_shader_type) { - Reset(); - return TranslateInternal(shader, host_vertex_shader_type); +bool ShaderTranslator::Translate(Shader::Translation& translation) { + Reset(translation.shader().type()); + return TranslateInternal(translation); } -bool ShaderTranslator::TranslateInternal( - Shader* shader, Shader::HostVertexShaderType host_vertex_shader_type) { - shader_type_ = shader->type(); - host_vertex_shader_type_ = host_vertex_shader_type; - ucode_dwords_ = shader->ucode_dwords(); - ucode_dword_count_ = shader->ucode_dword_count(); +bool ShaderTranslator::TranslateInternal(Shader::Translation& translation) { + Shader& shader = translation.shader(); + assert_true(shader_type_ == shader.type()); + shader_type_ = shader.type(); + ucode_dwords_ = shader.ucode_dwords(); + ucode_dword_count_ = shader.ucode_dword_count(); + modification_ = translation.modification(); // Control flow instructions come paired in blocks of 3 dwords and all are // listed at the top of the ucode. @@ -150,12 +152,6 @@ bool ShaderTranslator::TranslateInternal( if (memexport_eA_written_ == 0) { memexport_stream_constants_.clear(); } - if (!memexport_stream_constants_.empty()) { - // TODO(Triang3l): Investigate what happens to memexport when the pixel - // fails the depth/stencil test, but in Direct3D 11 UAV writes disable early - // depth/stencil. 
- implicit_early_z_allowed_ = false; - } StartTranslation(); @@ -192,35 +188,44 @@ bool ShaderTranslator::TranslateInternal( ++cf_index; } - shader->errors_ = std::move(errors_); - shader->translated_binary_ = CompleteTranslation(); - shader->ucode_disassembly_ = ucode_disasm_buffer_.to_string(); - shader->host_vertex_shader_type_ = host_vertex_shader_type_; - shader->vertex_bindings_ = std::move(vertex_bindings_); - shader->texture_bindings_ = std::move(texture_bindings_); - shader->constant_register_map_ = std::move(constant_register_map_); - for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) { - shader->writes_color_targets_[i] = writes_color_targets_[i]; - } - shader->writes_depth_ = writes_depth_; - shader->implicit_early_z_allowed_ = implicit_early_z_allowed_; - shader->memexport_stream_constants_.clear(); - for (uint32_t memexport_stream_constant : memexport_stream_constants_) { - shader->memexport_stream_constants_.push_back(memexport_stream_constant); - } + translation.errors_ = std::move(errors_); + translation.translated_binary_ = CompleteTranslation(); + translation.is_translated_ = true; - shader->is_valid_ = true; - shader->is_translated_ = true; - for (const auto& error : shader->errors_) { + bool is_valid = true; + for (const auto& error : translation.errors_) { if (error.is_fatal) { - shader->is_valid_ = false; + is_valid = false; break; } } + translation.is_valid_ = is_valid; - PostTranslation(shader); + // Setup info that doesn't depend on the modification only once. 
+ bool setup_shader_post_translation_info = + is_valid && !shader.post_translation_info_set_up_.test_and_set(); + if (setup_shader_post_translation_info) { + shader.ucode_disassembly_ = ucode_disasm_buffer_.to_string(); + shader.vertex_bindings_ = std::move(vertex_bindings_); + shader.texture_bindings_ = std::move(texture_bindings_); + shader.constant_register_map_ = std::move(constant_register_map_); + for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) { + shader.writes_color_targets_[i] = writes_color_targets_[i]; + } + shader.writes_depth_ = writes_depth_; + shader.kills_pixels_ = kills_pixels_; + shader.memexport_stream_constants_.clear(); + shader.memexport_stream_constants_.reserve( + memexport_stream_constants_.size()); + shader.memexport_stream_constants_.insert( + shader.memexport_stream_constants_.cend(), + memexport_stream_constants_.cbegin(), + memexport_stream_constants_.cend()); + } + PostTranslation(translation, setup_shader_post_translation_info); - return shader->is_valid_; + // In case is_valid_ is modified by PostTranslation, reload. 
+ return translation.is_valid_; } void ShaderTranslator::MarkUcodeInstruction(uint32_t dword_offset) { @@ -343,14 +348,9 @@ void ShaderTranslator::GatherInstructionInformation( ParsedAluInstruction instr; ParseAluInstruction(op, instr); - const auto& vector_opcode_info = - alu_vector_opcode_infos_[uint32_t(op.vector_opcode())]; - implicit_early_z_allowed_ &= - !vector_opcode_info.disable_implicit_early_z; - const auto& scalar_opcode_info = - alu_scalar_opcode_infos_[uint32_t(op.scalar_opcode())]; - implicit_early_z_allowed_ &= - !scalar_opcode_info.disable_implicit_early_z; + kills_pixels_ = kills_pixels_ || + ucode::AluVectorOpcodeIsKill(op.vector_opcode()) || + ucode::AluScalarOpcodeIsKill(op.scalar_opcode()); if (instr.vector_and_constant_result.storage_target != InstructionStorageTarget::kRegister || @@ -408,7 +408,6 @@ void ShaderTranslator::GatherInstructionInformation( break; case InstructionStorageTarget::kDepth: writes_depth_ = true; - implicit_early_z_allowed_ = false; break; default: break; @@ -1082,91 +1081,91 @@ uint32_t ParsedTextureFetchInstruction::GetNonZeroResultComponents() const { const ShaderTranslator::AluOpcodeInfo ShaderTranslator::alu_vector_opcode_infos_[0x20] = { - {"add", 2, 4, false}, // 0 - {"mul", 2, 4, false}, // 1 - {"max", 2, 4, false}, // 2 - {"min", 2, 4, false}, // 3 - {"seq", 2, 4, false}, // 4 - {"sgt", 2, 4, false}, // 5 - {"sge", 2, 4, false}, // 6 - {"sne", 2, 4, false}, // 7 - {"frc", 1, 4, false}, // 8 - {"trunc", 1, 4, false}, // 9 - {"floor", 1, 4, false}, // 10 - {"mad", 3, 4, false}, // 11 - {"cndeq", 3, 4, false}, // 12 - {"cndge", 3, 4, false}, // 13 - {"cndgt", 3, 4, false}, // 14 - {"dp4", 2, 4, false}, // 15 - {"dp3", 2, 4, false}, // 16 - {"dp2add", 3, 4, false}, // 17 - {"cube", 2, 4, false}, // 18 - {"max4", 1, 4, false}, // 19 - {"setp_eq_push", 2, 4, false}, // 20 - {"setp_ne_push", 2, 4, false}, // 21 - {"setp_gt_push", 2, 4, false}, // 22 - {"setp_ge_push", 2, 4, false}, // 23 - {"kill_eq", 2, 4, true}, // 
24 - {"kill_gt", 2, 4, true}, // 25 - {"kill_ge", 2, 4, true}, // 26 - {"kill_ne", 2, 4, true}, // 27 - {"dst", 2, 4, false}, // 28 - {"maxa", 2, 4, false}, // 29 + {"add", 2, 4}, // 0 + {"mul", 2, 4}, // 1 + {"max", 2, 4}, // 2 + {"min", 2, 4}, // 3 + {"seq", 2, 4}, // 4 + {"sgt", 2, 4}, // 5 + {"sge", 2, 4}, // 6 + {"sne", 2, 4}, // 7 + {"frc", 1, 4}, // 8 + {"trunc", 1, 4}, // 9 + {"floor", 1, 4}, // 10 + {"mad", 3, 4}, // 11 + {"cndeq", 3, 4}, // 12 + {"cndge", 3, 4}, // 13 + {"cndgt", 3, 4}, // 14 + {"dp4", 2, 4}, // 15 + {"dp3", 2, 4}, // 16 + {"dp2add", 3, 4}, // 17 + {"cube", 2, 4}, // 18 + {"max4", 1, 4}, // 19 + {"setp_eq_push", 2, 4}, // 20 + {"setp_ne_push", 2, 4}, // 21 + {"setp_gt_push", 2, 4}, // 22 + {"setp_ge_push", 2, 4}, // 23 + {"kill_eq", 2, 4}, // 24 + {"kill_gt", 2, 4}, // 25 + {"kill_ge", 2, 4}, // 26 + {"kill_ne", 2, 4}, // 27 + {"dst", 2, 4}, // 28 + {"maxa", 2, 4}, // 29 }; const ShaderTranslator::AluOpcodeInfo ShaderTranslator::alu_scalar_opcode_infos_[0x40] = { - {"adds", 1, 2, false}, // 0 - {"adds_prev", 1, 1, false}, // 1 - {"muls", 1, 2, false}, // 2 - {"muls_prev", 1, 1, false}, // 3 - {"muls_prev2", 1, 2, false}, // 4 - {"maxs", 1, 2, false}, // 5 - {"mins", 1, 2, false}, // 6 - {"seqs", 1, 1, false}, // 7 - {"sgts", 1, 1, false}, // 8 - {"sges", 1, 1, false}, // 9 - {"snes", 1, 1, false}, // 10 - {"frcs", 1, 1, false}, // 11 - {"truncs", 1, 1, false}, // 12 - {"floors", 1, 1, false}, // 13 - {"exp", 1, 1, false}, // 14 - {"logc", 1, 1, false}, // 15 - {"log", 1, 1, false}, // 16 - {"rcpc", 1, 1, false}, // 17 - {"rcpf", 1, 1, false}, // 18 - {"rcp", 1, 1, false}, // 19 - {"rsqc", 1, 1, false}, // 20 - {"rsqf", 1, 1, false}, // 21 - {"rsq", 1, 1, false}, // 22 - {"maxas", 1, 2, false}, // 23 - {"maxasf", 1, 2, false}, // 24 - {"subs", 1, 2, false}, // 25 - {"subs_prev", 1, 1, false}, // 26 - {"setp_eq", 1, 1, false}, // 27 - {"setp_ne", 1, 1, false}, // 28 - {"setp_gt", 1, 1, false}, // 29 - {"setp_ge", 1, 1, false}, // 30 - 
{"setp_inv", 1, 1, false}, // 31 - {"setp_pop", 1, 1, false}, // 32 - {"setp_clr", 0, 0, false}, // 33 - {"setp_rstr", 1, 1, false}, // 34 - {"kills_eq", 1, 1, true}, // 35 - {"kills_gt", 1, 1, true}, // 36 - {"kills_ge", 1, 1, true}, // 37 - {"kills_ne", 1, 1, true}, // 38 - {"kills_one", 1, 1, true}, // 39 - {"sqrt", 1, 1, false}, // 40 - {"UNKNOWN", 0, 0, false}, // 41 - {"mulsc", 2, 1, false}, // 42 - {"mulsc", 2, 1, false}, // 43 - {"addsc", 2, 1, false}, // 44 - {"addsc", 2, 1, false}, // 45 - {"subsc", 2, 1, false}, // 46 - {"subsc", 2, 1, false}, // 47 - {"sin", 1, 1, false}, // 48 - {"cos", 1, 1, false}, // 49 - {"retain_prev", 0, 0, false}, // 50 + {"adds", 1, 2}, // 0 + {"adds_prev", 1, 1}, // 1 + {"muls", 1, 2}, // 2 + {"muls_prev", 1, 1}, // 3 + {"muls_prev2", 1, 2}, // 4 + {"maxs", 1, 2}, // 5 + {"mins", 1, 2}, // 6 + {"seqs", 1, 1}, // 7 + {"sgts", 1, 1}, // 8 + {"sges", 1, 1}, // 9 + {"snes", 1, 1}, // 10 + {"frcs", 1, 1}, // 11 + {"truncs", 1, 1}, // 12 + {"floors", 1, 1}, // 13 + {"exp", 1, 1}, // 14 + {"logc", 1, 1}, // 15 + {"log", 1, 1}, // 16 + {"rcpc", 1, 1}, // 17 + {"rcpf", 1, 1}, // 18 + {"rcp", 1, 1}, // 19 + {"rsqc", 1, 1}, // 20 + {"rsqf", 1, 1}, // 21 + {"rsq", 1, 1}, // 22 + {"maxas", 1, 2}, // 23 + {"maxasf", 1, 2}, // 24 + {"subs", 1, 2}, // 25 + {"subs_prev", 1, 1}, // 26 + {"setp_eq", 1, 1}, // 27 + {"setp_ne", 1, 1}, // 28 + {"setp_gt", 1, 1}, // 29 + {"setp_ge", 1, 1}, // 30 + {"setp_inv", 1, 1}, // 31 + {"setp_pop", 1, 1}, // 32 + {"setp_clr", 0, 0}, // 33 + {"setp_rstr", 1, 1}, // 34 + {"kills_eq", 1, 1}, // 35 + {"kills_gt", 1, 1}, // 36 + {"kills_ge", 1, 1}, // 37 + {"kills_ne", 1, 1}, // 38 + {"kills_one", 1, 1}, // 39 + {"sqrt", 1, 1}, // 40 + {"UNKNOWN", 0, 0}, // 41 + {"mulsc", 2, 1}, // 42 + {"mulsc", 2, 1}, // 43 + {"addsc", 2, 1}, // 44 + {"addsc", 2, 1}, // 45 + {"subsc", 2, 1}, // 46 + {"subsc", 2, 1}, // 47 + {"sin", 1, 1}, // 48 + {"cos", 1, 1}, // 49 + {"retain_prev", 0, 0}, // 50 }; void 
ShaderTranslator::TranslateAluInstruction(const AluInstruction& op) { diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h index 3d4fa208d..e1c97808a 100644 --- a/src/xenia/gpu/shader_translator.h +++ b/src/xenia/gpu/shader_translator.h @@ -29,18 +29,27 @@ class ShaderTranslator { public: virtual ~ShaderTranslator(); - bool Translate(Shader* shader, reg::SQ_PROGRAM_CNTL cntl, - Shader::HostVertexShaderType host_vertex_shader_type = - Shader::HostVertexShaderType::kVertex); - bool Translate(Shader* shader, - Shader::HostVertexShaderType host_vertex_shader_type = - Shader::HostVertexShaderType::kVertex); + virtual uint32_t GetDefaultModification( + xenos::ShaderType shader_type, + Shader::HostVertexShaderType host_vertex_shader_type = + Shader::HostVertexShaderType::kVertex) const { + return 0; + } + + bool Translate(Shader::Translation& translation, reg::SQ_PROGRAM_CNTL cntl); + bool Translate(Shader::Translation& translation); protected: ShaderTranslator(); // Resets translator state before beginning translation. - virtual void Reset(); + // shader_type is passed here so translator implementations can generate + // special fixed shaders for internal use, and set up the type for this + // purpose. + virtual void Reset(xenos::ShaderType shader_type); + + // Current host-side modification being generated. + uint32_t modification() const { return modification_; } // Register count. uint32_t register_count() const { return register_count_; } @@ -48,11 +57,6 @@ class ShaderTranslator { bool is_vertex_shader() const { return shader_type_ == xenos::ShaderType::kVertex; } - // If translating a vertex shader, type of the shader in a D3D11-like - // rendering pipeline. - Shader::HostVertexShaderType host_vertex_shader_type() const { - return host_vertex_shader_type_; - } // True if the current shader is a pixel shader. 
bool is_pixel_shader() const { return shader_type_ == xenos::ShaderType::kPixel; @@ -85,10 +89,8 @@ class ShaderTranslator { // True if the current shader overrides the pixel depth, set before // translation. Doesn't include writes with an empty used write mask. bool writes_depth() const { return writes_depth_; } - // True if Xenia can automatically enable early depth/stencil for the pixel - // shader when RB_DEPTHCONTROL EARLY_Z_ENABLE is not set, provided alpha - // testing and alpha to coverage are disabled. - bool implicit_early_z_allowed() const { return implicit_early_z_allowed_; } + // True if the current shader has any `kill` instructions. + bool kills_pixels() const { return kills_pixels_; } // A list of all vertex bindings, populated before translation occurs. const std::vector& vertex_bindings() const { return vertex_bindings_; @@ -112,6 +114,17 @@ class ShaderTranslator { return memexport_stream_constants_; } + // Whether the shader can have early depth and stencil writing enabled, unless + // alpha test or alpha to coverage is enabled. Data gathered before + // translation. + bool CanWriteZEarly() const { + // TODO(Triang3l): Investigate what happens to memexport when the pixel + // fails the depth/stencil test, but in Direct3D 11 UAV writes disable early + // depth/stencil. + return !writes_depth_ && !kills_pixels_ && + memexport_stream_constants_.empty(); + } + // Current line number in the ucode disassembly. size_t ucode_disasm_line_number() const { return ucode_disasm_line_number_; } // Ucode disassembly buffer accumulated during translation. @@ -130,10 +143,14 @@ class ShaderTranslator { } // Handles post-translation tasks when the shader has been fully translated. - virtual void PostTranslation(Shader* shader) {} + // setup_shader_post_translation_info: whether non-modification-specific + // parameters of the Shader behind the Translation can be set by this call. 
+ virtual void PostTranslation(Shader::Translation& translation, + bool setup_shader_post_translation_info) {} // Sets the host disassembly on a shader. - void set_host_disassembly(Shader* shader, std::string value) { - shader->host_disassembly_ = std::move(value); + void set_host_disassembly(Shader::Translation& translation, + std::string value) { + translation.host_disassembly_ = std::move(value); } // Pre-process a control-flow instruction before anything else. @@ -188,11 +205,9 @@ class ShaderTranslator { const char* name; uint32_t argument_count; uint32_t src_swizzle_component_count; - bool disable_implicit_early_z; }; - bool TranslateInternal(Shader* shader, - Shader::HostVertexShaderType host_vertex_shader_type); + bool TranslateInternal(Shader::Translation& translation); void MarkUcodeInstruction(uint32_t dword_offset); void AppendUcodeDisasm(char c); @@ -246,12 +261,13 @@ class ShaderTranslator { // Input shader metadata and microcode. xenos::ShaderType shader_type_; - Shader::HostVertexShaderType host_vertex_shader_type_; const uint32_t* ucode_dwords_; size_t ucode_dword_count_; - reg::SQ_PROGRAM_CNTL program_cntl_; uint32_t register_count_; + // Current host-side modification being generated. + uint32_t modification_ = 0; + // Accumulated translation errors. std::vector errors_; @@ -272,7 +288,8 @@ class ShaderTranslator { // translation. std::set label_addresses_; - // Detected binding information gathered before translation. + // Detected binding information gathered before translation. Must not be + // affected by the modification index. int total_attrib_count_ = 0; std::vector vertex_bindings_; std::vector texture_bindings_; @@ -282,13 +299,15 @@ class ShaderTranslator { // These all are gathered before translation. // uses_register_dynamic_addressing_ for writes, writes_color_targets_, // writes_depth_ don't include empty used write masks. + // Must not be affected by the modification index. 
Shader::ConstantRegisterMap constant_register_map_ = {0}; bool uses_register_dynamic_addressing_ = false; bool writes_color_targets_[4] = {false, false, false, false}; bool writes_depth_ = false; - bool implicit_early_z_allowed_ = true; + bool kills_pixels_ = false; // Memexport info is gathered before translation. + // Must not be affected by the modification index. uint32_t memexport_alloc_count_ = 0; // For register allocation in implementations - what was used after each // `alloc export`. diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index bb1bb51f0..0ff228d53 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -667,12 +667,14 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { return spirv_bytes; } -void SpirvShaderTranslator::PostTranslation(Shader* shader) { +void SpirvShaderTranslator::PostTranslation( + Shader::Translation& translation, bool setup_shader_post_translation_info) { // Validation. if (cvars::spv_validate) { auto validation = validator_.Validate( - reinterpret_cast(shader->translated_binary().data()), - shader->translated_binary().size() / sizeof(uint32_t)); + reinterpret_cast( + translation.translated_binary().data()), + translation.translated_binary().size() / sizeof(uint32_t)); if (validation->has_error()) { XELOGE("SPIR-V Shader Validation failed! Error: {}", validation->error_string()); @@ -682,12 +684,13 @@ void SpirvShaderTranslator::PostTranslation(Shader* shader) { if (cvars::spv_disasm) { // TODO(benvanik): only if needed? could be slowish. 
auto disasm = disassembler_.Disassemble( - reinterpret_cast(shader->translated_binary().data()), - shader->translated_binary().size() / 4); + reinterpret_cast( + translation.translated_binary().data()), + translation.translated_binary().size() / sizeof(uint32_t)); if (disasm->has_error()) { XELOGE("Failed to disassemble SPIRV - invalid?"); } else { - set_host_disassembly(shader, disasm->to_string()); + set_host_disassembly(translation, disasm->to_string()); } } } diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 044dea019..478aa3428 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -61,7 +61,8 @@ class SpirvShaderTranslator : public ShaderTranslator { protected: void StartTranslation() override; std::vector CompleteTranslation() override; - void PostTranslation(Shader* shader) override; + void PostTranslation(Shader::Translation& translation, + bool setup_shader_post_translation_info) override; void PreProcessControlFlowInstructions( std::vector instrs) override; diff --git a/src/xenia/gpu/trace_dump.cc b/src/xenia/gpu/trace_dump.cc index 984984c4a..fdebcfba4 100644 --- a/src/xenia/gpu/trace_dump.cc +++ b/src/xenia/gpu/trace_dump.cc @@ -92,7 +92,7 @@ int TraceDump::Main(const std::vector& args) { bool TraceDump::Setup() { // Create the emulator but don't initialize so we can setup the window. - emulator_ = std::make_unique("", "", ""); + emulator_ = std::make_unique("", "", "", ""); X_STATUS result = emulator_->Setup( nullptr, nullptr, [this]() { return CreateGraphicsSystem(); }, nullptr); if (XFAILED(result)) { diff --git a/src/xenia/gpu/trace_viewer.cc b/src/xenia/gpu/trace_viewer.cc index 5305c50ae..5297d6856 100644 --- a/src/xenia/gpu/trace_viewer.cc +++ b/src/xenia/gpu/trace_viewer.cc @@ -121,7 +121,7 @@ bool TraceViewer::Setup() { window_->Resize(1920, 1200); // Create the emulator but don't initialize so we can setup the window. 
- emulator_ = std::make_unique("", "", ""); + emulator_ = std::make_unique("", "", "", ""); X_STATUS result = emulator_->Setup( window_.get(), nullptr, [this]() { return CreateGraphicsSystem(); }, nullptr); @@ -566,8 +566,21 @@ TraceViewer::ShaderDisplayType TraceViewer::DrawShaderTypeUI() { void TraceViewer::DrawShaderUI(Shader* shader, ShaderDisplayType display_type) { // Must be prepared for advanced display modes. + // FIXME(Triang3l): This should display the actual translation used in the + // draw, but it may depend on multiple backend-related factors, including + // drawing multiple times with multiple modifications, even depending on + // values obtained during translation of other modifications (for instance, + // a memexporting shader can be executed both as a vertex shader (to draw the + // points) and as a compute shader (to actually export) if the host doesn't + // support writes from vertex shaders). + const Shader::Translation* translation = nullptr; if (display_type != ShaderDisplayType::kUcode) { - if (!shader->is_valid()) { + for (const auto& translation_pair : shader->translations()) { + if (translation_pair.second->is_valid()) { + translation = translation_pair.second; + } + } + if (!translation) { ImGui::TextColored(kColorError, "ERROR: shader error during parsing/translation"); return; @@ -580,7 +593,7 @@ void TraceViewer::DrawShaderUI(Shader* shader, ShaderDisplayType display_type) { break; } case ShaderDisplayType::kTranslated: { - const auto& str = shader->GetTranslatedBinaryString(); + const auto& str = translation->GetTranslatedBinaryString(); size_t i = 0; bool done = false; while (!done && i < str.size()) { @@ -600,7 +613,7 @@ void TraceViewer::DrawShaderUI(Shader* shader, ShaderDisplayType display_type) { break; } case ShaderDisplayType::kHostDisasm: { - DrawMultilineString(shader->host_disassembly()); + DrawMultilineString(translation->host_disassembly()); break; } } diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index 
21ccbaff9..85b52a377 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -1147,6 +1147,19 @@ enum class AluScalarOpcode : uint32_t { kRetainPrev = 50, }; +constexpr bool AluScalarOpcodeIsKill(AluScalarOpcode scalar_opcode) { + switch (scalar_opcode) { + case AluScalarOpcode::kKillsEq: + case AluScalarOpcode::kKillsGt: + case AluScalarOpcode::kKillsGe: + case AluScalarOpcode::kKillsNe: + case AluScalarOpcode::kKillsOne: + return true; + default: + return false; + } +} + enum class AluVectorOpcode : uint32_t { // Per-Component Floating-Point Add // add/ADDv dest, src0, src1 @@ -1471,27 +1484,37 @@ enum class AluVectorOpcode : uint32_t { kMaxA = 29, }; +constexpr bool AluVectorOpcodeIsKill(AluVectorOpcode vector_opcode) { + switch (vector_opcode) { + case AluVectorOpcode::kKillEq: + case AluVectorOpcode::kKillGt: + case AluVectorOpcode::kKillGe: + case AluVectorOpcode::kKillNe: + return true; + default: + return false; + } +} + // Whether the vector instruction has side effects such as discarding a pixel or // setting the predicate and can't be ignored even if it doesn't write to // anywhere. Note that all scalar operations except for retain_prev have a side // effect of modifying the previous scalar result register, so they must always // be executed even if not writing. 
constexpr bool AluVectorOpHasSideEffects(AluVectorOpcode vector_opcode) { + if (AluVectorOpcodeIsKill(vector_opcode)) { + return true; + } switch (vector_opcode) { case AluVectorOpcode::kSetpEqPush: case AluVectorOpcode::kSetpNePush: case AluVectorOpcode::kSetpGtPush: case AluVectorOpcode::kSetpGePush: - case AluVectorOpcode::kKillEq: - case AluVectorOpcode::kKillGt: - case AluVectorOpcode::kKillGe: - case AluVectorOpcode::kKillNe: case AluVectorOpcode::kMaxA: return true; default: - break; + return false; } - return false; } // Whether each component of a source operand is used at all in the instruction diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc index 8db418de9..3ab45245c 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/pipeline_cache.cc @@ -362,35 +362,38 @@ VkPipeline PipelineCache::GetPipeline(const RenderState* render_state, return pipeline; } -bool PipelineCache::TranslateShader(VulkanShader* shader, - reg::SQ_PROGRAM_CNTL cntl) { +bool PipelineCache::TranslateShader( + VulkanShader::VulkanTranslation& translation, reg::SQ_PROGRAM_CNTL cntl) { // Perform translation. // If this fails the shader will be marked as invalid and ignored later. - if (!shader_translator_->Translate(shader, cntl)) { + if (!shader_translator_->Translate(translation, cntl)) { XELOGE("Shader translation failed; marking shader as ignored"); return false; } // Prepare the shader for use (creates our VkShaderModule). // It could still fail at this point. - if (!shader->Prepare()) { + if (!translation.Prepare()) { XELOGE("Shader preparation failed; marking shader as ignored"); return false; } - if (shader->is_valid()) { + if (translation.is_valid()) { XELOGGPU("Generated {} shader ({}b) - hash {:016X}:\n{}\n", - shader->type() == xenos::ShaderType::kVertex ? 
"vertex" : "pixel", - shader->ucode_dword_count() * 4, shader->ucode_data_hash(), - shader->ucode_disassembly()); + translation.shader().type() == xenos::ShaderType::kVertex + ? "vertex" + : "pixel", + translation.shader().ucode_dword_count() * 4, + translation.shader().ucode_data_hash(), + translation.shader().ucode_disassembly()); } // Dump shader files if desired. if (!cvars::dump_shaders.empty()) { - shader->Dump(cvars::dump_shaders, "vk"); + translation.Dump(cvars::dump_shaders, "vk"); } - return shader->is_valid(); + return translation.is_valid(); } static void DumpShaderStatisticsAMD(const VkShaderStatisticsInfoAMD& stats) { @@ -1063,16 +1066,28 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( return UpdateStatus::kCompatible; } - if (!vertex_shader->is_translated() && - !TranslateShader(vertex_shader, regs.sq_program_cntl)) { + VulkanShader::VulkanTranslation* vertex_shader_translation = + static_cast( + vertex_shader->GetOrCreateTranslation( + shader_translator_->GetDefaultModification( + xenos::ShaderType::kVertex))); + if (!vertex_shader_translation->is_translated() && + !TranslateShader(*vertex_shader_translation, regs.sq_program_cntl)) { XELOGE("Failed to translate the vertex shader!"); return UpdateStatus::kError; } - if (pixel_shader && !pixel_shader->is_translated() && - !TranslateShader(pixel_shader, regs.sq_program_cntl)) { - XELOGE("Failed to translate the pixel shader!"); - return UpdateStatus::kError; + VulkanShader::VulkanTranslation* pixel_shader_translation = nullptr; + if (pixel_shader) { + pixel_shader_translation = static_cast( + pixel_shader->GetOrCreateTranslation( + shader_translator_->GetDefaultModification( + xenos::ShaderType::kPixel))); + if (!pixel_shader_translation->is_translated() && + !TranslateShader(*pixel_shader_translation, regs.sq_program_cntl)) { + XELOGE("Failed to translate the pixel shader!"); + return UpdateStatus::kError; + } } update_shader_stages_stage_count_ = 0; @@ -1084,7 +1099,7 @@ 
PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( vertex_pipeline_stage.pNext = nullptr; vertex_pipeline_stage.flags = 0; vertex_pipeline_stage.stage = VK_SHADER_STAGE_VERTEX_BIT; - vertex_pipeline_stage.module = vertex_shader->shader_module(); + vertex_pipeline_stage.module = vertex_shader_translation->shader_module(); vertex_pipeline_stage.pName = "main"; vertex_pipeline_stage.pSpecializationInfo = nullptr; @@ -1116,8 +1131,9 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( pixel_pipeline_stage.pNext = nullptr; pixel_pipeline_stage.flags = 0; pixel_pipeline_stage.stage = VK_SHADER_STAGE_FRAGMENT_BIT; - pixel_pipeline_stage.module = - pixel_shader ? pixel_shader->shader_module() : dummy_pixel_shader_; + pixel_pipeline_stage.module = pixel_shader_translation + ? pixel_shader_translation->shader_module() + : dummy_pixel_shader_; pixel_pipeline_stage.pName = "main"; pixel_pipeline_stage.pSpecializationInfo = nullptr; diff --git a/src/xenia/gpu/vulkan/pipeline_cache.h b/src/xenia/gpu/vulkan/pipeline_cache.h index 3e03dce1e..693dd4594 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.h +++ b/src/xenia/gpu/vulkan/pipeline_cache.h @@ -79,7 +79,8 @@ class PipelineCache { // state. 
VkPipeline GetPipeline(const RenderState* render_state, uint64_t hash_key); - bool TranslateShader(VulkanShader* shader, reg::SQ_PROGRAM_CNTL cntl); + bool TranslateShader(VulkanShader::VulkanTranslation& translation, + reg::SQ_PROGRAM_CNTL cntl); void DumpShaderDisasmAMD(VkPipeline pipeline); void DumpShaderDisasmNV(const VkGraphicsPipelineCreateInfo& info); diff --git a/src/xenia/gpu/vulkan/vulkan_shader.cc b/src/xenia/gpu/vulkan/vulkan_shader.cc index 659ad9326..2eb41e9e5 100644 --- a/src/xenia/gpu/vulkan/vulkan_shader.cc +++ b/src/xenia/gpu/vulkan/vulkan_shader.cc @@ -27,38 +27,56 @@ VulkanShader::VulkanShader(ui::vulkan::VulkanDevice* device, const uint32_t* dword_ptr, uint32_t dword_count) : Shader(shader_type, data_hash, dword_ptr, dword_count), device_(device) {} -VulkanShader::~VulkanShader() { +VulkanShader::VulkanTranslation::~VulkanTranslation() { if (shader_module_) { - vkDestroyShaderModule(*device_, shader_module_, nullptr); + const VulkanShader& vulkan_shader = static_cast(shader()); + vkDestroyShaderModule(*vulkan_shader.device_, shader_module_, nullptr); shader_module_ = nullptr; } } -bool VulkanShader::Prepare() { +bool VulkanShader::VulkanTranslation::Prepare() { assert_null(shader_module_); assert_true(is_valid()); + const VulkanShader& vulkan_shader = static_cast(shader()); + ui::vulkan::VulkanDevice* device = vulkan_shader.device_; + // Create the shader module. 
VkShaderModuleCreateInfo shader_info; shader_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; shader_info.pNext = nullptr; shader_info.flags = 0; - shader_info.codeSize = translated_binary_.size(); + shader_info.codeSize = translated_binary().size(); shader_info.pCode = - reinterpret_cast(translated_binary_.data()); + reinterpret_cast(translated_binary().data()); auto status = - vkCreateShaderModule(*device_, &shader_info, nullptr, &shader_module_); + vkCreateShaderModule(*device, &shader_info, nullptr, &shader_module_); CheckResult(status, "vkCreateShaderModule"); - char typeChar = shader_type_ == xenos::ShaderType::kPixel - ? 'p' - : shader_type_ == xenos::ShaderType::kVertex ? 'v' : 'u'; - device_->DbgSetObjectName( - uint64_t(shader_module_), VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT, - fmt::format("S({}): {:016X}", typeChar, ucode_data_hash())); + char type_char; + switch (vulkan_shader.type()) { + case xenos::ShaderType::kVertex: + type_char = 'v'; + break; + case xenos::ShaderType::kPixel: + type_char = 'p'; + break; + default: + type_char = 'u'; + } + device->DbgSetObjectName(uint64_t(shader_module_), + VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT, + fmt::format("S({}): {:016X}", type_char, + vulkan_shader.ucode_data_hash())); return status == VK_SUCCESS; } +Shader::Translation* VulkanShader::CreateTranslationInstance( + uint32_t modification) { + return new VulkanTranslation(*this, modification); +} + } // namespace vulkan } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/vulkan/vulkan_shader.h b/src/xenia/gpu/vulkan/vulkan_shader.h index 9dd64a22c..7d948ac71 100644 --- a/src/xenia/gpu/vulkan/vulkan_shader.h +++ b/src/xenia/gpu/vulkan/vulkan_shader.h @@ -21,19 +21,30 @@ namespace vulkan { class VulkanShader : public Shader { public: + class VulkanTranslation : public Translation { + public: + VulkanTranslation(VulkanShader& shader, uint32_t modification) + : Translation(shader, modification) {} + ~VulkanTranslation() override; + 
+ bool Prepare(); + + // Available only if the translation is_valid and has been prepared. + VkShaderModule shader_module() const { return shader_module_; } + + private: + VkShaderModule shader_module_ = nullptr; + }; + VulkanShader(ui::vulkan::VulkanDevice* device, xenos::ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr, uint32_t dword_count); - ~VulkanShader() override; - // Available only if the shader is_valid and has been prepared. - VkShaderModule shader_module() const { return shader_module_; } - - bool Prepare(); + protected: + Translation* CreateTranslationInstance(uint32_t modification) override; private: ui::vulkan::VulkanDevice* device_ = nullptr; - VkShaderModule shader_module_ = nullptr; }; } // namespace vulkan diff --git a/src/xenia/gpu/xenos.cc b/src/xenia/gpu/xenos.cc index 4f9e2875f..faaf4818d 100644 --- a/src/xenia/gpu/xenos.cc +++ b/src/xenia/gpu/xenos.cc @@ -9,17 +9,41 @@ #include "xenia/gpu/xenos.h" +#include + #include "xenia/base/math.h" namespace xe { namespace gpu { namespace xenos { +// Based on CFloat24 from d3dref9.dll and the 6e4 code from: +// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp +// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2). + +uint32_t Float32To20e4(float f32) { + if (!(f32 > 0.0f)) { + // Positive only, and not -0 or NaN. + return 0; + } + uint32_t f32u32 = *reinterpret_cast(&f32); + if (f32u32 >= 0x3FFFFFF8) { + // Saturate. + return 0xFFFFFF; + } + if (f32u32 < 0x38800000) { + // The number is too small to be represented as a normalized 20e4. + // Convert it to a denormalized value. + uint32_t shift = std::min(uint32_t(113 - (f32u32 >> 23)), uint32_t(24)); + f32u32 = (0x800000 | (f32u32 & 0x7FFFFF)) >> shift; + } else { + // Rebias the exponent to represent the value as a normalized 20e4. 
+ f32u32 += 0xC8000000u; + } + return ((f32u32 + 3 + ((f32u32 >> 3) & 1)) >> 3) & 0xFFFFFF; +} + float Float20e4To32(uint32_t f24) { - // Based on CFloat24 from d3dref9.dll and the 6e4 code from: - // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp - // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows - // [0,2). f24 &= 0xFFFFFF; if (!f24) { return 0.0f; diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 4117a8293..542372569 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -305,6 +305,9 @@ enum class DepthRenderTargetFormat : uint32_t { const char* GetDepthRenderTargetFormatName(DepthRenderTargetFormat format); +// Converts an IEEE-754 32-bit floating-point number to Xenos floating-point +// depth, rounding to the nearest even. +uint32_t Float32To20e4(float f32); // Converts Xenos floating-point depth in bits 0:23 (not clamping) to an // IEEE-754 32-bit floating-point number. float Float20e4To32(uint32_t f24);