From 45050b2380022dcd39d79bd071e87f4131cabf62 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sun, 9 Oct 2022 22:06:41 +0300 Subject: [PATCH] [GPU] Vulkan fragment shader interlock RB and related fixes/cleanup Also fixes addressing of MSAA samples 2 and 3 for 64bpp color render targets in the ROV RB implementation on Direct3D 12. Additionally, with FSI/ROV, alpha test and alpha to coverage are done only if the render target 0 was dynamically written to (according to the Direct3D 9 rules for writing to color render targets, though not sure if they actually apply to the alpha tests on Direct3D 9, but for safety). There is also some code cleanup for things spotted during the development of the feature. --- .../gpu/d3d12/d3d12_command_processor.cc | 9 +- src/xenia/gpu/dxbc_shader_translator.h | 51 +- src/xenia/gpu/dxbc_shader_translator_om.cc | 252 +- src/xenia/gpu/render_target_cache.cc | 128 + src/xenia/gpu/render_target_cache.h | 48 + src/xenia/gpu/shader_compiler_main.cc | 15 +- src/xenia/gpu/spirv_shader_translator.cc | 844 ++-- src/xenia/gpu/spirv_shader_translator.h | 221 +- src/xenia/gpu/spirv_shader_translator_alu.cc | 28 +- .../gpu/spirv_shader_translator_fetch.cc | 32 +- src/xenia/gpu/spirv_shader_translator_rb.cc | 3495 ++++++++++++++++- .../gpu/vulkan/vulkan_command_processor.cc | 747 +++- .../gpu/vulkan/vulkan_command_processor.h | 14 +- .../gpu/vulkan/vulkan_graphics_system.cc | 9 + src/xenia/gpu/vulkan/vulkan_graphics_system.h | 4 +- src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc | 574 +-- src/xenia/gpu/vulkan/vulkan_pipeline_cache.h | 4 + .../gpu/vulkan/vulkan_render_target_cache.cc | 1056 +++-- .../gpu/vulkan/vulkan_render_target_cache.h | 64 +- ...ce_khr_get_physical_device_properties2.inc | 2 + src/xenia/ui/vulkan/vulkan_provider.cc | 74 +- src/xenia/ui/vulkan/vulkan_provider.h | 14 + tools/shader-playground/Editor.Designer.cs | 7 +- tools/shader-playground/Editor.cs | 6 +- 24 files changed, 6168 insertions(+), 1530 deletions(-) diff --git 
a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 129f89fd0..e39418cc8 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -3189,15 +3189,14 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( // flow. reg::RB_COLOR_INFO color_infos[4]; float rt_clamp[4][4]; + // Two UINT32_MAX if no components actually existing in the RT are written. uint32_t rt_keep_masks[4][2]; for (uint32_t i = 0; i < 4; ++i) { auto color_info = regs.Get( reg::RB_COLOR_INFO::rt_register_indices[i]); color_infos[i] = color_info; if (edram_rov_used) { - // Get the mask for keeping previous color's components unmodified, - // or two UINT32_MAX if no colors actually existing in the RT are written. - DxbcShaderTranslator::ROV_GetColorFormatSystemConstants( + RenderTargetCache::GetPSIColorFormatInfo( color_info.color_format, (normalized_color_mask >> (i * 4)) & 0b1111, rt_clamp[i][0], rt_clamp[i][1], rt_clamp[i][2], rt_clamp[i][3], rt_keep_masks[i][0], rt_keep_masks[i][1]); @@ -3506,8 +3505,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( rt_base_dwords_scaled; system_constants_.edram_rt_base_dwords_scaled[i] = rt_base_dwords_scaled; - uint32_t format_flags = DxbcShaderTranslator::ROV_AddColorFormatFlags( - color_info.color_format); + uint32_t format_flags = + RenderTargetCache::AddPSIColorFormatFlags(color_info.color_format); dirty |= system_constants_.edram_rt_format_flags[i] != format_flags; system_constants_.edram_rt_format_flags[i] = format_flags; // Can't do float comparisons here because NaNs would result in always diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 9679e43a2..a75597011 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -267,19 +267,6 @@ class DxbcShaderTranslator : public ShaderTranslator { }; static_assert(kSysFlag_Count <= 32, "Too many flags in 
the system constants"); - // Appended to the format in the format constant. - enum : uint32_t { - // Starting from bit 4 because the format itself needs 4 bits. - kRTFormatFlag_64bpp_Shift = 4, - // Requires clamping of blending sources and factors. - kRTFormatFlag_FixedPointColor_Shift, - kRTFormatFlag_FixedPointAlpha_Shift, - - kRTFormatFlag_64bpp = 1u << kRTFormatFlag_64bpp_Shift, - kRTFormatFlag_FixedPointColor = 1u << kRTFormatFlag_FixedPointColor_Shift, - kRTFormatFlag_FixedPointAlpha = 1u << kRTFormatFlag_FixedPointAlpha_Shift, - }; - // IF SYSTEM CONSTANTS ARE CHANGED OR ADDED, THE FOLLOWING MUST BE UPDATED: // - SystemConstants::Index enum. // - system_constant_rdef_. @@ -383,7 +370,8 @@ class DxbcShaderTranslator : public ShaderTranslator { uint32_t edram_rt_base_dwords_scaled[4]; - // RT format combined with kRTFormatFlags. + // RT format combined with RenderTargetCache::kPSIColorFormatFlag values + // (pass via RenderTargetCache::AddPSIColorFormatFlags). uint32_t edram_rt_format_flags[4]; // Format info - values to clamp the color to before blending or storing. @@ -524,40 +512,6 @@ class DxbcShaderTranslator : public ShaderTranslator { kEdram, }; - // Returns the format with internal flags for passing via the - // edram_rt_format_flags system constant. 
- static constexpr uint32_t ROV_AddColorFormatFlags( - xenos::ColorRenderTargetFormat format) { - uint32_t format_flags = uint32_t(format); - if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16 || - format == xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT || - format == xenos::ColorRenderTargetFormat::k_32_32_FLOAT) { - format_flags |= kRTFormatFlag_64bpp; - } - if (format == xenos::ColorRenderTargetFormat::k_8_8_8_8 || - format == xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA || - format == xenos::ColorRenderTargetFormat::k_2_10_10_10 || - format == xenos::ColorRenderTargetFormat::k_16_16 || - format == xenos::ColorRenderTargetFormat::k_16_16_16_16 || - format == xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10) { - format_flags |= - kRTFormatFlag_FixedPointColor | kRTFormatFlag_FixedPointAlpha; - } else if (format == xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT || - format == xenos::ColorRenderTargetFormat:: - k_2_10_10_10_FLOAT_AS_16_16_16_16) { - format_flags |= kRTFormatFlag_FixedPointAlpha; - } - return format_flags; - } - // Returns the bits that need to be added to the RT flags constant - needs to - // be done externally, not in SetColorFormatConstants, because the flags - // contain other state. - static void ROV_GetColorFormatSystemConstants( - xenos::ColorRenderTargetFormat format, uint32_t write_mask, - float& clamp_rgb_low, float& clamp_alpha_low, float& clamp_rgb_high, - float& clamp_alpha_high, uint32_t& keep_mask_low, - uint32_t& keep_mask_high); - uint64_t GetDefaultVertexShaderModification( uint32_t dynamic_addressable_register_count, Shader::HostVertexShaderType host_vertex_shader_type = @@ -772,6 +726,7 @@ class DxbcShaderTranslator : public ShaderTranslator { // Whether it's possible and worth skipping running the translated shader for // 2x2 quads. 
bool ROV_IsDepthStencilEarly() const { + assert_true(edram_rov_used_); return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() && !current_shader().is_valid_memexport_used(); } diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc index a4d1b3c83..412e003ec 100644 --- a/src/xenia/gpu/dxbc_shader_translator_om.cc +++ b/src/xenia/gpu/dxbc_shader_translator_om.cc @@ -14,139 +14,13 @@ #include "xenia/base/assert.h" #include "xenia/base/math.h" #include "xenia/gpu/draw_util.h" +#include "xenia/gpu/render_target_cache.h" #include "xenia/gpu/texture_cache.h" namespace xe { namespace gpu { using namespace ucode; -void DxbcShaderTranslator::ROV_GetColorFormatSystemConstants( - xenos::ColorRenderTargetFormat format, uint32_t write_mask, - float& clamp_rgb_low, float& clamp_alpha_low, float& clamp_rgb_high, - float& clamp_alpha_high, uint32_t& keep_mask_low, - uint32_t& keep_mask_high) { - keep_mask_low = keep_mask_high = 0; - switch (format) { - case xenos::ColorRenderTargetFormat::k_8_8_8_8: - case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { - clamp_rgb_low = clamp_alpha_low = 0.0f; - clamp_rgb_high = clamp_alpha_high = 1.0f; - for (uint32_t i = 0; i < 4; ++i) { - if (!(write_mask & (1 << i))) { - keep_mask_low |= uint32_t(0xFF) << (i * 8); - } - } - } break; - case xenos::ColorRenderTargetFormat::k_2_10_10_10: - case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { - clamp_rgb_low = clamp_alpha_low = 0.0f; - clamp_rgb_high = clamp_alpha_high = 1.0f; - for (uint32_t i = 0; i < 3; ++i) { - if (!(write_mask & (1 << i))) { - keep_mask_low |= uint32_t(0x3FF) << (i * 10); - } - } - if (!(write_mask & 0b1000)) { - keep_mask_low |= uint32_t(3) << 30; - } - } break; - case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: - case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: { - clamp_rgb_low = clamp_alpha_low = 0.0f; - clamp_rgb_high = 31.875f; - clamp_alpha_high = 1.0f; - for 
(uint32_t i = 0; i < 3; ++i) { - if (!(write_mask & (1 << i))) { - keep_mask_low |= uint32_t(0x3FF) << (i * 10); - } - } - if (!(write_mask & 0b1000)) { - keep_mask_low |= uint32_t(3) << 30; - } - } break; - case xenos::ColorRenderTargetFormat::k_16_16: - case xenos::ColorRenderTargetFormat::k_16_16_16_16: - // Alpha clamping affects blending source, so it's non-zero for alpha for - // k_16_16 (the render target is fixed-point). There's one deviation from - // how Direct3D 11.3 functional specification defines SNorm conversion - // (NaN should be 0, not the lowest negative number), but NaN handling in - // output shouldn't be very important. - clamp_rgb_low = clamp_alpha_low = -32.0f; - clamp_rgb_high = clamp_alpha_high = 32.0f; - if (!(write_mask & 0b0001)) { - keep_mask_low |= 0xFFFFu; - } - if (!(write_mask & 0b0010)) { - keep_mask_low |= 0xFFFF0000u; - } - if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16) { - if (!(write_mask & 0b0100)) { - keep_mask_high |= 0xFFFFu; - } - if (!(write_mask & 0b1000)) { - keep_mask_high |= 0xFFFF0000u; - } - } else { - write_mask &= 0b0011; - } - break; - case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: - case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT: - // No NaNs on the Xbox 360 GPU, though can't use the extended range with - // f32tof16. - clamp_rgb_low = clamp_alpha_low = -65504.0f; - clamp_rgb_high = clamp_alpha_high = 65504.0f; - if (!(write_mask & 0b0001)) { - keep_mask_low |= 0xFFFFu; - } - if (!(write_mask & 0b0010)) { - keep_mask_low |= 0xFFFF0000u; - } - if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT) { - if (!(write_mask & 0b0100)) { - keep_mask_high |= 0xFFFFu; - } - if (!(write_mask & 0b1000)) { - keep_mask_high |= 0xFFFF0000u; - } - } else { - write_mask &= 0b0011; - } - break; - case xenos::ColorRenderTargetFormat::k_32_FLOAT: - // No clamping - let min/max always pick the original value. 
- clamp_rgb_low = clamp_alpha_low = clamp_rgb_high = clamp_alpha_high = - std::nanf(""); - write_mask &= 0b0001; - if (!(write_mask & 0b0001)) { - keep_mask_low = ~uint32_t(0); - } - break; - case xenos::ColorRenderTargetFormat::k_32_32_FLOAT: - // No clamping - let min/max always pick the original value. - clamp_rgb_low = clamp_alpha_low = clamp_rgb_high = clamp_alpha_high = - std::nanf(""); - write_mask &= 0b0011; - if (!(write_mask & 0b0001)) { - keep_mask_low = ~uint32_t(0); - } - if (!(write_mask & 0b0010)) { - keep_mask_high = ~uint32_t(0); - } - break; - default: - assert_unhandled_case(format); - // Disable invalid render targets. - write_mask = 0; - break; - } - // Special case handled in the shaders for empty write mask to completely skip - // a disabled render target: all keep bits are set. - if (!write_mask) { - keep_mask_low = keep_mask_high = ~uint32_t(0); - } -} - void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { bool any_color_targets_written = current_shader().writes_color_targets() != 0; @@ -484,8 +358,8 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { { // Copy the 4x AA coverage to system_temp_rov_params_.x, making top-right // the sample [2] and bottom-left the sample [1] (the opposite of Direct3D - // 12), because on the Xbox 360, 2x MSAA doubles the storage width, 4x MSAA - // doubles the storage height. + // 12), because on the Xbox 360, 2x MSAA doubles the storage height, 4x MSAA + // doubles the storage width. // Flip samples in bits 0:1 to bits 29:30. a_.OpBFRev(dxbc::Dest::R(system_temp_rov_params_, 0b0001), dxbc::Src::VCoverage()); @@ -1304,7 +1178,7 @@ void DxbcShaderTranslator::ROV_UnpackColor( // k_8_8_8_8_GAMMA // *************************************************************************** for (uint32_t i = 0; i < 2; ++i) { - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( i ? 
xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA : xenos::ColorRenderTargetFormat::k_8_8_8_8))); // Unpack the components. @@ -1328,9 +1202,9 @@ void DxbcShaderTranslator::ROV_UnpackColor( // k_2_10_10_10 // k_2_10_10_10_AS_10_10_10_10 // *************************************************************************** - a_.OpCase(dxbc::Src::LU( - ROV_AddColorFormatFlags(xenos::ColorRenderTargetFormat::k_2_10_10_10))); - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10))); + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10))); { // Unpack the components. @@ -1350,9 +1224,9 @@ void DxbcShaderTranslator::ROV_UnpackColor( // k_2_10_10_10_FLOAT_AS_16_16_16_16 // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp // *************************************************************************** - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT))); - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16))); { // Unpack the alpha. @@ -1381,7 +1255,7 @@ void DxbcShaderTranslator::ROV_UnpackColor( // k_16_16_16_16 (64bpp) // *************************************************************************** for (uint32_t i = 0; i < 2; ++i) { - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( i ? 
xenos::ColorRenderTargetFormat::k_16_16_16_16 : xenos::ColorRenderTargetFormat::k_16_16))); dxbc::Dest color_components_dest( @@ -1404,7 +1278,7 @@ void DxbcShaderTranslator::ROV_UnpackColor( // k_16_16_16_16_FLOAT (64bpp) // *************************************************************************** for (uint32_t i = 0; i < 2; ++i) { - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( i ? xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT : xenos::ColorRenderTargetFormat::k_16_16_FLOAT))); dxbc::Dest color_components_dest( @@ -1465,7 +1339,7 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor( // k_8_8_8_8_GAMMA // *************************************************************************** for (uint32_t i = 0; i < 2; ++i) { - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( i ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA : xenos::ColorRenderTargetFormat::k_8_8_8_8))); for (uint32_t j = 0; j < 4; ++j) { @@ -1496,9 +1370,9 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor( // k_2_10_10_10 // k_2_10_10_10_AS_10_10_10_10 // *************************************************************************** - a_.OpCase(dxbc::Src::LU( - ROV_AddColorFormatFlags(xenos::ColorRenderTargetFormat::k_2_10_10_10))); - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10))); + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10))); for (uint32_t i = 0; i < 4; ++i) { // Denormalize and convert to fixed-point. 
@@ -1518,9 +1392,9 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor( // k_2_10_10_10_FLOAT_AS_16_16_16_16 // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp // *************************************************************************** - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT))); - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16))); { // Convert red directly to the destination, which may be the same as the @@ -1550,7 +1424,7 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor( // k_16_16_16_16 (64bpp) // *************************************************************************** for (uint32_t i = 0; i < 2; ++i) { - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( i ? xenos::ColorRenderTargetFormat::k_16_16_16_16 : xenos::ColorRenderTargetFormat::k_16_16))); for (uint32_t j = 0; j < (uint32_t(2) << i); ++j) { @@ -1582,7 +1456,7 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor( // k_16_16_16_16_FLOAT (64bpp) // *************************************************************************** for (uint32_t i = 0; i < 2; ++i) { - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( i ? xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT : xenos::ColorRenderTargetFormat::k_16_16_FLOAT))); for (uint32_t j = 0; j < (uint32_t(2) << i); ++j) { @@ -2230,7 +2104,8 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Load whether the render target is 64bpp to system_temp_rov_params_.y to // get the needed relative sample address. 
a_.OpAnd(dxbc::Dest::R(system_temp_rov_params_, 0b0010), - rt_format_flags_src, dxbc::Src::LU(kRTFormatFlag_64bpp)); + rt_format_flags_src, + dxbc::Src::LU(RenderTargetCache::kPSIColorFormatFlag_64bpp)); // Choose the relative sample address for the render target to // system_temp_rov_params_.y. a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0010), @@ -2287,7 +2162,8 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Get if the blending source color is fixed-point for clamping if it is. // temp.x = whether color is fixed-point. a_.OpAnd(temp_x_dest, rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_FixedPointColor)); + dxbc::Src::LU( + RenderTargetCache::kPSIColorFormatFlag_FixedPointColor)); // Check if the blending source color is fixed-point and needs clamping. // temp.x = free. a_.OpIf(true, temp_x_src); @@ -2306,7 +2182,8 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Get if the blending source alpha is fixed-point for clamping if it is. // temp.x = whether alpha is fixed-point. a_.OpAnd(temp_x_dest, rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_FixedPointAlpha)); + dxbc::Src::LU( + RenderTargetCache::kPSIColorFormatFlag_FixedPointAlpha)); // Check if the blending source alpha is fixed-point and needs clamping. // temp.x = free. a_.OpIf(true, temp_x_src); @@ -2387,7 +2264,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Get if the format is 64bpp to temp.w. // temp.w = whether the render target is 64bpp. a_.OpAnd(temp_w_dest, rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_64bpp)); + dxbc::Src::LU(RenderTargetCache::kPSIColorFormatFlag_64bpp)); // Check if the format is 64bpp. // temp.w = free. a_.OpIf(true, temp_w_src); @@ -2478,8 +2355,10 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Get if the render target color is fixed-point and the source // color factor needs clamping to temp.x. // temp.x = whether color is fixed-point. 
- a_.OpAnd(temp_x_dest, rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_FixedPointColor)); + a_.OpAnd( + temp_x_dest, rt_format_flags_src, + dxbc::Src::LU( + RenderTargetCache::kPSIColorFormatFlag_FixedPointColor)); // Check if the source color factor needs clamping. a_.OpIf(true, temp_x_src); { @@ -2558,8 +2437,10 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Get if the render target color is fixed-point and the // destination color factor needs clamping to temp.x. // temp.x = whether color is fixed-point. - a_.OpAnd(temp_x_dest, rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_FixedPointColor)); + a_.OpAnd( + temp_x_dest, rt_format_flags_src, + dxbc::Src::LU( + RenderTargetCache::kPSIColorFormatFlag_FixedPointColor)); // Check if the destination color factor needs clamping. a_.OpIf(true, temp_x_src); { @@ -2701,8 +2582,10 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Get if the render target alpha is fixed-point and the source // alpha factor needs clamping to temp.y. // temp.y = whether alpha is fixed-point. - a_.OpAnd(temp_y_dest, rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_FixedPointAlpha)); + a_.OpAnd( + temp_y_dest, rt_format_flags_src, + dxbc::Src::LU( + RenderTargetCache::kPSIColorFormatFlag_FixedPointAlpha)); // Check if the source alpha factor needs clamping. a_.OpIf(true, temp_y_src); { @@ -2769,9 +2652,11 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // destination alpha factor needs clamping. // alpha_is_fixed_temp.x = whether alpha is fixed-point. uint32_t alpha_is_fixed_temp = PushSystemTemp(); - a_.OpAnd(dxbc::Dest::R(alpha_is_fixed_temp, 0b0001), - rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_FixedPointAlpha)); + a_.OpAnd( + dxbc::Dest::R(alpha_is_fixed_temp, 0b0001), + rt_format_flags_src, + dxbc::Src::LU( + RenderTargetCache::kPSIColorFormatFlag_FixedPointAlpha)); // Check if the destination alpha factor needs clamping. 
a_.OpIf(true, dxbc::Src::R(alpha_is_fixed_temp, dxbc::Src::kXXXX)); @@ -2925,7 +2810,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Get if the format is 64bpp to temp.z. // temp.z = whether the render target is 64bpp. a_.OpAnd(temp_z_dest, rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_64bpp)); + dxbc::Src::LU(RenderTargetCache::kPSIColorFormatFlag_64bpp)); // Check if the format is 64bpp. // temp.z = free. a_.OpIf(true, temp_z_src); @@ -2954,16 +2839,29 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Close the sample covered check. a_.OpEndIf(); - // Go to the next sample (samples are at +0, +(80*scale_x), +1, - // +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1), - // +(80*scale_x) and -(80*scale_x+1) after each sample). + // Go to the next sample (samples are at +0, +(80*scale_x), +dwpp, + // +(80*scale_x+dwpp), so need to do +(80*scale_x), -(80*scale_x-dwpp), + // +(80*scale_x) and -(80*scale_x+dwpp) after each sample). // Though no need to do this for the last sample as for the next render // target, the address will be recalculated. if (j < 3) { - a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010), - dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), - dxbc::Src::LI((j & 1) ? -int32_t(tile_width) + 2 - j - : int32_t(tile_width))); + if (j & 1) { + // temp.z = whether the render target is 64bpp. + a_.OpAnd(temp_z_dest, rt_format_flags_src, + dxbc::Src::LU(RenderTargetCache::kPSIColorFormatFlag_64bpp)); + // temp.z = offset from the current sample to the next. + a_.OpMovC(temp_z_dest, temp_z_src, + dxbc::Src::LI(-int32_t(tile_width) + 2 * (2 - int32_t(j))), + dxbc::Src::LI(-int32_t(tile_width) + (2 - int32_t(j)))); + // temp.z = free. 
+ a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010), + dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), + temp_z_src); + } else { + a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010), + dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), + dxbc::Src::LU(tile_width)); + } } } @@ -2987,6 +2885,17 @@ void DxbcShaderTranslator::CompletePixelShader() { if (current_shader().writes_color_target(0) && !IsForceEarlyDepthStencilGlobalFlagEnabled()) { + if (edram_rov_used_) { + // Check if the render target 0 was written to on the execution path. + uint32_t rt_0_written_temp = PushSystemTemp(); + a_.OpAnd(dxbc::Dest::R(rt_0_written_temp, 0b0001), + dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX), + dxbc::Src::LU(1 << 8)); + a_.OpIf(true, dxbc::Src::R(rt_0_written_temp, dxbc::Src::kXXXX)); + // Release rt_0_written_temp. + PopSystemTemp(); + } + // Alpha test. // X - mask, then masked result (SGPR for loading, VGPR for masking). // Y - operation result (SGPR for mask operations, VGPR for alpha @@ -3057,10 +2966,15 @@ void DxbcShaderTranslator::CompletePixelShader() { a_.OpEndIf(); // Release alpha_test_temp. PopSystemTemp(); - } - // Discard samples with alpha to coverage. - CompletePixelShader_AlphaToMask(); + // Discard samples with alpha to coverage. + CompletePixelShader_AlphaToMask(); + + if (edram_rov_used_) { + // Close the render target 0 written check. + a_.OpEndIf(); + } + } // Write the values to the render targets. 
Not applying the exponent bias yet // because the original 0 to 1 alpha value is needed for alpha to coverage, diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc index 2695b22d9..4bc882bbc 100644 --- a/src/xenia/gpu/render_target_cache.cc +++ b/src/xenia/gpu/render_target_cache.cc @@ -207,6 +207,134 @@ DEFINE_bool( namespace xe { namespace gpu { +void RenderTargetCache::GetPSIColorFormatInfo( + xenos::ColorRenderTargetFormat format, uint32_t write_mask, + float& clamp_rgb_low, float& clamp_alpha_low, float& clamp_rgb_high, + float& clamp_alpha_high, uint32_t& keep_mask_low, + uint32_t& keep_mask_high) { + keep_mask_low = keep_mask_high = 0; + switch (format) { + case xenos::ColorRenderTargetFormat::k_8_8_8_8: + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { + clamp_rgb_low = clamp_alpha_low = 0.0f; + clamp_rgb_high = clamp_alpha_high = 1.0f; + for (uint32_t i = 0; i < 4; ++i) { + if (!(write_mask & (1 << i))) { + keep_mask_low |= uint32_t(0xFF) << (i * 8); + } + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { + clamp_rgb_low = clamp_alpha_low = 0.0f; + clamp_rgb_high = clamp_alpha_high = 1.0f; + for (uint32_t i = 0; i < 3; ++i) { + if (!(write_mask & (1 << i))) { + keep_mask_low |= uint32_t(0x3FF) << (i * 10); + } + } + if (!(write_mask & 0b1000)) { + keep_mask_low |= uint32_t(3) << 30; + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: { + clamp_rgb_low = clamp_alpha_low = 0.0f; + clamp_rgb_high = 31.875f; + clamp_alpha_high = 1.0f; + for (uint32_t i = 0; i < 3; ++i) { + if (!(write_mask & (1 << i))) { + keep_mask_low |= uint32_t(0x3FF) << (i * 10); + } + } + if (!(write_mask & 0b1000)) { + keep_mask_low |= uint32_t(3) << 30; + } + } break; + case xenos::ColorRenderTargetFormat::k_16_16: + case 
xenos::ColorRenderTargetFormat::k_16_16_16_16: + // Alpha clamping affects blending source, so it's non-zero for alpha for + // k_16_16 (the render target is fixed-point). There's one deviation from + // how Direct3D 11.3 functional specification defines SNorm conversion + // (NaN should be 0, not the lowest negative number), and that needs to be + // handled separately. + clamp_rgb_low = clamp_alpha_low = -32.0f; + clamp_rgb_high = clamp_alpha_high = 32.0f; + if (!(write_mask & 0b0001)) { + keep_mask_low |= 0xFFFFu; + } + if (!(write_mask & 0b0010)) { + keep_mask_low |= 0xFFFF0000u; + } + if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16) { + if (!(write_mask & 0b0100)) { + keep_mask_high |= 0xFFFFu; + } + if (!(write_mask & 0b1000)) { + keep_mask_high |= 0xFFFF0000u; + } + } else { + write_mask &= 0b0011; + } + break; + case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: + case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT: + // No NaNs on the Xbox 360 GPU, though can't use the extended range with + // Direct3D and Vulkan conversions. + // TODO(Triang3l): Use the extended-range encoding in all implementations. + clamp_rgb_low = clamp_alpha_low = -65504.0f; + clamp_rgb_high = clamp_alpha_high = 65504.0f; + if (!(write_mask & 0b0001)) { + keep_mask_low |= 0xFFFFu; + } + if (!(write_mask & 0b0010)) { + keep_mask_low |= 0xFFFF0000u; + } + if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT) { + if (!(write_mask & 0b0100)) { + keep_mask_high |= 0xFFFFu; + } + if (!(write_mask & 0b1000)) { + keep_mask_high |= 0xFFFF0000u; + } + } else { + write_mask &= 0b0011; + } + break; + case xenos::ColorRenderTargetFormat::k_32_FLOAT: + // No clamping - let min/max always pick the original value. 
+ clamp_rgb_low = clamp_alpha_low = clamp_rgb_high = clamp_alpha_high = + std::nanf(""); + write_mask &= 0b0001; + if (!(write_mask & 0b0001)) { + keep_mask_low = ~uint32_t(0); + } + break; + case xenos::ColorRenderTargetFormat::k_32_32_FLOAT: + // No clamping - let min/max always pick the original value. + clamp_rgb_low = clamp_alpha_low = clamp_rgb_high = clamp_alpha_high = + std::nanf(""); + write_mask &= 0b0011; + if (!(write_mask & 0b0001)) { + keep_mask_low = ~uint32_t(0); + } + if (!(write_mask & 0b0010)) { + keep_mask_high = ~uint32_t(0); + } + break; + default: + assert_unhandled_case(format); + // Disable invalid render targets. + write_mask = 0; + break; + } + // Special case handled in the shaders for empty write mask to completely skip + // a disabled render target: all keep bits are set. + if (!write_mask) { + keep_mask_low = keep_mask_high = ~uint32_t(0); + } +} + uint32_t RenderTargetCache::Transfer::GetRangeRectangles( uint32_t start_tiles, uint32_t end_tiles, uint32_t base_tiles, uint32_t pitch_tiles, xenos::MsaaSamples msaa_samples, bool is_64bpp, diff --git a/src/xenia/gpu/render_target_cache.h b/src/xenia/gpu/render_target_cache.h index 84cce18fd..5353176ed 100644 --- a/src/xenia/gpu/render_target_cache.h +++ b/src/xenia/gpu/render_target_cache.h @@ -113,6 +113,54 @@ class RenderTargetCache { kSrgbToLinearExponent); } + // Pixel shader interlock implementation helpers. + + // Appended to the format in the format constant via bitwise OR. + enum : uint32_t { + kPSIColorFormatFlag_64bpp_Shift = xenos::kColorRenderTargetFormatBits, + // Requires clamping of blending sources and factors. 
+ kPSIColorFormatFlag_FixedPointColor_Shift, + kPSIColorFormatFlag_FixedPointAlpha_Shift, + + kPSIColorFormatFlag_64bpp = uint32_t(1) << kPSIColorFormatFlag_64bpp_Shift, + kPSIColorFormatFlag_FixedPointColor = + uint32_t(1) << kPSIColorFormatFlag_FixedPointColor_Shift, + kPSIColorFormatFlag_FixedPointAlpha = + uint32_t(1) << kPSIColorFormatFlag_FixedPointAlpha_Shift, + }; + + static constexpr uint32_t AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat format) { + uint32_t format_flags = uint32_t(format); + if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16 || + format == xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT || + format == xenos::ColorRenderTargetFormat::k_32_32_FLOAT) { + format_flags |= kPSIColorFormatFlag_64bpp; + } + if (format == xenos::ColorRenderTargetFormat::k_8_8_8_8 || + format == xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA || + format == xenos::ColorRenderTargetFormat::k_2_10_10_10 || + format == xenos::ColorRenderTargetFormat::k_16_16 || + format == xenos::ColorRenderTargetFormat::k_16_16_16_16 || + format == xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10) { + format_flags |= kPSIColorFormatFlag_FixedPointColor | + kPSIColorFormatFlag_FixedPointAlpha; + } else if (format == xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT || + format == xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16) { + format_flags |= kPSIColorFormatFlag_FixedPointAlpha; + } + return format_flags; + } + + static void GetPSIColorFormatInfo(xenos::ColorRenderTargetFormat format, + uint32_t write_mask, float& clamp_rgb_low, + float& clamp_alpha_low, + float& clamp_rgb_high, + float& clamp_alpha_high, + uint32_t& keep_mask_low, + uint32_t& keep_mask_high); + virtual ~RenderTargetCache(); virtual Path GetPath() const = 0; diff --git a/src/xenia/gpu/shader_compiler_main.cc b/src/xenia/gpu/shader_compiler_main.cc index ec2e20184..4fdcec736 100644 --- a/src/xenia/gpu/shader_compiler_main.cc +++ 
b/src/xenia/gpu/shader_compiler_main.cc @@ -54,9 +54,11 @@ DEFINE_string( "GPU"); DEFINE_bool(shader_output_bindless_resources, false, "Output host shader with bindless resources used.", "GPU"); -DEFINE_bool(shader_output_dxbc_rov, false, - "Output ROV-based output-merger code in DXBC pixel shaders.", - "GPU"); +DEFINE_bool( + shader_output_pixel_shader_interlock, false, + "Output host shader with a render backend implementation based on pixel " + "shader interlock.", + "GPU"); namespace xe { namespace gpu { @@ -124,12 +126,15 @@ int shader_compiler_main(const std::vector& args) { SpirvShaderTranslator::Features spirv_features(true); if (cvars::shader_output_type == "spirv" || cvars::shader_output_type == "spirvtext") { - translator = std::make_unique(spirv_features); + translator = std::make_unique( + spirv_features, true, true, + cvars::shader_output_pixel_shader_interlock); } else if (cvars::shader_output_type == "dxbc" || cvars::shader_output_type == "dxbctext") { translator = std::make_unique( ui::GraphicsProvider::GpuVendorID(0), - cvars::shader_output_bindless_resources, cvars::shader_output_dxbc_rov); + cvars::shader_output_bindless_resources, + cvars::shader_output_pixel_shader_interlock); } else { // Just output microcode disassembly generated during microcode information // gathering. diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index bb89e0d41..eb31e13b9 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -21,6 +21,7 @@ #include "third_party/glslang/SPIRV/GLSL.std.450.h" #include "xenia/base/assert.h" #include "xenia/base/math.h" +#include "xenia/base/string_buffer.h" #include "xenia/gpu/spirv_shader.h" namespace xe { @@ -31,6 +32,8 @@ SpirvShaderTranslator::Features::Features(bool all) max_storage_buffer_range(all ? 
UINT32_MAX : (128 * 1024 * 1024)), clip_distance(all), cull_distance(all), + demote_to_helper_invocation(all), + fragment_shader_sample_interlock(all), full_draw_index_uint32(all), image_view_format_swizzle(all), signed_zero_inf_nan_preserve_float32(all), @@ -42,6 +45,14 @@ SpirvShaderTranslator::Features::Features( provider.device_properties().limits.maxStorageBufferRange), clip_distance(provider.device_features().shaderClipDistance), cull_distance(provider.device_features().shaderCullDistance), + demote_to_helper_invocation( + provider.device_extensions().ext_shader_demote_to_helper_invocation && + provider.device_shader_demote_to_helper_invocation_features() + .shaderDemoteToHelperInvocation), + fragment_shader_sample_interlock( + provider.device_extensions().ext_fragment_shader_interlock && + provider.device_fragment_shader_interlock_features() + .fragmentShaderSampleInterlock), full_draw_index_uint32(provider.device_features().fullDrawIndexUint32) { uint32_t device_version = provider.device_properties().apiVersion; const ui::vulkan::VulkanProvider::DeviceExtensions& device_extensions = @@ -78,9 +89,6 @@ SpirvShaderTranslator::Features::Features( } } -SpirvShaderTranslator::SpirvShaderTranslator(const Features& features) - : features_(features) {} - uint64_t SpirvShaderTranslator::GetDefaultVertexShaderModification( uint32_t dynamic_addressable_register_count, Shader::HostVertexShaderType host_vertex_shader_type) const { @@ -99,6 +107,19 @@ uint64_t SpirvShaderTranslator::GetDefaultPixelShaderModification( return shader_modification.value; } +std::vector SpirvShaderTranslator::CreateDepthOnlyFragmentShader() { + is_depth_only_fragment_shader_ = true; + // TODO(Triang3l): Handle in a nicer way (is_depth_only_fragment_shader_ is a + // leftover from when a Shader object wasn't used during translation). 
+ Shader shader(xenos::ShaderType::kPixel, 0, nullptr, 0); + StringBuffer instruction_disassembly_buffer; + shader.AnalyzeUcode(instruction_disassembly_buffer); + Shader::Translation& translation = *shader.GetOrCreateTranslation(0); + TranslateAnalyzedShader(translation); + is_depth_only_fragment_shader_ = false; + return translation.translated_binary(); +} + void SpirvShaderTranslator::Reset() { ShaderTranslator::Reset(); @@ -109,6 +130,7 @@ void SpirvShaderTranslator::Reset() { input_point_coordinates_ = spv::NoResult; input_fragment_coordinates_ = spv::NoResult; input_front_facing_ = spv::NoResult; + input_sample_mask_ = spv::NoResult; std::fill(input_output_interpolators_.begin(), input_output_interpolators_.end(), spv::NoResult); output_point_coordinates_ = spv::NoResult; @@ -120,6 +142,8 @@ void SpirvShaderTranslator::Reset() { main_interface_.clear(); var_main_registers_ = spv::NoResult; var_main_point_size_edge_flag_kill_vertex_ = spv::NoResult; + var_main_kill_pixel_ = spv::NoResult; + var_main_fsi_color_written_ = spv::NoResult; main_switch_op_.reset(); main_switch_next_pc_phi_operands_.clear(); @@ -217,6 +241,10 @@ void SpirvShaderTranslator::StartTranslation() { size_t offset; spv::Id type; }; + spv::Id type_float4_array_4 = builder_->makeArrayType( + type_float4_, builder_->makeUintConstant(4), sizeof(float) * 4); + builder_->addDecoration(type_float4_array_4, spv::DecorationArrayStride, + sizeof(float) * 4); spv::Id type_uint4_array_2 = builder_->makeArrayType( type_uint4_, builder_->makeUintConstant(2), sizeof(uint32_t) * 4); builder_->addDecoration(type_uint4_array_2, spv::DecorationArrayStride, @@ -250,8 +278,37 @@ void SpirvShaderTranslator::StartTranslation() { type_uint4_array_4}, {"alpha_test_reference", offsetof(SystemConstants, alpha_test_reference), type_float_}, + {"edram_32bpp_tile_pitch_dwords_scaled", + offsetof(SystemConstants, edram_32bpp_tile_pitch_dwords_scaled), + type_uint_}, + {"edram_depth_base_dwords_scaled", + 
offsetof(SystemConstants, edram_depth_base_dwords_scaled), type_uint_}, {"color_exp_bias", offsetof(SystemConstants, color_exp_bias), type_float4_}, + {"edram_poly_offset_front_scale", + offsetof(SystemConstants, edram_poly_offset_front_scale), type_float_}, + {"edram_poly_offset_back_scale", + offsetof(SystemConstants, edram_poly_offset_back_scale), type_float_}, + {"edram_poly_offset_front_offset", + offsetof(SystemConstants, edram_poly_offset_front_offset), type_float_}, + {"edram_poly_offset_back_offset", + offsetof(SystemConstants, edram_poly_offset_back_offset), type_float_}, + {"edram_stencil_front", offsetof(SystemConstants, edram_stencil_front), + type_uint2_}, + {"edram_stencil_back", offsetof(SystemConstants, edram_stencil_back), + type_uint2_}, + {"edram_rt_base_dwords_scaled", + offsetof(SystemConstants, edram_rt_base_dwords_scaled), type_uint4_}, + {"edram_rt_format_flags", + offsetof(SystemConstants, edram_rt_format_flags), type_uint4_}, + {"edram_rt_blend_factors_ops", + offsetof(SystemConstants, edram_rt_blend_factors_ops), type_uint4_}, + {"edram_rt_keep_mask", offsetof(SystemConstants, edram_rt_keep_mask), + type_uint4_array_2}, + {"edram_rt_clamp", offsetof(SystemConstants, edram_rt_clamp), + type_float4_array_4}, + {"edram_blend_constant", offsetof(SystemConstants, edram_blend_constant), + type_float4_}, }; id_vector_temp_.clear(); id_vector_temp_.reserve(xe::countof(system_constants)); @@ -281,139 +338,145 @@ void SpirvShaderTranslator::StartTranslation() { main_interface_.push_back(uniform_system_constants_); } - // Common uniform buffer - float constants. - uint32_t float_constant_count = - current_shader().constant_register_map().float_count; - if (float_constant_count) { + if (!is_depth_only_fragment_shader_) { + // Common uniform buffer - float constants. 
+ uint32_t float_constant_count = + current_shader().constant_register_map().float_count; + if (float_constant_count) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeArrayType( + type_float4_, builder_->makeUintConstant(float_constant_count), + sizeof(float) * 4)); + // Currently (as of October 24, 2020) makeArrayType only uses the stride + // to check if deduplication can be done - the array stride decoration + // needs to be applied explicitly. + builder_->addDecoration(id_vector_temp_.back(), + spv::DecorationArrayStride, sizeof(float) * 4); + spv::Id type_float_constants = + builder_->makeStructType(id_vector_temp_, "XeFloatConstants"); + builder_->addMemberName(type_float_constants, 0, "float_constants"); + builder_->addMemberDecoration(type_float_constants, 0, + spv::DecorationOffset, 0); + builder_->addDecoration(type_float_constants, spv::DecorationBlock); + uniform_float_constants_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassUniform, type_float_constants, + "xe_uniform_float_constants"); + builder_->addDecoration(uniform_float_constants_, + spv::DecorationDescriptorSet, + int(kDescriptorSetConstants)); + builder_->addDecoration( + uniform_float_constants_, spv::DecorationBinding, + int(is_pixel_shader() ? kConstantBufferFloatPixel + : kConstantBufferFloatVertex)); + if (features_.spirv_version >= spv::Spv_1_4) { + main_interface_.push_back(uniform_float_constants_); + } + } + + // Common uniform buffer - bool and loop constants. + // Uniform buffers must have std140 packing, so using arrays of 4-component + // vectors instead of scalar arrays because the latter would have padding to + // 16 bytes in each element. id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + // 256 bool constants. 
id_vector_temp_.push_back(builder_->makeArrayType( - type_float4_, builder_->makeUintConstant(float_constant_count), - sizeof(float) * 4)); - // Currently (as of October 24, 2020) makeArrayType only uses the stride to - // check if deduplication can be done - the array stride decoration needs to - // be applied explicitly. + type_uint4_, builder_->makeUintConstant(2), sizeof(uint32_t) * 4)); builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, - sizeof(float) * 4); - spv::Id type_float_constants = - builder_->makeStructType(id_vector_temp_, "XeFloatConstants"); - builder_->addMemberName(type_float_constants, 0, "float_constants"); - builder_->addMemberDecoration(type_float_constants, 0, + sizeof(uint32_t) * 4); + // 32 loop constants. + id_vector_temp_.push_back(builder_->makeArrayType( + type_uint4_, builder_->makeUintConstant(8), sizeof(uint32_t) * 4)); + builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, + sizeof(uint32_t) * 4); + spv::Id type_bool_loop_constants = + builder_->makeStructType(id_vector_temp_, "XeBoolLoopConstants"); + builder_->addMemberName(type_bool_loop_constants, 0, "bool_constants"); + builder_->addMemberDecoration(type_bool_loop_constants, 0, spv::DecorationOffset, 0); - builder_->addDecoration(type_float_constants, spv::DecorationBlock); - uniform_float_constants_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassUniform, type_float_constants, - "xe_uniform_float_constants"); - builder_->addDecoration(uniform_float_constants_, + builder_->addMemberName(type_bool_loop_constants, 1, "loop_constants"); + builder_->addMemberDecoration(type_bool_loop_constants, 1, + spv::DecorationOffset, sizeof(uint32_t) * 8); + builder_->addDecoration(type_bool_loop_constants, spv::DecorationBlock); + uniform_bool_loop_constants_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassUniform, type_bool_loop_constants, + "xe_uniform_bool_loop_constants"); + 
builder_->addDecoration(uniform_bool_loop_constants_, spv::DecorationDescriptorSet, int(kDescriptorSetConstants)); - builder_->addDecoration( - uniform_float_constants_, spv::DecorationBinding, - int(is_pixel_shader() ? kConstantBufferFloatPixel - : kConstantBufferFloatVertex)); + builder_->addDecoration(uniform_bool_loop_constants_, + spv::DecorationBinding, + int(kConstantBufferBoolLoop)); if (features_.spirv_version >= spv::Spv_1_4) { - main_interface_.push_back(uniform_float_constants_); + main_interface_.push_back(uniform_bool_loop_constants_); } - } - // Common uniform buffer - bool and loop constants. - // Uniform buffers must have std140 packing, so using arrays of 4-component - // vectors instead of scalar arrays because the latter would have padding to - // 16 bytes in each element. - id_vector_temp_.clear(); - id_vector_temp_.reserve(2); - // 256 bool constants. - id_vector_temp_.push_back(builder_->makeArrayType( - type_uint4_, builder_->makeUintConstant(2), sizeof(uint32_t) * 4)); - builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, - sizeof(uint32_t) * 4); - // 32 loop constants. 
- id_vector_temp_.push_back(builder_->makeArrayType( - type_uint4_, builder_->makeUintConstant(8), sizeof(uint32_t) * 4)); - builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, - sizeof(uint32_t) * 4); - spv::Id type_bool_loop_constants = - builder_->makeStructType(id_vector_temp_, "XeBoolLoopConstants"); - builder_->addMemberName(type_bool_loop_constants, 0, "bool_constants"); - builder_->addMemberDecoration(type_bool_loop_constants, 0, - spv::DecorationOffset, 0); - builder_->addMemberName(type_bool_loop_constants, 1, "loop_constants"); - builder_->addMemberDecoration(type_bool_loop_constants, 1, - spv::DecorationOffset, sizeof(uint32_t) * 8); - builder_->addDecoration(type_bool_loop_constants, spv::DecorationBlock); - uniform_bool_loop_constants_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassUniform, type_bool_loop_constants, - "xe_uniform_bool_loop_constants"); - builder_->addDecoration(uniform_bool_loop_constants_, - spv::DecorationDescriptorSet, - int(kDescriptorSetConstants)); - builder_->addDecoration(uniform_bool_loop_constants_, spv::DecorationBinding, - int(kConstantBufferBoolLoop)); - if (features_.spirv_version >= spv::Spv_1_4) { - main_interface_.push_back(uniform_bool_loop_constants_); - } + // Common uniform buffer - fetch constants (32 x 6 uints packed in std140 as + // 4-component vectors). 
+ id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeArrayType( + type_uint4_, builder_->makeUintConstant(32 * 6 / 4), + sizeof(uint32_t) * 4)); + builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, + sizeof(uint32_t) * 4); + spv::Id type_fetch_constants = + builder_->makeStructType(id_vector_temp_, "XeFetchConstants"); + builder_->addMemberName(type_fetch_constants, 0, "fetch_constants"); + builder_->addMemberDecoration(type_fetch_constants, 0, + spv::DecorationOffset, 0); + builder_->addDecoration(type_fetch_constants, spv::DecorationBlock); + uniform_fetch_constants_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassUniform, type_fetch_constants, + "xe_uniform_fetch_constants"); + builder_->addDecoration(uniform_fetch_constants_, + spv::DecorationDescriptorSet, + int(kDescriptorSetConstants)); + builder_->addDecoration(uniform_fetch_constants_, spv::DecorationBinding, + int(kConstantBufferFetch)); + if (features_.spirv_version >= spv::Spv_1_4) { + main_interface_.push_back(uniform_fetch_constants_); + } - // Common uniform buffer - fetch constants (32 x 6 uints packed in std140 as - // 4-component vectors). 
- id_vector_temp_.clear(); - id_vector_temp_.push_back(builder_->makeArrayType( - type_uint4_, builder_->makeUintConstant(32 * 6 / 4), - sizeof(uint32_t) * 4)); - builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, - sizeof(uint32_t) * 4); - spv::Id type_fetch_constants = - builder_->makeStructType(id_vector_temp_, "XeFetchConstants"); - builder_->addMemberName(type_fetch_constants, 0, "fetch_constants"); - builder_->addMemberDecoration(type_fetch_constants, 0, spv::DecorationOffset, - 0); - builder_->addDecoration(type_fetch_constants, spv::DecorationBlock); - uniform_fetch_constants_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassUniform, type_fetch_constants, - "xe_uniform_fetch_constants"); - builder_->addDecoration(uniform_fetch_constants_, - spv::DecorationDescriptorSet, - int(kDescriptorSetConstants)); - builder_->addDecoration(uniform_fetch_constants_, spv::DecorationBinding, - int(kConstantBufferFetch)); - if (features_.spirv_version >= spv::Spv_1_4) { - main_interface_.push_back(uniform_fetch_constants_); - } - - // Common storage buffers - shared memory uint[], each 128 MB or larger, - // depending on what's possible on the device. - id_vector_temp_.clear(); - id_vector_temp_.push_back(builder_->makeRuntimeArray(type_uint_)); - // Storage buffers have std430 packing, no padding to 4-component vectors. - builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, - sizeof(uint32_t)); - spv::Id type_shared_memory = - builder_->makeStructType(id_vector_temp_, "XeSharedMemory"); - builder_->addMemberName(type_shared_memory, 0, "shared_memory"); - // TODO(Triang3l): Make writable when memexport is implemented. - builder_->addMemberDecoration(type_shared_memory, 0, - spv::DecorationNonWritable); - builder_->addMemberDecoration(type_shared_memory, 0, spv::DecorationOffset, - 0); - builder_->addDecoration(type_shared_memory, - features_.spirv_version >= spv::Spv_1_3 - ? 
spv::DecorationBlock - : spv::DecorationBufferBlock); - unsigned int shared_memory_binding_count = - 1 << GetSharedMemoryStorageBufferCountLog2(); - if (shared_memory_binding_count > 1) { - type_shared_memory = builder_->makeArrayType( - type_shared_memory, - builder_->makeUintConstant(shared_memory_binding_count), 0); - } - buffers_shared_memory_ = builder_->createVariable( - spv::NoPrecision, - features_.spirv_version >= spv::Spv_1_3 ? spv::StorageClassStorageBuffer - : spv::StorageClassUniform, - type_shared_memory, "xe_shared_memory"); - builder_->addDecoration(buffers_shared_memory_, spv::DecorationDescriptorSet, - int(kDescriptorSetSharedMemoryAndEdram)); - builder_->addDecoration(buffers_shared_memory_, spv::DecorationBinding, 0); - if (features_.spirv_version >= spv::Spv_1_4) { - main_interface_.push_back(buffers_shared_memory_); + // Common storage buffers - shared memory uint[], each 128 MB or larger, + // depending on what's possible on the device. + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeRuntimeArray(type_uint_)); + // Storage buffers have std430 packing, no padding to 4-component vectors. + builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, + sizeof(uint32_t)); + spv::Id type_shared_memory = + builder_->makeStructType(id_vector_temp_, "XeSharedMemory"); + builder_->addMemberName(type_shared_memory, 0, "shared_memory"); + builder_->addMemberDecoration(type_shared_memory, 0, + spv::DecorationRestrict); + // TODO(Triang3l): Make writable when memexport is implemented. + builder_->addMemberDecoration(type_shared_memory, 0, + spv::DecorationNonWritable); + builder_->addMemberDecoration(type_shared_memory, 0, spv::DecorationOffset, + 0); + builder_->addDecoration(type_shared_memory, + features_.spirv_version >= spv::Spv_1_3 + ? 
spv::DecorationBlock + : spv::DecorationBufferBlock); + unsigned int shared_memory_binding_count = + 1 << GetSharedMemoryStorageBufferCountLog2(); + if (shared_memory_binding_count > 1) { + type_shared_memory = builder_->makeArrayType( + type_shared_memory, + builder_->makeUintConstant(shared_memory_binding_count), 0); + } + buffers_shared_memory_ = builder_->createVariable( + spv::NoPrecision, + features_.spirv_version >= spv::Spv_1_3 ? spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + type_shared_memory, "xe_shared_memory"); + builder_->addDecoration(buffers_shared_memory_, + spv::DecorationDescriptorSet, + int(kDescriptorSetSharedMemoryAndEdram)); + builder_->addDecoration(buffers_shared_memory_, spv::DecorationBinding, 0); + if (features_.spirv_version >= spv::Spv_1_4) { + main_interface_.push_back(buffers_shared_memory_); + } } if (is_vertex_shader()) { @@ -438,41 +501,43 @@ void SpirvShaderTranslator::StartTranslation() { uniform_system_constants_, id_vector_temp_), spv::NoPrecision); - // Begin ucode translation. Initialize everything, even without defined - // defaults, for safety. 
- var_main_predicate_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_bool_, - "xe_var_predicate", builder_->makeBoolConstant(false)); - var_main_loop_count_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_uint4_, - "xe_var_loop_count", const_uint4_0_); - var_main_address_register_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_int_, - "xe_var_address_register", const_int_0_); - var_main_loop_address_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_int4_, - "xe_var_loop_address", const_int4_0_); - var_main_previous_scalar_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_float_, - "xe_var_previous_scalar", const_float_0_); - var_main_vfetch_address_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_int_, - "xe_var_vfetch_address", const_int_0_); - var_main_tfetch_lod_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_float_, - "xe_var_tfetch_lod", const_float_0_); - var_main_tfetch_gradients_h_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_float3_, - "xe_var_tfetch_gradients_h", const_float3_0_); - var_main_tfetch_gradients_v_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_float3_, - "xe_var_tfetch_gradients_v", const_float3_0_); - if (register_count()) { - spv::Id type_register_array = builder_->makeArrayType( - type_float4_, builder_->makeUintConstant(register_count()), 0); - var_main_registers_ = - builder_->createVariable(spv::NoPrecision, spv::StorageClassFunction, - type_register_array, "xe_var_registers"); + if (!is_depth_only_fragment_shader_) { + // Begin ucode translation. Initialize everything, even without defined + // defaults, for safety. 
+ var_main_predicate_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_bool_, + "xe_var_predicate", builder_->makeBoolConstant(false)); + var_main_loop_count_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_uint4_, + "xe_var_loop_count", const_uint4_0_); + var_main_address_register_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_int_, + "xe_var_address_register", const_int_0_); + var_main_loop_address_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_int4_, + "xe_var_loop_address", const_int4_0_); + var_main_previous_scalar_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float_, + "xe_var_previous_scalar", const_float_0_); + var_main_vfetch_address_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_int_, + "xe_var_vfetch_address", const_int_0_); + var_main_tfetch_lod_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float_, + "xe_var_tfetch_lod", const_float_0_); + var_main_tfetch_gradients_h_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float3_, + "xe_var_tfetch_gradients_h", const_float3_0_); + var_main_tfetch_gradients_v_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float3_, + "xe_var_tfetch_gradients_v", const_float3_0_); + if (register_count()) { + spv::Id type_register_array = builder_->makeArrayType( + type_float4_, builder_->makeUintConstant(register_count()), 0); + var_main_registers_ = + builder_->createVariable(spv::NoPrecision, spv::StorageClassFunction, + type_register_array, "xe_var_registers"); + } } // Write the execution model-specific prologue with access to variables in the @@ -483,6 +548,10 @@ void SpirvShaderTranslator::StartTranslation() { StartFragmentShaderInMain(); } + if (is_depth_only_fragment_shader_) { + return; + } + // Open the main loop. 
spv::Block& main_loop_pre_header = *builder_->getBuildPoint(); main_loop_header_ = &builder_->makeNewBlock(); @@ -551,57 +620,62 @@ void SpirvShaderTranslator::StartTranslation() { } std::vector SpirvShaderTranslator::CompleteTranslation() { - // Close flow control within the last switch case. - CloseExecConditionals(); - bool has_main_switch = !current_shader().label_addresses().empty(); - // After the final exec (if it happened to be not exece, which would already - // have a break branch), break from the switch if it exists, or from the - // loop it doesn't. - if (!builder_->getBuildPoint()->isTerminated()) { - builder_->createBranch(has_main_switch ? main_switch_merge_ - : main_loop_merge_); - } - if (has_main_switch) { - // Insert the switch instruction with all cases added as operands. - builder_->setBuildPoint(main_switch_header_); - builder_->getBuildPoint()->addInstruction(std::move(main_switch_op_)); - // Build the main switch merge, breaking out of the loop after falling - // through the end or breaking from exece (only continuing if a jump - from - // a guest loop or from jmp/call - was made). - function_main_->addBlock(main_switch_merge_); - builder_->setBuildPoint(main_switch_merge_); - builder_->createBranch(main_loop_merge_); - } - - // Main loop continuation - choose the program counter based on the path - // taken (-1 if not from a jump as a safe fallback, which would result in not - // hitting any switch case and reaching the final break in the body). - function_main_->addBlock(main_loop_continue_); - builder_->setBuildPoint(main_loop_continue_); - if (has_main_switch) { - // OpPhi, if added, must be the first in the block. - // If labels were added, but not jumps (for example, due to the call - // instruction not being implemented as of October 18, 2020), send an - // impossible program counter value (-1) to the OpPhi at the next iteration. 
- if (main_switch_next_pc_phi_operands_.empty()) { - main_switch_next_pc_phi_operands_.push_back( - builder_->makeIntConstant(-1)); + if (!is_depth_only_fragment_shader_) { + // Close flow control within the last switch case. + CloseExecConditionals(); + bool has_main_switch = !current_shader().label_addresses().empty(); + // After the final exec (if it happened to be not exece, which would already + // have a break branch), break from the switch if it exists, or from the + // loop it doesn't. + if (!builder_->getBuildPoint()->isTerminated()) { + builder_->createBranch(has_main_switch ? main_switch_merge_ + : main_loop_merge_); } - std::unique_ptr main_loop_pc_next_op = - std::make_unique( - main_loop_pc_next_, type_int_, - main_switch_next_pc_phi_operands_.size() >= 2 ? spv::OpPhi - : spv::OpCopyObject); - for (spv::Id operand : main_switch_next_pc_phi_operands_) { - main_loop_pc_next_op->addIdOperand(operand); + if (has_main_switch) { + // Insert the switch instruction with all cases added as operands. + builder_->setBuildPoint(main_switch_header_); + builder_->getBuildPoint()->addInstruction(std::move(main_switch_op_)); + // Build the main switch merge, breaking out of the loop after falling + // through the end or breaking from exece (only continuing if a jump - + // from a guest loop or from jmp/call - was made). + function_main_->addBlock(main_switch_merge_); + builder_->setBuildPoint(main_switch_merge_); + builder_->createBranch(main_loop_merge_); } - builder_->getBuildPoint()->addInstruction(std::move(main_loop_pc_next_op)); - } - builder_->createBranch(main_loop_header_); - // Add the main loop merge block and go back to the function. - function_main_->addBlock(main_loop_merge_); - builder_->setBuildPoint(main_loop_merge_); + // Main loop continuation - choose the program counter based on the path + // taken (-1 if not from a jump as a safe fallback, which would result in + // not hitting any switch case and reaching the final break in the body). 
+ function_main_->addBlock(main_loop_continue_); + builder_->setBuildPoint(main_loop_continue_); + if (has_main_switch) { + // OpPhi, if added, must be the first in the block. + // If labels were added, but not jumps (for example, due to the call + // instruction not being implemented as of October 18, 2020), send an + // impossible program counter value (-1) to the OpPhi at the next + // iteration. + if (main_switch_next_pc_phi_operands_.empty()) { + main_switch_next_pc_phi_operands_.push_back( + builder_->makeIntConstant(-1)); + } + std::unique_ptr main_loop_pc_next_op = + std::make_unique( + main_loop_pc_next_, type_int_, + main_switch_next_pc_phi_operands_.size() >= 2 + ? spv::OpPhi + : spv::OpCopyObject); + for (spv::Id operand : main_switch_next_pc_phi_operands_) { + main_loop_pc_next_op->addIdOperand(operand); + } + builder_->getBuildPoint()->addInstruction( + std::move(main_loop_pc_next_op)); + } + builder_->createBranch(main_loop_header_); + + // Add the main loop merge block and go back to the function. + function_main_->addBlock(main_loop_merge_); + builder_->setBuildPoint(main_loop_merge_); + } if (is_vertex_shader()) { CompleteVertexOrTessEvalShaderInMain(); @@ -622,6 +696,20 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { builder_->addExecutionMode(function_main_, spv::ExecutionModeEarlyFragmentTests); } + if (edram_fragment_shader_interlock_) { + // Accessing per-sample values, so interlocking just when there's common + // coverage is enough if the device exposes that. 
+ if (features_.fragment_shader_sample_interlock) { + builder_->addCapability( + spv::CapabilityFragmentShaderSampleInterlockEXT); + builder_->addExecutionMode(function_main_, + spv::ExecutionModeSampleInterlockOrderedEXT); + } else { + builder_->addCapability(spv::CapabilityFragmentShaderPixelInterlockEXT); + builder_->addExecutionMode(function_main_, + spv::ExecutionModePixelInterlockOrderedEXT); + } + } } else { assert_true(is_vertex_shader()); execution_model = IsSpirvTessEvalShader() @@ -649,14 +737,17 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { entry_point->addIdOperand(interface_id); } - // Specify the binding indices for samplers when the number of textures is - // known, as samplers are located after images in the texture descriptor set. - size_t texture_binding_count = texture_bindings_.size(); - size_t sampler_binding_count = sampler_bindings_.size(); - for (size_t i = 0; i < sampler_binding_count; ++i) { - builder_->addDecoration(sampler_bindings_[i].variable, - spv::DecorationBinding, - int(texture_binding_count + i)); + if (!is_depth_only_fragment_shader_) { + // Specify the binding indices for samplers when the number of textures is + // known, as samplers are located after images in the texture descriptor + // set. + size_t texture_binding_count = texture_bindings_.size(); + size_t sampler_binding_count = sampler_bindings_.size(); + for (size_t i = 0; i < sampler_binding_count; ++i) { + builder_->addDecoration(sampler_bindings_[i].variable, + spv::DecorationBinding, + int(texture_binding_count + i)); + } } // TODO(Triang3l): Avoid copy? @@ -1682,49 +1773,83 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() { void SpirvShaderTranslator::StartFragmentShaderBeforeMain() { Modification shader_modification = GetSpirvShaderModification(); - uint32_t input_location = 0; + if (edram_fragment_shader_interlock_) { + builder_->addExtension("SPV_EXT_fragment_shader_interlock"); - // Interpolator inputs. 
- { - uint32_t interpolators_remaining = GetModificationInterpolatorMask(); - uint32_t interpolator_index; - while (xe::bit_scan_forward(interpolators_remaining, &interpolator_index)) { - interpolators_remaining &= ~(UINT32_C(1) << interpolator_index); - spv::Id interpolator = builder_->createVariable( - spv::NoPrecision, spv::StorageClassInput, type_float4_, - fmt::format("xe_in_interpolator_{}", interpolator_index).c_str()); - input_output_interpolators_[interpolator_index] = interpolator; - builder_->addDecoration(interpolator, spv::DecorationLocation, - int(input_location)); - if (shader_modification.pixel.interpolators_centroid & - (UINT32_C(1) << interpolator_index)) { - builder_->addDecoration(interpolator, spv::DecorationCentroid); + // EDRAM buffer uint[]. + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeRuntimeArray(type_uint_)); + // Storage buffers have std430 packing, no padding to 4-component vectors. + builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, + sizeof(uint32_t)); + spv::Id type_edram = builder_->makeStructType(id_vector_temp_, "XeEdram"); + builder_->addMemberName(type_edram, 0, "edram"); + builder_->addMemberDecoration(type_edram, 0, spv::DecorationCoherent); + builder_->addMemberDecoration(type_edram, 0, spv::DecorationRestrict); + builder_->addMemberDecoration(type_edram, 0, spv::DecorationOffset, 0); + builder_->addDecoration(type_edram, features_.spirv_version >= spv::Spv_1_3 + ? spv::DecorationBlock + : spv::DecorationBufferBlock); + buffer_edram_ = builder_->createVariable( + spv::NoPrecision, + features_.spirv_version >= spv::Spv_1_3 ? 
spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + type_edram, "xe_edram"); + builder_->addDecoration(buffer_edram_, spv::DecorationDescriptorSet, + int(kDescriptorSetSharedMemoryAndEdram)); + builder_->addDecoration(buffer_edram_, spv::DecorationBinding, 1); + if (features_.spirv_version >= spv::Spv_1_4) { + main_interface_.push_back(buffer_edram_); + } + } + + bool param_gen_needed = !is_depth_only_fragment_shader_ && + GetPsParamGenInterpolator() != UINT32_MAX; + + if (!is_depth_only_fragment_shader_) { + uint32_t input_location = 0; + + // Interpolator inputs. + { + uint32_t interpolators_remaining = GetModificationInterpolatorMask(); + uint32_t interpolator_index; + while ( + xe::bit_scan_forward(interpolators_remaining, &interpolator_index)) { + interpolators_remaining &= ~(UINT32_C(1) << interpolator_index); + spv::Id interpolator = builder_->createVariable( + spv::NoPrecision, spv::StorageClassInput, type_float4_, + fmt::format("xe_in_interpolator_{}", interpolator_index).c_str()); + input_output_interpolators_[interpolator_index] = interpolator; + builder_->addDecoration(interpolator, spv::DecorationLocation, + int(input_location)); + if (shader_modification.pixel.interpolators_centroid & + (UINT32_C(1) << interpolator_index)) { + builder_->addDecoration(interpolator, spv::DecorationCentroid); + } + main_interface_.push_back(interpolator); + ++input_location; + } + } + + // Point coordinate input. + if (shader_modification.pixel.param_gen_point) { + if (param_gen_needed) { + input_point_coordinates_ = + builder_->createVariable(spv::NoPrecision, spv::StorageClassInput, + type_float2_, "xe_in_point_coordinates"); + builder_->addDecoration(input_point_coordinates_, + spv::DecorationLocation, int(input_location)); + main_interface_.push_back(input_point_coordinates_); } - main_interface_.push_back(interpolator); ++input_location; } } - bool param_gen_needed = GetPsParamGenInterpolator() != UINT32_MAX; - - // Point coordinate input. 
- if (shader_modification.pixel.param_gen_point) { - if (param_gen_needed) { - input_point_coordinates_ = - builder_->createVariable(spv::NoPrecision, spv::StorageClassInput, - type_float2_, "xe_in_point_coordinates"); - builder_->addDecoration(input_point_coordinates_, spv::DecorationLocation, - int(input_location)); - main_interface_.push_back(input_point_coordinates_); - } - ++input_location; - } - // Fragment coordinates. - // TODO(Triang3l): More conditions - fragment shader interlock render backend, - // alpha to coverage (if RT 0 is written, and there's no early depth / - // stencil), depth writing in the fragment shader (per-sample if supported). - if (param_gen_needed) { + // TODO(Triang3l): More conditions - alpha to coverage (if RT 0 is written, + // and there's no early depth / stencil), depth writing in the fragment shader + // (per-sample if supported). + if (edram_fragment_shader_interlock_ || param_gen_needed) { input_fragment_coordinates_ = builder_->createVariable( spv::NoPrecision, spv::StorageClassInput, type_float4_, "gl_FragCoord"); builder_->addDecoration(input_fragment_coordinates_, spv::DecorationBuiltIn, @@ -1733,9 +1858,9 @@ void SpirvShaderTranslator::StartFragmentShaderBeforeMain() { } // Is front facing. - // TODO(Triang3l): Needed for stencil in the fragment shader interlock render - // backend. - if (param_gen_needed && !GetSpirvShaderModification().pixel.param_gen_point) { + if (edram_fragment_shader_interlock_ || + (param_gen_needed && + !GetSpirvShaderModification().pixel.param_gen_point)) { input_front_facing_ = builder_->createVariable( spv::NoPrecision, spv::StorageClassInput, type_bool_, "gl_FrontFacing"); builder_->addDecoration(input_front_facing_, spv::DecorationBuiltIn, @@ -1743,33 +1868,165 @@ void SpirvShaderTranslator::StartFragmentShaderBeforeMain() { main_interface_.push_back(input_front_facing_); } - // Framebuffer attachment outputs. 
- std::fill(output_fragment_data_.begin(), output_fragment_data_.end(), - spv::NoResult); - static const char* const kFragmentDataNames[] = { - "xe_out_fragment_data_0", - "xe_out_fragment_data_1", - "xe_out_fragment_data_2", - "xe_out_fragment_data_3", - }; - uint32_t color_targets_remaining = current_shader().writes_color_targets(); - uint32_t color_target_index; - while (xe::bit_scan_forward(color_targets_remaining, &color_target_index)) { - color_targets_remaining &= ~(UINT32_C(1) << color_target_index); - spv::Id output_fragment_data_rt = builder_->createVariable( - spv::NoPrecision, spv::StorageClassOutput, type_float4_, - kFragmentDataNames[color_target_index]); - output_fragment_data_[color_target_index] = output_fragment_data_rt; - builder_->addDecoration(output_fragment_data_rt, spv::DecorationLocation, - int(color_target_index)); - // Make invariant as pixel shaders may be used for various precise - // computations. - builder_->addDecoration(output_fragment_data_rt, spv::DecorationInvariant); - main_interface_.push_back(output_fragment_data_rt); + // Sample mask input. + if (edram_fragment_shader_interlock_) { + // SampleMask depends on SampleRateShading in some SPIR-V revisions. + builder_->addCapability(spv::CapabilitySampleRateShading); + input_sample_mask_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassInput, + builder_->makeArrayType(type_int_, builder_->makeUintConstant(1), 0), + "gl_SampleMaskIn"); + builder_->addDecoration(input_sample_mask_, spv::DecorationFlat); + builder_->addDecoration(input_sample_mask_, spv::DecorationBuiltIn, + spv::BuiltInSampleMask); + main_interface_.push_back(input_sample_mask_); + } + + if (!is_depth_only_fragment_shader_) { + // Framebuffer color attachment outputs. 
+ if (!edram_fragment_shader_interlock_) { + std::fill(output_or_var_fragment_data_.begin(), + output_or_var_fragment_data_.end(), spv::NoResult); + static const char* const kFragmentDataOutputNames[] = { + "xe_out_fragment_data_0", + "xe_out_fragment_data_1", + "xe_out_fragment_data_2", + "xe_out_fragment_data_3", + }; + uint32_t color_targets_remaining = + current_shader().writes_color_targets(); + uint32_t color_target_index; + while ( + xe::bit_scan_forward(color_targets_remaining, &color_target_index)) { + color_targets_remaining &= ~(UINT32_C(1) << color_target_index); + spv::Id output_fragment_data_rt = builder_->createVariable( + spv::NoPrecision, spv::StorageClassOutput, type_float4_, + kFragmentDataOutputNames[color_target_index]); + output_or_var_fragment_data_[color_target_index] = + output_fragment_data_rt; + builder_->addDecoration(output_fragment_data_rt, + spv::DecorationLocation, + int(color_target_index)); + // Make invariant as pixel shaders may be used for various precise + // computations. + builder_->addDecoration(output_fragment_data_rt, + spv::DecorationInvariant); + main_interface_.push_back(output_fragment_data_rt); + } + } } } void SpirvShaderTranslator::StartFragmentShaderInMain() { + // Set up pixel killing from within the translated shader without affecting + // the control flow (unlike with OpKill), similarly to how pixel killing works + // on the Xenos, and also keeping a single critical section exit and return + // for safety across different Vulkan implementations with fragment shader + // interlock. + if (current_shader().kills_pixels()) { + if (features_.demote_to_helper_invocation) { + // TODO(Triang3l): Promoted to SPIR-V 1.6 - don't add the extension there. 
+ builder_->addExtension("SPV_EXT_demote_to_helper_invocation"); + builder_->addCapability(spv::CapabilityDemoteToHelperInvocationEXT); + } else { + var_main_kill_pixel_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_bool_, + "xe_var_kill_pixel", builder_->makeBoolConstant(false)); + } + // For killing with fragment shader interlock when demotion is supported, + // using OpIsHelperInvocationEXT to avoid allocating a variable in addition + // to the execution mask GPUs naturally have. + } + + if (edram_fragment_shader_interlock_) { + // Initialize color output variables with fragment shader interlock. + std::fill(output_or_var_fragment_data_.begin(), + output_or_var_fragment_data_.end(), spv::NoResult); + var_main_fsi_color_written_ = spv::NoResult; + uint32_t color_targets_written = current_shader().writes_color_targets(); + if (color_targets_written) { + static const char* const kFragmentDataVariableNames[] = { + "xe_var_fragment_data_0", + "xe_var_fragment_data_1", + "xe_var_fragment_data_2", + "xe_var_fragment_data_3", + }; + uint32_t color_targets_remaining = color_targets_written; + uint32_t color_target_index; + while ( + xe::bit_scan_forward(color_targets_remaining, &color_target_index)) { + color_targets_remaining &= ~(UINT32_C(1) << color_target_index); + output_or_var_fragment_data_[color_target_index] = + builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float4_, + kFragmentDataVariableNames[color_target_index], + const_float4_0_); + } + var_main_fsi_color_written_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_uint_, + "xe_var_fsi_color_written", const_uint_0_); + } + } + + if (edram_fragment_shader_interlock_ && FSI_IsDepthStencilEarly()) { + spv::Id msaa_samples = LoadMsaaSamplesFromFlags(); + FSI_LoadSampleMask(msaa_samples); + FSI_LoadEdramOffsets(msaa_samples); + builder_->createNoResultOp(spv::OpBeginInvocationInterlockEXT); + 
FSI_DepthStencilTest(msaa_samples, false); + if (!is_depth_only_fragment_shader_) { + // Skip the rest of the shader if the whole quad (due to derivatives) has + // failed the depth / stencil test, and there are no depth and stencil + // values to conditionally write after running the shader to check if + // samples don't additionally need to be discarded. + spv::Id quad_needs_execution = builder_->createBinOp( + spv::OpINotEqual, type_bool_, main_fsi_sample_mask_, const_uint_0_); + // TODO(Triang3l): Use GroupNonUniformQuad operations where supported. + // If none of the pixels in the quad passed the depth / stencil test, the + // value of (any samples covered ? 1.0f : 0.0f) for the current pixel will + // be 0.0f, and since it will be 0.0f in other pixels too, the derivatives + // will be zero as well. + builder_->addCapability(spv::CapabilityDerivativeControl); + // Query the horizontally adjacent pixel. + quad_needs_execution = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, quad_needs_execution, + builder_->createBinOp( + spv::OpFOrdNotEqual, type_bool_, + builder_->createUnaryOp( + spv::OpDPdxFine, type_float_, + builder_->createTriOp(spv::OpSelect, type_float_, + quad_needs_execution, const_float_1_, + const_float_0_)), + const_float_0_)); + // Query the vertically adjacent pair of pixels. 
+ quad_needs_execution = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, quad_needs_execution, + builder_->createBinOp( + spv::OpFOrdNotEqual, type_bool_, + builder_->createUnaryOp( + spv::OpDPdyCoarse, type_float_, + builder_->createTriOp(spv::OpSelect, type_float_, + quad_needs_execution, const_float_1_, + const_float_0_)), + const_float_0_)); + spv::Block& main_fsi_early_depth_stencil_execute_quad = + builder_->makeNewBlock(); + main_fsi_early_depth_stencil_execute_quad_merge_ = + &builder_->makeNewBlock(); + SpirvCreateSelectionMerge( + main_fsi_early_depth_stencil_execute_quad_merge_->getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch( + quad_needs_execution, &main_fsi_early_depth_stencil_execute_quad, + main_fsi_early_depth_stencil_execute_quad_merge_); + builder_->setBuildPoint(&main_fsi_early_depth_stencil_execute_quad); + } + } + + if (is_depth_only_fragment_shader_) { + return; + } + uint32_t param_gen_interpolator = GetPsParamGenInterpolator(); // Zero general-purpose registers to prevent crashes when the game @@ -1928,11 +2185,13 @@ void SpirvShaderTranslator::StartFragmentShaderInMain() { var_main_registers_, id_vector_temp_)); } - // Initialize the colors for safety. - for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) { - spv::Id output_fragment_data_rt = output_fragment_data_[i]; - if (output_fragment_data_rt != spv::NoResult) { - builder_->createStore(const_float4_0_, output_fragment_data_rt); + if (!edram_fragment_shader_interlock_) { + // Initialize the colors for safety. 
+ for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) { + spv::Id output_fragment_data_rt = output_or_var_fragment_data_[i]; + if (output_fragment_data_rt != spv::NoResult) { + builder_->createStore(const_float4_0_, output_fragment_data_rt); + } } } } @@ -2299,11 +2558,18 @@ void SpirvShaderTranslator::StoreResult(const InstructionResult& result, assert_true(is_pixel_shader()); assert_not_zero(used_write_mask); assert_true(current_shader().writes_color_target(result.storage_index)); - target_pointer = output_fragment_data_[result.storage_index]; - // May be spv::NoResult if the color output is explicitly removed due to - // an empty write mask without independent blending. - // TODO(Triang3l): Store the alpha of the first output in this case for - // alpha test and alpha to coverage. + target_pointer = output_or_var_fragment_data_[result.storage_index]; + if (edram_fragment_shader_interlock_) { + assert_true(var_main_fsi_color_written_ != spv::NoResult); + builder_->createStore( + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createLoad(var_main_fsi_color_written_, + spv::NoPrecision), + builder_->makeUintConstant(uint32_t(1) + << result.storage_index)), + var_main_fsi_color_written_); + } } break; default: // TODO(Triang3l): All storage targets. 
diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 3bcd342a3..d453aa329 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -96,6 +96,9 @@ class SpirvShaderTranslator : public ShaderTranslator { kSysFlag_WNotReciprocal_Shift, kSysFlag_PrimitivePolygonal_Shift, kSysFlag_PrimitiveLine_Shift, + kSysFlag_MsaaSamples_Shift, + kSysFlag_DepthFloat24_Shift = + kSysFlag_MsaaSamples_Shift + xenos::kMsaaSamplesBits, kSysFlag_AlphaPassIfLess_Shift, kSysFlag_AlphaPassIfEqual_Shift, kSysFlag_AlphaPassIfGreater_Shift, @@ -104,6 +107,26 @@ class SpirvShaderTranslator : public ShaderTranslator { kSysFlag_ConvertColor2ToGamma_Shift, kSysFlag_ConvertColor3ToGamma_Shift, + kSysFlag_FSIDepthStencil_Shift, + kSysFlag_FSIDepthPassIfLess_Shift, + kSysFlag_FSIDepthPassIfEqual_Shift, + kSysFlag_FSIDepthPassIfGreater_Shift, + // 1 to write new depth to the depth buffer, 0 to keep the old one if the + // depth test passes. + kSysFlag_FSIDepthWrite_Shift, + kSysFlag_FSIStencilTest_Shift, + // If the depth / stencil test has failed, but resulted in a stencil value + // that is different than the one currently in the depth buffer, write it + // anyway and don't run the rest of the shader (to check if the sample may + // be discarded some way) - use when alpha test and alpha to coverage are + // disabled. Ignored by the shader if not applicable to it (like if it has + // kill instructions or writes the depth output). + // TODO(Triang3l): Investigate replacement with an alpha-to-mask flag, + // checking `(flags & (alpha test | alpha to mask)) == (always | disabled)`, + // taking into account the potential relation with occlusion queries (but + // should be safe at least temporarily). 
+ kSysFlag_FSIDepthStencilEarlyWrite_Shift, + kSysFlag_Count, // For HostVertexShaderType kVertex, if fullDrawIndexUint32 is not @@ -127,6 +150,7 @@ class SpirvShaderTranslator : public ShaderTranslator { kSysFlag_WNotReciprocal = 1u << kSysFlag_WNotReciprocal_Shift, kSysFlag_PrimitivePolygonal = 1u << kSysFlag_PrimitivePolygonal_Shift, kSysFlag_PrimitiveLine = 1u << kSysFlag_PrimitiveLine_Shift, + kSysFlag_DepthFloat24 = 1u << kSysFlag_DepthFloat24_Shift, kSysFlag_AlphaPassIfLess = 1u << kSysFlag_AlphaPassIfLess_Shift, kSysFlag_AlphaPassIfEqual = 1u << kSysFlag_AlphaPassIfEqual_Shift, kSysFlag_AlphaPassIfGreater = 1u << kSysFlag_AlphaPassIfGreater_Shift, @@ -134,6 +158,14 @@ class SpirvShaderTranslator : public ShaderTranslator { kSysFlag_ConvertColor1ToGamma = 1u << kSysFlag_ConvertColor1ToGamma_Shift, kSysFlag_ConvertColor2ToGamma = 1u << kSysFlag_ConvertColor2ToGamma_Shift, kSysFlag_ConvertColor3ToGamma = 1u << kSysFlag_ConvertColor3ToGamma_Shift, + kSysFlag_FSIDepthStencil = 1u << kSysFlag_FSIDepthStencil_Shift, + kSysFlag_FSIDepthPassIfLess = 1u << kSysFlag_FSIDepthPassIfLess_Shift, + kSysFlag_FSIDepthPassIfEqual = 1u << kSysFlag_FSIDepthPassIfEqual_Shift, + kSysFlag_FSIDepthPassIfGreater = 1u << kSysFlag_FSIDepthPassIfGreater_Shift, + kSysFlag_FSIDepthWrite = 1u << kSysFlag_FSIDepthWrite_Shift, + kSysFlag_FSIStencilTest = 1u << kSysFlag_FSIStencilTest_Shift, + kSysFlag_FSIDepthStencilEarlyWrite = + 1u << kSysFlag_FSIDepthStencilEarlyWrite_Shift, }; static_assert(kSysFlag_Count <= 32, "Too many flags in the system constants"); @@ -171,9 +203,55 @@ class SpirvShaderTranslator : public ShaderTranslator { uint32_t texture_swizzles[16]; float alpha_test_reference; - float padding_alpha_test_reference[3]; + uint32_t edram_32bpp_tile_pitch_dwords_scaled; + uint32_t edram_depth_base_dwords_scaled; + float padding_edram_depth_base_dwords_scaled; float color_exp_bias[4]; + + float edram_poly_offset_front_scale; + float edram_poly_offset_back_scale; + float 
edram_poly_offset_front_offset; + float edram_poly_offset_back_offset; + + union { + struct { + uint32_t edram_stencil_front_reference_masks; + uint32_t edram_stencil_front_func_ops; + + uint32_t edram_stencil_back_reference_masks; + uint32_t edram_stencil_back_func_ops; + }; + struct { + uint32_t edram_stencil_front[2]; + uint32_t edram_stencil_back[2]; + }; + }; + + uint32_t edram_rt_base_dwords_scaled[4]; + + // RT format combined with RenderTargetCache::kPSIColorFormatFlag values + // (pass via RenderTargetCache::AddPSIColorFormatFlags). + uint32_t edram_rt_format_flags[4]; + + // Render target blending options - RB_BLENDCONTROL, with only the relevant + // options (factors and operations - AND 0x1FFF1FFF). If 0x00010001 + // (1 * src + 0 * dst), blending is disabled for the render target. + uint32_t edram_rt_blend_factors_ops[4]; + + // Format info - mask to apply to the old packed RT data, and to apply as + // inverted to the new packed data, before storing (more or less the inverse + // of the write mask packed like render target channels). This can be used + // to bypass unpacking if blending is not used. If 0 and not blending, + // reading the old data from the EDRAM buffer is not required. + uint32_t edram_rt_keep_mask[4][2]; + + // Format info - values to clamp the color to before blending or storing. + // Low color, low alpha, high color, high alpha. + float edram_rt_clamp[4][4]; + + // The constant blend factor for the respective modes. 
+ float edram_blend_constant[4]; }; enum ConstantBuffer : uint32_t { @@ -248,12 +326,22 @@ class SpirvShaderTranslator : public ShaderTranslator { uint32_t max_storage_buffer_range; bool clip_distance; bool cull_distance; + bool demote_to_helper_invocation; + bool fragment_shader_sample_interlock; bool full_draw_index_uint32; bool image_view_format_swizzle; bool signed_zero_inf_nan_preserve_float32; bool denorm_flush_to_zero_float32; }; - SpirvShaderTranslator(const Features& features); + + SpirvShaderTranslator(const Features& features, + bool native_2x_msaa_with_attachments, + bool native_2x_msaa_no_attachments, + bool edram_fragment_shader_interlock) + : features_(features), + native_2x_msaa_with_attachments_(native_2x_msaa_with_attachments), + native_2x_msaa_no_attachments_(native_2x_msaa_no_attachments), + edram_fragment_shader_interlock_(edram_fragment_shader_interlock) {} uint64_t GetDefaultVertexShaderModification( uint32_t dynamic_addressable_register_count, @@ -277,6 +365,10 @@ class SpirvShaderTranslator : public ShaderTranslator { features_.max_storage_buffer_range); } + // Creates a special fragment shader without color outputs - this resets the + // state of the translator. + std::vector CreateDepthOnlyFragmentShader(); + // Common functions useful not only for the translator, but also for EDRAM // emulation via conventional render targets. @@ -385,10 +477,10 @@ class SpirvShaderTranslator : public ShaderTranslator { } bool IsExecutionModeEarlyFragmentTests() const { - // TODO(Triang3l): Not applicable to fragment shader interlock. 
return is_pixel_shader() && GetSpirvShaderModification().pixel.depth_stencil_mode == Modification::DepthStencilMode::kEarlyHint && + !edram_fragment_shader_interlock_ && current_shader().implicit_early_z_write_allowed(); } @@ -528,7 +620,72 @@ class SpirvShaderTranslator : public ShaderTranslator { spv::Id image_unsigned, spv::Id image_signed, spv::Id sampler, spv::Id is_all_signed); + spv::Id LoadMsaaSamplesFromFlags(); + // Whether it's possible and worth skipping running the translated shader for + // 2x2 quads. + bool FSI_IsDepthStencilEarly() const { + assert_true(edram_fragment_shader_interlock_); + return !is_depth_only_fragment_shader_ && + !current_shader().writes_depth() && + !current_shader().is_valid_memexport_used(); + } + void FSI_LoadSampleMask(spv::Id msaa_samples); + void FSI_LoadEdramOffsets(spv::Id msaa_samples); + // The address must be a signed int. Whether the render target is 64bpp, if + // present at all, must be a bool (if it's NoResult, 32bpp will be assumed). + spv::Id FSI_AddSampleOffset(spv::Id sample_0_address, uint32_t sample_index, + spv::Id is_64bpp = spv::NoResult); + // Updates main_fsi_sample_mask_. Must be called outside non-uniform control + // flow because of taking derivatives of the fragment depth. + void FSI_DepthStencilTest(spv::Id msaa_samples, + bool sample_mask_potentially_narrowed_previouly); + // Returns the first and the second 32 bits as two uints. + std::array FSI_ClampAndPackColor(spv::Id color_float4, + spv::Id format_with_flags); + std::array FSI_UnpackColor(std::array color_packed, + spv::Id format_with_flags); + // The bounds must have the same number of components as the color or alpha. 
+ spv::Id FSI_FlushNaNClampAndInBlending(spv::Id color_or_alpha, + spv::Id is_fixed_point, + spv::Id min_value, spv::Id max_value); + spv::Id FSI_ApplyColorBlendFactor(spv::Id value, spv::Id is_fixed_point, + spv::Id clamp_min_value, + spv::Id clamp_max_value, spv::Id factor, + spv::Id source_color, spv::Id source_alpha, + spv::Id dest_color, spv::Id dest_alpha, + spv::Id constant_color, + spv::Id constant_alpha); + spv::Id FSI_ApplyAlphaBlendFactor(spv::Id value, spv::Id is_fixed_point, + spv::Id clamp_min_value, + spv::Id clamp_max_value, spv::Id factor, + spv::Id source_alpha, spv::Id dest_alpha, + spv::Id constant_alpha); + // If source_color_clamped, dest_color, constant_color_clamped are + // spv::NoResult, will blend the alpha. Otherwise, will blend the color. + // The result will be unclamped (color packing is supposed to clamp it). + spv::Id FSI_BlendColorOrAlphaWithUnclampedResult( + spv::Id is_fixed_point, spv::Id clamp_min_value, spv::Id clamp_max_value, + spv::Id source_color_clamped, spv::Id source_alpha_clamped, + spv::Id dest_color, spv::Id dest_alpha, spv::Id constant_color_clamped, + spv::Id constant_alpha_clamped, spv::Id equation, spv::Id source_factor, + spv::Id dest_factor); + Features features_; + bool native_2x_msaa_with_attachments_; + bool native_2x_msaa_no_attachments_; + + // For safety with different drivers (even though fragment shader interlock in + // SPIR-V only has one control flow requirement - that both begin and end must + // be dynamically executed exactly once in this order), adhering to the more + // strict control flow limitations of OpenGL (GLSL) fragment shader interlock, + // that begin and end are called only on the outermost level of the control + // flow of the main function, and that there are no returns before either + // (there's a single return from the shader). 
+ bool edram_fragment_shader_interlock_; + + // Is currently writing the empty depth-only pixel shader, such as for depth + // and stencil testing with fragment shader interlock. + bool is_depth_only_fragment_shader_ = false; std::unique_ptr builder_; @@ -621,7 +778,23 @@ class SpirvShaderTranslator : public ShaderTranslator { kSystemConstantTextureSwizzledSigns, kSystemConstantTextureSwizzles, kSystemConstantAlphaTestReference, + kSystemConstantEdram32bppTilePitchDwordsScaled, + kSystemConstantEdramDepthBaseDwordsScaled, kSystemConstantColorExpBias, + kSystemConstantEdramPolyOffsetFrontScale, + kSystemConstantEdramPolyOffsetBackScale, + kSystemConstantEdramPolyOffsetFrontOffset, + kSystemConstantEdramPolyOffsetBackOffset, + kSystemConstantEdramStencilFront, + kSystemConstantEdramStencilBack, + kSystemConstantEdramRTBaseDwordsScaled, + kSystemConstantEdramRTFormatFlags, + kSystemConstantEdramRTBlendFactorsOps, + // Accessed as float4[2], not float2[4], due to std140 array stride + // alignment. + kSystemConstantEdramRTKeepMask, + kSystemConstantEdramRTClamp, + kSystemConstantEdramBlendConstant, }; spv::Id uniform_system_constants_; spv::Id uniform_float_constants_; @@ -629,6 +802,7 @@ class SpirvShaderTranslator : public ShaderTranslator { spv::Id uniform_fetch_constants_; spv::Id buffers_shared_memory_; + spv::Id buffer_edram_; // Not using combined images and samplers because // maxPerStageDescriptorSamplers is often lower than @@ -647,6 +821,8 @@ class SpirvShaderTranslator : public ShaderTranslator { spv::Id input_fragment_coordinates_; // PS, only when needed - bool. spv::Id input_front_facing_; + // PS, only when needed - int[1]. + spv::Id input_sample_mask_; // VS output or PS input, only the ones that are needed (spv::NoResult for the // unneeded interpolators), indexed by the guest interpolator index - float4. 
@@ -671,7 +847,10 @@ class SpirvShaderTranslator : public ShaderTranslator { }; spv::Id output_per_vertex_; - std::array output_fragment_data_; + // With fragment shader interlock, variables in the main function. + // Otherwise, framebuffer color attachment outputs. + std::array + output_or_var_fragment_data_; std::vector main_interface_; spv::Function* function_main_; @@ -698,6 +877,40 @@ class SpirvShaderTranslator : public ShaderTranslator { spv::Id var_main_registers_; // VS only - float3 (special exports). spv::Id var_main_point_size_edge_flag_kill_vertex_; + // PS, only when needed - bool. + spv::Id var_main_kill_pixel_; + // PS, only when writing to color render targets with fragment shader + // interlock - uint. + // Whether color buffers have been written to, if not written on the taken + // execution path, don't export according to Direct3D 9 register documentation + // (some games rely on this behavior). + spv::Id var_main_fsi_color_written_; + // Loaded by FSI_LoadSampleMask. + // Can be modified on the outermost control flow level in the main function. + // 0:3 - Per-sample coverage at the current stage of the shader's execution. + // Affected by things like gl_SampleMaskIn, early or late depth / + // stencil (always resets bits for failing, no matter if need to defer + // writing), alpha to coverage. + // 4:7 - Depth write deferred mask - when early depth / stencil resulted in a + // different value for the sample (like different stencil if the test + // failed), but can't write it before running the shader because it's + // not known if the sample will be discarded by the shader, alphatest or + // AtoC. + // Early depth / stencil rejection of the pixel is possible when both 0:3 and + // 4:7 are zero. + spv::Id main_fsi_sample_mask_; + // Loaded by FSI_LoadEdramOffsets. + // Including the depth render target base. + spv::Id main_fsi_address_depth_; + // Not including the render target base. 
+ spv::Id main_fsi_offset_32bpp_; + spv::Id main_fsi_offset_64bpp_; + // Loaded by FSI_DepthStencilTest for early depth / stencil, the depth / + // stencil values to write at the end of the shader if the specified in + // main_fsi_sample_mask_ and if the samples were not discarded later after the + // early test. + std::array main_fsi_late_write_depth_stencil_; + spv::Block* main_fsi_early_depth_stencil_execute_quad_merge_; spv::Block* main_loop_header_; spv::Block* main_loop_continue_; spv::Block* main_loop_merge_; diff --git a/src/xenia/gpu/spirv_shader_translator_alu.cc b/src/xenia/gpu/spirv_shader_translator_alu.cc index 9dfbccb09..7188258e9 100644 --- a/src/xenia/gpu/spirv_shader_translator_alu.cc +++ b/src/xenia/gpu/spirv_shader_translator_alu.cc @@ -123,7 +123,7 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation( : spv::NoType; // In case the paired scalar instruction (if processed first) terminates the - // block (like via OpKill). + // block. EnsureBuildPointAvailable(); // Lookup table for variants of instructions with similar structure. @@ -838,9 +838,15 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation( SpirvCreateSelectionMerge(merge_block.getId()); builder_->createConditionalBranch(condition, &kill_block, &merge_block); builder_->setBuildPoint(&kill_block); - // TODO(Triang3l): Demote to helper invocation to keep derivatives if - // needed (and return 1 if killed in this case). - builder_->createNoResultOp(spv::OpKill); + // Kill without influencing the control flow in the translated shader. 
+ if (var_main_kill_pixel_ != spv::NoResult) { + builder_->createStore(builder_->makeBoolConstant(true), + var_main_kill_pixel_); + } + if (features_.demote_to_helper_invocation) { + builder_->createNoResultOp(spv::OpDemoteToHelperInvocationEXT); + } + builder_->createBranch(&merge_block); builder_->setBuildPoint(&merge_block); return const_float_0_; } @@ -938,7 +944,7 @@ spv::Id SpirvShaderTranslator::ProcessScalarAluOperation( } // In case the paired vector instruction (if processed first) terminates the - // block (like via OpKill). + // block. EnsureBuildPointAvailable(); // Lookup table for variants of instructions with similar structure. @@ -1393,9 +1399,15 @@ spv::Id SpirvShaderTranslator::ProcessScalarAluOperation( SpirvCreateSelectionMerge(merge_block.getId()); builder_->createConditionalBranch(condition, &kill_block, &merge_block); builder_->setBuildPoint(&kill_block); - // TODO(Triang3l): Demote to helper invocation to keep derivatives if - // needed (and return 1 if killed in this case). - builder_->createNoResultOp(spv::OpKill); + // Kill without influencing the control flow in the translated shader. + if (var_main_kill_pixel_ != spv::NoResult) { + builder_->createStore(builder_->makeBoolConstant(true), + var_main_kill_pixel_); + } + if (features_.demote_to_helper_invocation) { + builder_->createNoResultOp(spv::OpDemoteToHelperInvocationEXT); + } + builder_->createBranch(&merge_block); builder_->setBuildPoint(&merge_block); return const_float_0_; } diff --git a/src/xenia/gpu/spirv_shader_translator_fetch.cc b/src/xenia/gpu/spirv_shader_translator_fetch.cc index 88d3bd5ab..c9655c64f 100644 --- a/src/xenia/gpu/spirv_shader_translator_fetch.cc +++ b/src/xenia/gpu/spirv_shader_translator_fetch.cc @@ -1898,30 +1898,14 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( builder_->setBuildPoint(&block_dimension_stacked_start); if (use_computed_lod) { // Extract 2D gradients for stacked textures which are 2D arrays. 
- { - std::unique_ptr shuffle_op = - std::make_unique(builder_->getUniqueId(), - type_float2_, - spv::OpVectorShuffle); - shuffle_op->addIdOperand(gradients_h); - shuffle_op->addIdOperand(gradients_h); - shuffle_op->addImmediateOperand(0); - shuffle_op->addImmediateOperand(1); - texture_parameters.gradX = shuffle_op->getResultId(); - builder_->getBuildPoint()->addInstruction(std::move(shuffle_op)); - } - { - std::unique_ptr shuffle_op = - std::make_unique(builder_->getUniqueId(), - type_float2_, - spv::OpVectorShuffle); - shuffle_op->addIdOperand(gradients_v); - shuffle_op->addIdOperand(gradients_v); - shuffle_op->addImmediateOperand(0); - shuffle_op->addImmediateOperand(1); - texture_parameters.gradY = shuffle_op->getResultId(); - builder_->getBuildPoint()->addInstruction(std::move(shuffle_op)); - } + uint_vector_temp_.clear(); + uint_vector_temp_.reserve(2); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + texture_parameters.gradX = builder_->createRvalueSwizzle( + spv::NoPrecision, type_float2_, gradients_h, uint_vector_temp_); + texture_parameters.gradY = builder_->createRvalueSwizzle( + spv::NoPrecision, type_float2_, gradients_v, uint_vector_temp_); } // Check if linear filtering is needed. 
bool vol_mag_filter_is_fetch_const = diff --git a/src/xenia/gpu/spirv_shader_translator_rb.cc b/src/xenia/gpu/spirv_shader_translator_rb.cc index c594a902f..f710ad7a5 100644 --- a/src/xenia/gpu/spirv_shader_translator_rb.cc +++ b/src/xenia/gpu/spirv_shader_translator_rb.cc @@ -16,6 +16,8 @@ #include "third_party/glslang/SPIRV/GLSL.std.450.h" #include "xenia/base/assert.h" #include "xenia/base/math.h" +#include "xenia/gpu/draw_util.h" +#include "xenia/gpu/render_target_cache.h" namespace xe { namespace gpu { @@ -426,15 +428,102 @@ spv::Id SpirvShaderTranslator::Depth20e4To32(spv::Builder& builder, } void SpirvShaderTranslator::CompleteFragmentShaderInMain() { - id_vector_temp_.clear(); - id_vector_temp_.push_back(builder_->makeIntConstant(kSystemConstantFlags)); - spv::Id system_constant_flags = builder_->createLoad( - builder_->createAccessChain(spv::StorageClassUniform, - uniform_system_constants_, id_vector_temp_), - spv::NoPrecision); + // Loaded if needed. + spv::Id msaa_samples = spv::NoResult; + + if (edram_fragment_shader_interlock_ && !FSI_IsDepthStencilEarly()) { + if (msaa_samples == spv::NoResult) { + msaa_samples = LoadMsaaSamplesFromFlags(); + } + // Load the sample mask, which may be modified later by killing from + // different sources, if not loaded already. + FSI_LoadSampleMask(msaa_samples); + } + + bool fsi_pixel_potentially_killed = false; + + if (current_shader().kills_pixels()) { + if (edram_fragment_shader_interlock_) { + fsi_pixel_potentially_killed = true; + if (!features_.demote_to_helper_invocation) { + assert_true(var_main_kill_pixel_ != spv::NoResult); + main_fsi_sample_mask_ = builder_->createTriOp( + spv::OpSelect, type_uint_, + builder_->createLoad(var_main_kill_pixel_, spv::NoPrecision), + const_uint_0_, main_fsi_sample_mask_); + } + } else { + if (!features_.demote_to_helper_invocation) { + // Kill the pixel once the guest control flow and derivatives are not + // needed anymore. 
+ assert_true(var_main_kill_pixel_ != spv::NoResult); + // Load the condition before the OpSelectionMerge, which must be the + // penultimate instruction. + spv::Id kill_pixel = + builder_->createLoad(var_main_kill_pixel_, spv::NoPrecision); + spv::Block& block_kill = builder_->makeNewBlock(); + spv::Block& block_kill_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_kill_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(kill_pixel, &block_kill, + &block_kill_merge); + builder_->setBuildPoint(&block_kill); + // TODO(Triang3l): Use OpTerminateInvocation when SPIR-V 1.6 is + // targeted. + builder_->createNoResultOp(spv::OpKill); + // OpKill terminates the block. + builder_->setBuildPoint(&block_kill_merge); + } + } + } + + uint32_t color_targets_written = current_shader().writes_color_targets(); + + if ((color_targets_written & 0b1) && !IsExecutionModeEarlyFragmentTests()) { + spv::Id fsi_sample_mask_in_rt_0_alpha_tests = spv::NoResult; + spv::Block* block_fsi_rt_0_alpha_tests_rt_written_head = nullptr; + spv::Block* block_fsi_rt_0_alpha_tests_rt_written_merge = nullptr; + builder_->makeNewBlock(); + if (edram_fragment_shader_interlock_) { + // Skip the alpha test and alpha to coverage if the render target 0 is not + // written to dynamically. 
+ fsi_sample_mask_in_rt_0_alpha_tests = main_fsi_sample_mask_; + spv::Id rt_0_written = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, + builder_->createLoad(var_main_fsi_color_written_, + spv::NoPrecision), + builder_->makeUintConstant(0b1)), + const_uint_0_); + block_fsi_rt_0_alpha_tests_rt_written_head = builder_->getBuildPoint(); + spv::Block& block_fsi_rt_0_alpha_tests_rt_written = + builder_->makeNewBlock(); + block_fsi_rt_0_alpha_tests_rt_written_merge = &builder_->makeNewBlock(); + SpirvCreateSelectionMerge( + block_fsi_rt_0_alpha_tests_rt_written_merge->getId(), + spv::SelectionControlDontFlattenMask); + { + std::unique_ptr rt_0_written_branch_conditional_op = + std::make_unique(spv::OpBranchConditional); + rt_0_written_branch_conditional_op->addIdOperand(rt_0_written); + rt_0_written_branch_conditional_op->addIdOperand( + block_fsi_rt_0_alpha_tests_rt_written.getId()); + rt_0_written_branch_conditional_op->addIdOperand( + block_fsi_rt_0_alpha_tests_rt_written_merge->getId()); + // More likely to write to the render target 0 than not. + rt_0_written_branch_conditional_op->addImmediateOperand(2); + rt_0_written_branch_conditional_op->addImmediateOperand(1); + builder_->getBuildPoint()->addInstruction( + std::move(rt_0_written_branch_conditional_op)); + } + block_fsi_rt_0_alpha_tests_rt_written.addPredecessor( + block_fsi_rt_0_alpha_tests_rt_written_head); + block_fsi_rt_0_alpha_tests_rt_written_merge->addPredecessor( + block_fsi_rt_0_alpha_tests_rt_written_head); + builder_->setBuildPoint(&block_fsi_rt_0_alpha_tests_rt_written); + } - if (current_shader().writes_color_target(0) && - !IsExecutionModeEarlyFragmentTests()) { // Alpha test. // TODO(Triang3l): Check how alpha test works with NaN on Direct3D 9. // Extract the comparison function (less, equal, greater bits). 
@@ -458,11 +547,12 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { { id_vector_temp_.clear(); id_vector_temp_.push_back(builder_->makeIntConstant(3)); - spv::Id alpha_test_alpha = - builder_->createLoad(builder_->createAccessChain( - spv::StorageClassOutput, - output_fragment_data_[0], id_vector_temp_), - spv::NoPrecision); + spv::Id alpha_test_alpha = builder_->createLoad( + builder_->createAccessChain( + edram_fragment_shader_interlock_ ? spv::StorageClassFunction + : spv::StorageClassOutput, + output_or_var_fragment_data_[0], id_vector_temp_), + spv::NoPrecision); id_vector_temp_.clear(); id_vector_temp_.push_back( builder_->makeIntConstant(kSystemConstantAlphaTestReference)); @@ -522,126 +612,3297 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { builder_->createBranch(&block_alpha_test_not_equal_merge); } builder_->setBuildPoint(&block_alpha_test_not_equal_merge); - spv::Id alpha_test_result; - { - std::unique_ptr alpha_test_result_phi_op = - std::make_unique(builder_->getUniqueId(), - type_bool_, spv::OpPhi); - alpha_test_result_phi_op->addIdOperand(alpha_test_result_not_equal); - alpha_test_result_phi_op->addIdOperand( - block_alpha_test_not_equal.getId()); - alpha_test_result_phi_op->addIdOperand(alpha_test_result_non_not_equal); - alpha_test_result_phi_op->addIdOperand( - block_alpha_test_non_not_equal.getId()); - alpha_test_result = alpha_test_result_phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(alpha_test_result_phi_op)); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(alpha_test_result_not_equal); + id_vector_temp_.push_back(block_alpha_test_not_equal.getId()); + id_vector_temp_.push_back(alpha_test_result_non_not_equal); + id_vector_temp_.push_back(block_alpha_test_non_not_equal.getId()); + spv::Id alpha_test_result = + builder_->createOp(spv::OpPhi, type_bool_, id_vector_temp_); + // Discard the pixel if the alpha test has failed. 
+ if (edram_fragment_shader_interlock_ && + !features_.demote_to_helper_invocation) { + fsi_pixel_potentially_killed = true; + fsi_sample_mask_in_rt_0_alpha_tests = builder_->createTriOp( + spv::OpSelect, type_uint_, alpha_test_result, + fsi_sample_mask_in_rt_0_alpha_tests, const_uint_0_); + } else { + // Creating a merge block even though it will contain just one OpBranch + // since SPIR-V requires structured control flow in shaders. + spv::Block& block_alpha_test_kill = builder_->makeNewBlock(); + spv::Block& block_alpha_test_kill_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_alpha_test_kill_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(alpha_test_result, + &block_alpha_test_kill_merge, + &block_alpha_test_kill); + builder_->setBuildPoint(&block_alpha_test_kill); + if (edram_fragment_shader_interlock_) { + assert_true(features_.demote_to_helper_invocation); + fsi_pixel_potentially_killed = true; + // TODO(Triang3l): Promoted to SPIR-V 1.6 - don't add the extension + // there. + builder_->addExtension("SPV_EXT_demote_to_helper_invocation"); + builder_->addCapability(spv::CapabilityDemoteToHelperInvocationEXT); + builder_->createNoResultOp(spv::OpDemoteToHelperInvocationEXT); + builder_->createBranch(&block_alpha_test_kill_merge); + } else { + // TODO(Triang3l): Use OpTerminateInvocation when SPIR-V 1.6 is + // targeted. + builder_->createNoResultOp(spv::OpKill); + // OpKill terminates the block. + } + builder_->setBuildPoint(&block_alpha_test_kill_merge); + builder_->createBranch(&block_alpha_test_merge); } - // Discard the pixel if the alpha test has failed. Creating a merge block - // even though it will contain just one OpBranch since SPIR-V requires - // structured control flow in shaders. 
- spv::Block& block_alpha_test_kill = builder_->makeNewBlock(); - spv::Block& block_alpha_test_kill_merge = builder_->makeNewBlock(); - SpirvCreateSelectionMerge(block_alpha_test_kill_merge.getId(), - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(alpha_test_result, - &block_alpha_test_kill_merge, - &block_alpha_test_kill); - builder_->setBuildPoint(&block_alpha_test_kill); - builder_->createNoResultOp(spv::OpKill); - // OpKill terminates the block. - builder_->setBuildPoint(&block_alpha_test_kill_merge); - builder_->createBranch(&block_alpha_test_merge); } builder_->setBuildPoint(&block_alpha_test_merge); + + // TODO(Triang3l): Alpha to coverage. + + if (edram_fragment_shader_interlock_) { + // Close the render target 0 written check. + builder_->createBranch(block_fsi_rt_0_alpha_tests_rt_written_merge); + spv::Block& block_fsi_rt_0_alpha_tests_rt_written_end = + *builder_->getBuildPoint(); + builder_->setBuildPoint(block_fsi_rt_0_alpha_tests_rt_written_merge); + if (!features_.demote_to_helper_invocation) { + // The tests might have modified the sample mask via + // fsi_sample_mask_in_rt_0_alpha_tests. 
+ id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(fsi_sample_mask_in_rt_0_alpha_tests); + id_vector_temp_.push_back( + block_fsi_rt_0_alpha_tests_rt_written_end.getId()); + id_vector_temp_.push_back(main_fsi_sample_mask_); + id_vector_temp_.push_back( + block_fsi_rt_0_alpha_tests_rt_written_head->getId()); + main_fsi_sample_mask_ = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + } + } } - uint32_t color_targets_remaining = current_shader().writes_color_targets(); - uint32_t color_target_index; - while (xe::bit_scan_forward(color_targets_remaining, &color_target_index)) { - color_targets_remaining &= ~(UINT32_C(1) << color_target_index); - spv::Id color_variable = output_fragment_data_[color_target_index]; - spv::Id color = builder_->createLoad(color_variable, spv::NoPrecision); + spv::Block* block_fsi_if_after_kill = nullptr; + spv::Block* block_fsi_if_after_kill_merge = nullptr; - // Apply the exponent bias after the alpha test and alpha to coverage - // because they need the unbiased alpha from the shader. + spv::Block* block_fsi_if_after_depth_stencil = nullptr; + spv::Block* block_fsi_if_after_depth_stencil_merge = nullptr; + + if (edram_fragment_shader_interlock_) { + if (fsi_pixel_potentially_killed) { + if (features_.demote_to_helper_invocation) { + // Don't do anything related to writing to the EDRAM if the pixel was + // killed. + id_vector_temp_.clear(); + // TODO(Triang3l): Use HelperInvocation volatile load on SPIR-V 1.6. + main_fsi_sample_mask_ = builder_->createTriOp( + spv::OpSelect, type_uint_, + builder_->createOp(spv::OpIsHelperInvocationEXT, type_bool_, + id_vector_temp_), + const_uint_0_, main_fsi_sample_mask_); + } + // Check the condition before the OpSelectionMerge, which must be the + // penultimate instruction in a block. 
+ spv::Id pixel_not_killed = builder_->createBinOp( + spv::OpINotEqual, type_bool_, main_fsi_sample_mask_, const_uint_0_); + block_fsi_if_after_kill = &builder_->makeNewBlock(); + block_fsi_if_after_kill_merge = &builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_fsi_if_after_kill_merge->getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(pixel_not_killed, + block_fsi_if_after_kill, + block_fsi_if_after_kill_merge); + builder_->setBuildPoint(block_fsi_if_after_kill); + } + + spv::Id color_write_depth_stencil_condition = spv::NoResult; + if (FSI_IsDepthStencilEarly()) { + // Perform late depth / stencil writes for samples not discarded. + for (uint32_t i = 0; i < 4; ++i) { + spv::Id sample_late_depth_stencil_write_needed = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_fsi_sample_mask_, + builder_->makeUintConstant(uint32_t(1) << (4 + i))), + const_uint_0_); + spv::Block& block_sample_late_depth_stencil_write = + builder_->makeNewBlock(); + spv::Block& block_sample_late_depth_stencil_write_merge = + builder_->makeNewBlock(); + SpirvCreateSelectionMerge( + block_sample_late_depth_stencil_write_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch( + sample_late_depth_stencil_write_needed, + &block_sample_late_depth_stencil_write, + &block_sample_late_depth_stencil_write_merge); + builder_->setBuildPoint(&block_sample_late_depth_stencil_write); + spv::Id depth_stencil_sample_address = + FSI_AddSampleOffset(main_fsi_address_depth_, i); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + // First SSBO structure element. + id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(depth_stencil_sample_address); + builder_->createStore( + main_fsi_late_write_depth_stencil_[i], + builder_->createAccessChain(features_.spirv_version >= spv::Spv_1_3 + ? 
spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_)); + builder_->createBranch(&block_sample_late_depth_stencil_write_merge); + builder_->setBuildPoint(&block_sample_late_depth_stencil_write_merge); + } + if (color_targets_written) { + // Only take the remaining coverage bits, not the late depth / stencil + // write bits, into account in the check whether anything needs to be + // done for the color targets. + color_write_depth_stencil_condition = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_fsi_sample_mask_, + builder_->makeUintConstant((uint32_t(1) << 4) - 1)), + const_uint_0_); + } + } else { + if (msaa_samples == spv::NoResult) { + msaa_samples = LoadMsaaSamplesFromFlags(); + } + FSI_LoadEdramOffsets(msaa_samples); + // Begin the critical section on the outermost control flow level so it's + // entered exactly once on any control flow path as required by the SPIR-V + // extension specification. + builder_->createNoResultOp(spv::OpBeginInvocationInterlockEXT); + // Do the depth / stencil test. + // The sample mask might have been made narrower than the initially loaded + // mask by various conditions that discard the whole pixel, as well as by + // alpha to coverage. + FSI_DepthStencilTest(msaa_samples, fsi_pixel_potentially_killed || + (color_targets_written & 0b1)); + if (color_targets_written) { + // Only bits 0:3 of main_fsi_sample_mask_ are written by the late + // depth / stencil test. + color_write_depth_stencil_condition = builder_->createBinOp( + spv::OpINotEqual, type_bool_, main_fsi_sample_mask_, const_uint_0_); + } + } + + if (color_write_depth_stencil_condition != spv::NoResult) { + // Skip all color operations if the pixel has failed the tests entirely. 
+ block_fsi_if_after_depth_stencil = &builder_->makeNewBlock(); + block_fsi_if_after_depth_stencil_merge = &builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_fsi_if_after_depth_stencil_merge->getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(color_write_depth_stencil_condition, + block_fsi_if_after_depth_stencil, + block_fsi_if_after_depth_stencil_merge); + builder_->setBuildPoint(block_fsi_if_after_depth_stencil); + } + } + + if (color_targets_written) { + spv::Id fsi_color_targets_written = spv::NoResult; + spv::Id fsi_const_int_1 = spv::NoResult; + spv::Id fsi_const_edram_size_dwords = spv::NoResult; + spv::Id fsi_samples_covered[4] = {}; + if (edram_fragment_shader_interlock_) { + fsi_color_targets_written = + builder_->createLoad(var_main_fsi_color_written_, spv::NoPrecision); + fsi_const_int_1 = builder_->makeIntConstant(1); + // TODO(Triang3l): Resolution scaling. + fsi_const_edram_size_dwords = builder_->makeUintConstant( + xenos::kEdramTileWidthSamples * xenos::kEdramTileHeightSamples * + xenos::kEdramTileCount); + for (uint32_t i = 0; i < 4; ++i) { + fsi_samples_covered[i] = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + main_fsi_sample_mask_, + builder_->makeUintConstant(uint32_t(1) << i)), + const_uint_0_); + } + } + uint32_t color_targets_remaining = color_targets_written; + uint32_t color_target_index; + while (xe::bit_scan_forward(color_targets_remaining, &color_target_index)) { + color_targets_remaining &= ~(UINT32_C(1) << color_target_index); + spv::Id color_variable = output_or_var_fragment_data_[color_target_index]; + spv::Id color = builder_->createLoad(color_variable, spv::NoPrecision); + + // Apply the exponent bias after the alpha test and alpha to coverage + // because they need the unbiased alpha from the shader. 
+ id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantColorExpBias)); + id_vector_temp_.push_back( + builder_->makeIntConstant(int32_t(color_target_index))); + color = builder_->createBinOp( + spv::OpVectorTimesScalar, type_float4_, color, + builder_->createLoad(builder_->createAccessChain( + spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision)); + builder_->addDecoration(color, spv::DecorationNoContraction); + + if (edram_fragment_shader_interlock_) { + // Write the color to the target in the EDRAM only it was written on the + // shader's execution path, according to the Direct3D 9 rules that games + // rely on. + spv::Id fsi_color_written = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, fsi_color_targets_written, + builder_->makeUintConstant(uint32_t(1) << color_target_index)), + const_uint_0_); + spv::Block& fsi_color_written_if_head = *builder_->getBuildPoint(); + spv::Block& fsi_color_written_if = builder_->makeNewBlock(); + spv::Block& fsi_color_written_if_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(fsi_color_written_if_merge.getId(), + spv::SelectionControlDontFlattenMask); + { + std::unique_ptr rt_written_branch_conditional_op = + std::make_unique(spv::OpBranchConditional); + rt_written_branch_conditional_op->addIdOperand(fsi_color_written); + rt_written_branch_conditional_op->addIdOperand( + fsi_color_written_if.getId()); + rt_written_branch_conditional_op->addIdOperand( + fsi_color_written_if_merge.getId()); + // More likely to write to the render target than not. 
+ rt_written_branch_conditional_op->addImmediateOperand(2); + rt_written_branch_conditional_op->addImmediateOperand(1); + builder_->getBuildPoint()->addInstruction( + std::move(rt_written_branch_conditional_op)); + } + fsi_color_written_if.addPredecessor(&fsi_color_written_if_head); + fsi_color_written_if_merge.addPredecessor(&fsi_color_written_if_head); + builder_->setBuildPoint(&fsi_color_written_if); + + // For accessing uint2 arrays of per-render-target data which are passed + // as uint4 arrays due to std140 array element alignment. + spv::Id rt_uint2_index_array = + builder_->makeIntConstant(color_target_index >> 1); + spv::Id rt_uint2_index_element[] = { + builder_->makeIntConstant((color_target_index & 1) << 1), + builder_->makeIntConstant(((color_target_index & 1) << 1) + 1), + }; + + // Load the mask of the bits of the destination color that should be + // preserved (in 32-bit halves), which are 0, 0 if the color is fully + // overwritten, or UINT32_MAX, UINT32_MAX if writing to the target is + // disabled completely. + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramRTKeepMask)); + id_vector_temp_.push_back(rt_uint2_index_array); + id_vector_temp_.push_back(rt_uint2_index_element[0]); + spv::Id rt_keep_mask[2]; + rt_keep_mask[0] = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision); + id_vector_temp_.back() = rt_uint2_index_element[1]; + rt_keep_mask[1] = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision); + + // Check if writing to the render target is not disabled completely. 
+ spv::Id const_uint32_max = builder_->makeUintConstant(UINT32_MAX); + spv::Id rt_write_mask_not_empty = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, + builder_->createBinOp(spv::OpINotEqual, type_bool_, rt_keep_mask[0], + const_uint32_max), + builder_->createBinOp(spv::OpINotEqual, type_bool_, rt_keep_mask[1], + const_uint32_max)); + spv::Block& rt_write_mask_not_empty_if = builder_->makeNewBlock(); + spv::Block& rt_write_mask_not_empty_if_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(rt_write_mask_not_empty_if_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(rt_write_mask_not_empty, + &rt_write_mask_not_empty_if, + &rt_write_mask_not_empty_if_merge); + builder_->setBuildPoint(&rt_write_mask_not_empty_if); + + spv::Id const_int_rt_index = + builder_->makeIntConstant(color_target_index); + + // Load the information about the render target. + + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramRTFormatFlags)); + id_vector_temp_.push_back(const_int_rt_index); + spv::Id rt_format_with_flags = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision); + + spv::Id rt_is_64bpp = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, rt_format_with_flags, + builder_->makeUintConstant( + RenderTargetCache::kPSIColorFormatFlag_64bpp)), + const_uint_0_); + + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramRTBaseDwordsScaled)); + id_vector_temp_.push_back(const_int_rt_index); + // EDRAM addresses are wrapped on the Xenos (modulo the EDRAM size). 
+ spv::Id rt_sample_0_address = builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createBinOp( + spv::OpUMod, type_uint_, + builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision), + builder_->createTriOp(spv::OpSelect, type_uint_, + rt_is_64bpp, main_fsi_offset_64bpp_, + main_fsi_offset_32bpp_)), + fsi_const_edram_size_dwords)); + + // Load the blending parameters for the render target. + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramRTBlendFactorsOps)); + id_vector_temp_.push_back(const_int_rt_index); + spv::Id rt_blend_factors_equations = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision); + + // Check if blending (the blending is not 1 * source + 0 * destination). + spv::Id rt_blend_enabled = builder_->createBinOp( + spv::OpINotEqual, type_bool_, rt_blend_factors_equations, + builder_->makeUintConstant(0x00010001)); + spv::Block& rt_blend_enabled_if = builder_->makeNewBlock(); + spv::Block& rt_blend_enabled_else = builder_->makeNewBlock(); + spv::Block& rt_blend_enabled_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(rt_blend_enabled_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch( + rt_blend_enabled, &rt_blend_enabled_if, &rt_blend_enabled_else); + + // Blending path. + { + builder_->setBuildPoint(&rt_blend_enabled_if); + + // Get various parameters used in blending. 
+ spv::Id rt_color_is_fixed_point = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, rt_format_with_flags, + builder_->makeUintConstant( + RenderTargetCache::kPSIColorFormatFlag_FixedPointColor)), + const_uint_0_); + spv::Id rt_alpha_is_fixed_point = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, rt_format_with_flags, + builder_->makeUintConstant( + RenderTargetCache::kPSIColorFormatFlag_FixedPointAlpha)), + const_uint_0_); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramRTClamp)); + id_vector_temp_.push_back(const_int_rt_index); + spv::Id rt_clamp = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision); + spv::Id rt_clamp_color_min = builder_->smearScalar( + spv::NoPrecision, + builder_->createCompositeExtract(rt_clamp, type_float_, 0), + type_float3_); + spv::Id rt_clamp_alpha_min = + builder_->createCompositeExtract(rt_clamp, type_float_, 1); + spv::Id rt_clamp_color_max = builder_->smearScalar( + spv::NoPrecision, + builder_->createCompositeExtract(rt_clamp, type_float_, 2), + type_float3_); + spv::Id rt_clamp_alpha_max = + builder_->createCompositeExtract(rt_clamp, type_float_, 3); + + spv::Id blend_factor_width = builder_->makeUintConstant(5); + spv::Id blend_equation_width = builder_->makeUintConstant(3); + spv::Id rt_color_source_factor = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, rt_blend_factors_equations, + const_uint_0_, blend_factor_width); + spv::Id rt_color_equation = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, rt_blend_factors_equations, + blend_factor_width, blend_equation_width); + spv::Id rt_color_dest_factor = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, rt_blend_factors_equations, + 
builder_->makeUintConstant(8), blend_factor_width); + spv::Id rt_alpha_source_factor = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, rt_blend_factors_equations, + builder_->makeUintConstant(16), blend_factor_width); + spv::Id rt_alpha_equation = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, rt_blend_factors_equations, + builder_->makeUintConstant(21), blend_equation_width); + spv::Id rt_alpha_dest_factor = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, rt_blend_factors_equations, + builder_->makeUintConstant(24), blend_factor_width); + + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramBlendConstant)); + spv::Id blend_constant_unclamped = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision); + uint_vector_temp_.clear(); + uint_vector_temp_.reserve(3); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + uint_vector_temp_.push_back(2); + spv::Id blend_constant_color_unclamped = + builder_->createRvalueSwizzle(spv::NoPrecision, type_float3_, + blend_constant_unclamped, + uint_vector_temp_); + spv::Id blend_constant_color_clamped = FSI_FlushNaNClampAndInBlending( + blend_constant_color_unclamped, rt_color_is_fixed_point, + rt_clamp_color_min, rt_clamp_color_max); + spv::Id blend_constant_alpha_clamped = FSI_FlushNaNClampAndInBlending( + builder_->createCompositeExtract(blend_constant_unclamped, + type_float_, 3), + rt_alpha_is_fixed_point, rt_clamp_alpha_min, rt_clamp_alpha_max); + + uint_vector_temp_.clear(); + uint_vector_temp_.reserve(3); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + uint_vector_temp_.push_back(2); + spv::Id source_color_unclamped = builder_->createRvalueSwizzle( + spv::NoPrecision, type_float3_, color, uint_vector_temp_); + spv::Id source_color_clamped = FSI_FlushNaNClampAndInBlending( + source_color_unclamped, 
rt_color_is_fixed_point, + rt_clamp_color_min, rt_clamp_color_max); + spv::Id source_alpha_clamped = FSI_FlushNaNClampAndInBlending( + builder_->createCompositeExtract(color, type_float_, 3), + rt_alpha_is_fixed_point, rt_clamp_alpha_min, rt_clamp_alpha_max); + + std::array rt_replace_mask; + for (uint32_t i = 0; i < 2; ++i) { + rt_replace_mask[i] = builder_->createUnaryOp(spv::OpNot, type_uint_, + rt_keep_mask[i]); + } + + // Blend and mask each sample. + for (uint32_t i = 0; i < 4; ++i) { + spv::Block& block_sample_covered = builder_->makeNewBlock(); + spv::Block& block_sample_covered_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_sample_covered_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(fsi_samples_covered[i], + &block_sample_covered, + &block_sample_covered_merge); + builder_->setBuildPoint(&block_sample_covered); + + spv::Id rt_sample_address = + FSI_AddSampleOffset(rt_sample_0_address, i, rt_is_64bpp); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + // First SSBO structure element. + id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(rt_sample_address); + spv::Id rt_access_chain_0 = builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 + ? spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_); + id_vector_temp_.back() = builder_->createBinOp( + spv::OpIAdd, type_int_, rt_sample_address, fsi_const_int_1); + spv::Id rt_access_chain_1 = builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 + ? spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_); + + // Load the destination color. 
+ std::array dest_packed; + dest_packed[0] = + builder_->createLoad(rt_access_chain_0, spv::NoPrecision); + { + spv::Block& block_load_64bpp_head = *builder_->getBuildPoint(); + spv::Block& block_load_64bpp = builder_->makeNewBlock(); + spv::Block& block_load_64bpp_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_load_64bpp_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(rt_is_64bpp, &block_load_64bpp, + &block_load_64bpp_merge); + builder_->setBuildPoint(&block_load_64bpp); + spv::Id dest_packed_64bpp_high = + builder_->createLoad(rt_access_chain_1, spv::NoPrecision); + builder_->createBranch(&block_load_64bpp_merge); + builder_->setBuildPoint(&block_load_64bpp_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(dest_packed_64bpp_high); + id_vector_temp_.push_back(block_load_64bpp.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_load_64bpp_head.getId()); + dest_packed[1] = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + } + std::array dest_unpacked = + FSI_UnpackColor(dest_packed, rt_format_with_flags); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back(dest_unpacked[0]); + id_vector_temp_.push_back(dest_unpacked[1]); + id_vector_temp_.push_back(dest_unpacked[2]); + spv::Id dest_color = builder_->createCompositeConstruct( + type_float3_, id_vector_temp_); + + // Blend the components. 
+ spv::Id result_color = FSI_BlendColorOrAlphaWithUnclampedResult( + rt_color_is_fixed_point, rt_clamp_color_min, rt_clamp_color_max, + source_color_clamped, source_alpha_clamped, dest_color, + dest_unpacked[3], blend_constant_color_clamped, + blend_constant_alpha_clamped, rt_color_equation, + rt_color_source_factor, rt_color_dest_factor); + spv::Id result_alpha = FSI_BlendColorOrAlphaWithUnclampedResult( + rt_alpha_is_fixed_point, rt_clamp_alpha_min, rt_clamp_alpha_max, + spv::NoResult, source_alpha_clamped, spv::NoResult, + dest_unpacked[3], spv::NoResult, blend_constant_alpha_clamped, + rt_alpha_equation, rt_alpha_source_factor, + rt_alpha_dest_factor); + + // Pack and store the result. + // Bypass the `getNumTypeConstituents(typeId) == + // (int)constituents.size()` assertion in createCompositeConstruct, + // OpCompositeConstruct can construct vectors not only from scalars, + // but also from other vectors. + spv::Id result_float4; + { + std::unique_ptr result_composite_construct_op = + std::make_unique(builder_->getUniqueId(), + type_float4_, + spv::OpCompositeConstruct); + result_composite_construct_op->addIdOperand(result_color); + result_composite_construct_op->addIdOperand(result_alpha); + result_float4 = result_composite_construct_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(result_composite_construct_op)); + } + std::array result_packed = + FSI_ClampAndPackColor(result_float4, rt_format_with_flags); + builder_->createStore( + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + dest_packed[0], rt_keep_mask[0]), + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + result_packed[0], + rt_replace_mask[0])), + rt_access_chain_0); + spv::Block& block_store_64bpp = builder_->makeNewBlock(); + spv::Block& block_store_64bpp_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_store_64bpp_merge.getId(), + spv::SelectionControlDontFlattenMask); + 
builder_->createConditionalBranch(rt_is_64bpp, &block_store_64bpp, + &block_store_64bpp_merge); + builder_->setBuildPoint(&block_store_64bpp); + builder_->createStore( + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + dest_packed[1], rt_keep_mask[1]), + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + result_packed[1], + rt_replace_mask[1])), + rt_access_chain_0); + builder_->createBranch(&block_store_64bpp_merge); + builder_->setBuildPoint(&block_store_64bpp_merge); + + builder_->createBranch(&block_sample_covered_merge); + builder_->setBuildPoint(&block_sample_covered_merge); + } + + builder_->createBranch(&rt_blend_enabled_merge); + } + + // Non-blending paths. + { + builder_->setBuildPoint(&rt_blend_enabled_else); + + // Pack the new color for all samples. + std::array color_packed = + FSI_ClampAndPackColor(color, rt_format_with_flags); + + // Check if need to load the original contents. + spv::Id rt_keep_mask_not_empty = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, + builder_->createBinOp(spv::OpINotEqual, type_bool_, + rt_keep_mask[0], const_uint_0_), + builder_->createBinOp(spv::OpINotEqual, type_bool_, + rt_keep_mask[1], const_uint_0_)); + spv::Block& rt_keep_mask_not_empty_if = builder_->makeNewBlock(); + spv::Block& rt_keep_mask_not_empty_if_else = builder_->makeNewBlock(); + spv::Block& rt_keep_mask_not_empty_if_merge = + builder_->makeNewBlock(); + SpirvCreateSelectionMerge(rt_keep_mask_not_empty_if_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(rt_keep_mask_not_empty, + &rt_keep_mask_not_empty_if, + &rt_keep_mask_not_empty_if_else); + + // Loading and masking path. 
+ { + builder_->setBuildPoint(&rt_keep_mask_not_empty_if); + std::array color_packed_masked; + for (uint32_t i = 0; i < 2; ++i) { + color_packed_masked[i] = builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, color_packed[i], + builder_->createUnaryOp(spv::OpNot, type_uint_, + rt_keep_mask[i])); + } + for (uint32_t i = 0; i < 4; ++i) { + spv::Block& block_sample_covered = builder_->makeNewBlock(); + spv::Block& block_sample_covered_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_sample_covered_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(fsi_samples_covered[i], + &block_sample_covered, + &block_sample_covered_merge); + builder_->setBuildPoint(&block_sample_covered); + spv::Id rt_sample_address = + FSI_AddSampleOffset(rt_sample_0_address, i, rt_is_64bpp); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + // First SSBO structure element. + id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(rt_sample_address); + spv::Id rt_access_chain_0 = builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 + ? 
spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_); + builder_->createStore( + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, + builder_->createLoad(rt_access_chain_0, + spv::NoPrecision), + rt_keep_mask[0]), + color_packed_masked[0]), + rt_access_chain_0); + spv::Block& block_store_64bpp = builder_->makeNewBlock(); + spv::Block& block_store_64bpp_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_store_64bpp_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(rt_is_64bpp, &block_store_64bpp, + &block_store_64bpp_merge); + builder_->setBuildPoint(&block_store_64bpp); + id_vector_temp_.back() = builder_->createBinOp( + spv::OpIAdd, type_int_, rt_sample_address, fsi_const_int_1); + spv::Id rt_access_chain_1 = builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 + ? spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_); + builder_->createStore( + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, + builder_->createLoad(rt_access_chain_1, + spv::NoPrecision), + rt_keep_mask[1]), + color_packed_masked[1]), + rt_access_chain_1); + builder_->createBranch(&block_store_64bpp_merge); + builder_->setBuildPoint(&block_store_64bpp_merge); + builder_->createBranch(&block_sample_covered_merge); + builder_->setBuildPoint(&block_sample_covered_merge); + } + builder_->createBranch(&rt_keep_mask_not_empty_if_merge); + } + + // Fully overwriting path. 
+ { + builder_->setBuildPoint(&rt_keep_mask_not_empty_if_else); + for (uint32_t i = 0; i < 4; ++i) { + spv::Block& block_sample_covered = builder_->makeNewBlock(); + spv::Block& block_sample_covered_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_sample_covered_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(fsi_samples_covered[i], + &block_sample_covered, + &block_sample_covered_merge); + builder_->setBuildPoint(&block_sample_covered); + spv::Id rt_sample_address = + FSI_AddSampleOffset(rt_sample_0_address, i, rt_is_64bpp); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + // First SSBO structure element. + id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(rt_sample_address); + builder_->createStore(color_packed[0], + builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 + ? spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_)); + spv::Block& block_store_64bpp = builder_->makeNewBlock(); + spv::Block& block_store_64bpp_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_store_64bpp_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(rt_is_64bpp, &block_store_64bpp, + &block_store_64bpp_merge); + builder_->setBuildPoint(&block_store_64bpp); + id_vector_temp_.back() = builder_->createBinOp( + spv::OpIAdd, type_int_, id_vector_temp_.back(), + fsi_const_int_1); + builder_->createStore(color_packed[1], + builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 + ? 
spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_)); + builder_->createBranch(&block_store_64bpp_merge); + builder_->setBuildPoint(&block_store_64bpp_merge); + builder_->createBranch(&block_sample_covered_merge); + builder_->setBuildPoint(&block_sample_covered_merge); + } + builder_->createBranch(&rt_keep_mask_not_empty_if_merge); + } + + builder_->setBuildPoint(&rt_keep_mask_not_empty_if_merge); + builder_->createBranch(&rt_blend_enabled_merge); + } + + builder_->setBuildPoint(&rt_blend_enabled_merge); + builder_->createBranch(&rt_write_mask_not_empty_if_merge); + builder_->setBuildPoint(&rt_write_mask_not_empty_if_merge); + builder_->createBranch(&fsi_color_written_if_merge); + builder_->setBuildPoint(&fsi_color_written_if_merge); + } else { + // Convert to gamma space - this is incorrect, since it must be done + // after blending on the Xbox 360, but this is just one of many blending + // issues in the host render target path. + // TODO(Triang3l): Gamma as sRGB check. 
+ uint_vector_temp_.clear(); + uint_vector_temp_.reserve(3); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + uint_vector_temp_.push_back(2); + spv::Id color_rgb = builder_->createRvalueSwizzle( + spv::NoPrecision, type_float3_, color, uint_vector_temp_); + spv::Id is_gamma = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_ConvertColor0ToGamma + << color_target_index)), + const_uint_0_); + spv::Block& block_gamma_head = *builder_->getBuildPoint(); + spv::Block& block_gamma = builder_->makeNewBlock(); + spv::Block& block_gamma_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_gamma_merge.getId()); + builder_->createConditionalBranch(is_gamma, &block_gamma, + &block_gamma_merge); + builder_->setBuildPoint(&block_gamma); + spv::Id color_rgb_gamma = LinearToPWLGamma(color_rgb, false); + builder_->createBranch(&block_gamma_merge); + builder_->setBuildPoint(&block_gamma_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(color_rgb_gamma); + id_vector_temp_.push_back(block_gamma.getId()); + id_vector_temp_.push_back(color_rgb); + id_vector_temp_.push_back(block_gamma_head.getId()); + color_rgb = + builder_->createOp(spv::OpPhi, type_float3_, id_vector_temp_); + { + std::unique_ptr color_rgba_shuffle_op = + std::make_unique( + builder_->getUniqueId(), type_float4_, spv::OpVectorShuffle); + color_rgba_shuffle_op->addIdOperand(color_rgb); + color_rgba_shuffle_op->addIdOperand(color); + color_rgba_shuffle_op->addImmediateOperand(0); + color_rgba_shuffle_op->addImmediateOperand(1); + color_rgba_shuffle_op->addImmediateOperand(2); + color_rgba_shuffle_op->addImmediateOperand(3 + 3); + color = color_rgba_shuffle_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(color_rgba_shuffle_op)); + } + + builder_->createStore(color, color_variable); + 
} + } + } + + if (edram_fragment_shader_interlock_) { + if (block_fsi_if_after_depth_stencil_merge) { + builder_->createBranch(block_fsi_if_after_depth_stencil_merge); + builder_->setBuildPoint(block_fsi_if_after_depth_stencil_merge); + } + + if (block_fsi_if_after_kill_merge) { + builder_->createBranch(block_fsi_if_after_kill_merge); + builder_->setBuildPoint(block_fsi_if_after_kill_merge); + } + + if (FSI_IsDepthStencilEarly()) { + builder_->createBranch(main_fsi_early_depth_stencil_execute_quad_merge_); + builder_->setBuildPoint(main_fsi_early_depth_stencil_execute_quad_merge_); + } + + builder_->createNoResultOp(spv::OpEndInvocationInterlockEXT); + } +} + +spv::Id SpirvShaderTranslator::LoadMsaaSamplesFromFlags() { + return builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_MsaaSamples_Shift), + builder_->makeUintConstant(2)); +} + +void SpirvShaderTranslator::FSI_LoadSampleMask(spv::Id msaa_samples) { + // On the Xbox 360, 2x MSAA doubles the storage height, 4x MSAA doubles the + // storage width. + // Vulkan standard 2x samples are bottom, top. + // Vulkan standard 4x samples are TL, TR, BL, BR. + // Remap to T, B for 2x, and to TL, BL, TR, BR for 4x. + // 2x corresponds to 1, 0 with native 2x MSAA on Vulkan, 0, 3 with 2x as 4x. + // 4x corresponds to 0, 2, 1, 3 on Vulkan. 
+ + spv::Id const_uint_1 = builder_->makeUintConstant(1); + spv::Id const_uint_2 = builder_->makeUintConstant(2); + + assert_true(input_sample_mask_ != spv::NoResult); + id_vector_temp_.clear(); + id_vector_temp_.push_back(const_int_0_); + spv::Id input_sample_mask_value = builder_->createUnaryOp( + spv::OpBitcast, type_uint_, + builder_->createLoad( + builder_->createAccessChain(spv::StorageClassInput, + input_sample_mask_, id_vector_temp_), + spv::NoPrecision)); + + spv::Block& block_msaa_head = *builder_->getBuildPoint(); + spv::Block& block_msaa_1x = builder_->makeNewBlock(); + spv::Block& block_msaa_2x = builder_->makeNewBlock(); + spv::Block& block_msaa_4x = builder_->makeNewBlock(); + spv::Block& block_msaa_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_msaa_merge.getId()); + { + std::unique_ptr msaa_switch_op = + std::make_unique(spv::OpSwitch); + msaa_switch_op->addIdOperand(msaa_samples); + // Make 1x the default. + msaa_switch_op->addIdOperand(block_msaa_1x.getId()); + msaa_switch_op->addImmediateOperand(int32_t(xenos::MsaaSamples::k2X)); + msaa_switch_op->addIdOperand(block_msaa_2x.getId()); + msaa_switch_op->addImmediateOperand(int32_t(xenos::MsaaSamples::k4X)); + msaa_switch_op->addIdOperand(block_msaa_4x.getId()); + builder_->getBuildPoint()->addInstruction(std::move(msaa_switch_op)); + } + block_msaa_1x.addPredecessor(&block_msaa_head); + block_msaa_2x.addPredecessor(&block_msaa_head); + block_msaa_4x.addPredecessor(&block_msaa_head); + + // 1x MSAA - pass input_sample_mask_value through. + builder_->setBuildPoint(&block_msaa_1x); + builder_->createBranch(&block_msaa_merge); + + // 2x MSAA. + builder_->setBuildPoint(&block_msaa_2x); + spv::Id sample_mask_2x; + if (native_2x_msaa_no_attachments_) { + // 1 and 0 to 0 and 1. 
+ sample_mask_2x = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, + builder_->createUnaryOp(spv::OpBitReverse, type_uint_, + input_sample_mask_value), + builder_->makeUintConstant(32 - 2)); + } else { + // 0 and 3 to 0 and 1. + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(input_sample_mask_value); + id_vector_temp_.push_back(builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, input_sample_mask_value, + const_uint_2, const_uint_1)); + id_vector_temp_.push_back(const_uint_1); + id_vector_temp_.push_back(builder_->makeUintConstant(32 - 1)); + sample_mask_2x = + builder_->createOp(spv::OpBitFieldInsert, type_uint_, id_vector_temp_); + } + builder_->createBranch(&block_msaa_merge); + + // 4x MSAA. + builder_->setBuildPoint(&block_msaa_4x); + // Flip samples in bits 0:1 by reversing the whole coverage mask and inserting + // the reversing bits. + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(input_sample_mask_value); + id_vector_temp_.push_back(builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, + builder_->createUnaryOp(spv::OpBitReverse, type_uint_, + input_sample_mask_value), + builder_->makeUintConstant(32 - 1 - 2))); + id_vector_temp_.push_back(const_uint_1); + id_vector_temp_.push_back(const_uint_2); + spv::Id sample_mask_4x = + builder_->createOp(spv::OpBitFieldInsert, type_uint_, id_vector_temp_); + builder_->createBranch(&block_msaa_merge); + + // Select the result depending on the MSAA sample count. 
+ builder_->setBuildPoint(&block_msaa_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 3); + id_vector_temp_.push_back(input_sample_mask_value); + id_vector_temp_.push_back(block_msaa_1x.getId()); + id_vector_temp_.push_back(sample_mask_2x); + id_vector_temp_.push_back(block_msaa_2x.getId()); + id_vector_temp_.push_back(sample_mask_4x); + id_vector_temp_.push_back(block_msaa_4x.getId()); + main_fsi_sample_mask_ = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); +} + +void SpirvShaderTranslator::FSI_LoadEdramOffsets(spv::Id msaa_samples) { + // Convert the floating-point pixel coordinates to integer sample 0 + // coordinates. + assert_true(input_fragment_coordinates_ != spv::NoResult); + spv::Id axes_have_two_msaa_samples[2]; + spv::Id sample_coordinates[2]; + spv::Id const_uint_1 = builder_->makeUintConstant(1); + for (uint32_t i = 0; i < 2; ++i) { + spv::Id axis_has_two_msaa_samples = builder_->createBinOp( + spv::OpUGreaterThanEqual, type_bool_, msaa_samples, + builder_->makeUintConstant( + uint32_t(i ? xenos::MsaaSamples::k2X : xenos::MsaaSamples::k4X))); + axes_have_two_msaa_samples[i] = axis_has_two_msaa_samples; + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeIntConstant(int32_t(i))); + sample_coordinates[i] = builder_->createBinOp( + spv::OpShiftLeftLogical, type_uint_, + builder_->createUnaryOp( + spv::OpConvertFToU, type_uint_, + builder_->createLoad( + builder_->createAccessChain(spv::StorageClassInput, + input_fragment_coordinates_, + id_vector_temp_), + spv::NoPrecision)), + builder_->createTriOp(spv::OpSelect, type_uint_, + axis_has_two_msaa_samples, const_uint_1, + const_uint_0_)); + } + + // Get 40 x 16 x resolution scale 32bpp half-tile or 40x16 64bpp tile index. + // Working with 40x16-sample portions for 64bpp and for swapping for depth - + // dividing by 40, not by 80. + // TODO(Triang3l): Resolution scaling. 
+ uint32_t tile_width = xenos::kEdramTileWidthSamples; + spv::Id const_tile_half_width = builder_->makeUintConstant(tile_width >> 1); + uint32_t tile_height = xenos::kEdramTileHeightSamples; + spv::Id const_tile_height = builder_->makeUintConstant(tile_height); + spv::Id tile_half_index[2], tile_half_sample_coordinates[2]; + for (uint32_t i = 0; i < 2; ++i) { + spv::Id sample_x_or_y = sample_coordinates[i]; + spv::Id tile_half_width_or_height = + i ? const_tile_height : const_tile_half_width; + tile_half_index[i] = builder_->createBinOp( + spv::OpUDiv, type_uint_, sample_x_or_y, tile_half_width_or_height); + tile_half_sample_coordinates[i] = builder_->createBinOp( + spv::OpUMod, type_uint_, sample_x_or_y, tile_half_width_or_height); + } + + // Convert the Y sample 0 position within the half-tile or tile to the dword + // offset of the row within a 80x16 32bpp tile or a 40x16 64bpp half-tile. + spv::Id const_tile_width = builder_->makeUintConstant(tile_width); + spv::Id row_offset_in_tile_at_32bpp = + builder_->createBinOp(spv::OpIMul, type_uint_, + tile_half_sample_coordinates[1], const_tile_width); + + // Multiply the Y tile position by the surface tile pitch in dwords at 32bpp + // to get the address of the origin of the row of tiles within a 32bpp surface + // in dwords (later it needs to be multiplied by 2 for 64bpp). + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeIntConstant( + kSystemConstantEdram32bppTilePitchDwordsScaled)); + spv::Id tile_row_offset_at_32bpp = builder_->createBinOp( + spv::OpIMul, type_uint_, + builder_->createLoad(builder_->createAccessChain( + spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision), + tile_half_index[1]); + + uint32_t tile_size = tile_width * tile_height; + spv::Id const_tile_size = builder_->makeUintConstant(tile_size); + + // Get the dword offset of the sample 0 in the first half-tile in the tile + // within a 32bpp surface. 
+ spv::Id offset_in_first_tile_half_at_32bpp = builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp( + spv::OpIAdd, type_uint_, tile_row_offset_at_32bpp, + builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp( + spv::OpIMul, type_uint_, const_tile_size, + builder_->createBinOp(spv::OpShiftRightLogical, type_uint_, + tile_half_index[0], const_uint_1)), + row_offset_in_tile_at_32bpp)), + tile_half_sample_coordinates[0]); + + // Get whether the sample is in the second half-tile in a 32bpp surface. + spv::Id is_second_tile_half = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, tile_half_index[0], + const_uint_1), + const_uint_0_); + + // Get the offset of the sample 0 within a depth / stencil surface, with + // samples 40...79 in the first half-tile, 0...39 in the second (flipped as + // opposed to color). Then add the EDRAM base for depth / stencil, and wrap + // addressing. + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramDepthBaseDwordsScaled)); + main_fsi_address_depth_ = builder_->createBinOp( + spv::OpUMod, type_uint_, + builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createLoad(builder_->createAccessChain( + spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision), + builder_->createBinOp( + spv::OpIAdd, type_uint_, offset_in_first_tile_half_at_32bpp, + builder_->createTriOp(spv::OpSelect, type_uint_, + is_second_tile_half, const_uint_0_, + const_tile_half_width))), + builder_->makeUintConstant(tile_size * xenos::kEdramTileCount)); + + if (current_shader().writes_color_targets()) { + // Get the offset of the sample 0 within a 32bpp surface, with samples + // 0...39 in the first half-tile, 40...79 in the second. 
+ main_fsi_offset_32bpp_ = builder_->createBinOp( + spv::OpIAdd, type_uint_, offset_in_first_tile_half_at_32bpp, + builder_->createTriOp(spv::OpSelect, type_uint_, is_second_tile_half, + const_tile_half_width, const_uint_0_)); + + // Get the offset of the sample 0 within a 64bpp surface. + main_fsi_offset_64bpp_ = builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp(spv::OpShiftLeftLogical, type_uint_, + tile_row_offset_at_32bpp, const_uint_1), + builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp(spv::OpIMul, type_uint_, const_tile_size, + tile_half_index[0]), + row_offset_in_tile_at_32bpp)), + builder_->createBinOp(spv::OpShiftLeftLogical, type_uint_, + tile_half_sample_coordinates[0], const_uint_1)); + } +} + +spv::Id SpirvShaderTranslator::FSI_AddSampleOffset(spv::Id sample_0_address, + uint32_t sample_index, + spv::Id is_64bpp) { + if (!sample_index) { + return sample_0_address; + } + spv::Id sample_offset; + // TODO(Triang3l): Resolution scaling. 
+ uint32_t tile_width = xenos::kEdramTileWidthSamples; + if (sample_index == 1) { + sample_offset = builder_->makeIntConstant(tile_width); + } else { + spv::Id sample_offset_32bpp = builder_->makeIntConstant( + tile_width * (sample_index & 1) + (sample_index >> 1)); + if (is_64bpp != spv::NoResult) { + sample_offset = builder_->createTriOp( + spv::OpSelect, type_int_, is_64bpp, + builder_->makeIntConstant(tile_width * (sample_index & 1) + + 2 * (sample_index >> 1)), + sample_offset_32bpp); + } else { + sample_offset = sample_offset_32bpp; + } + } + return builder_->createBinOp(spv::OpIAdd, type_int_, sample_0_address, + sample_offset); +} + +void SpirvShaderTranslator::FSI_DepthStencilTest( + spv::Id msaa_samples, bool sample_mask_potentially_narrowed_previouly) { + bool is_early = FSI_IsDepthStencilEarly(); + bool implicit_early_z_write_allowed = + current_shader().implicit_early_z_write_allowed(); + spv::Id const_uint_1 = builder_->makeUintConstant(1); + spv::Id const_uint_8 = builder_->makeUintConstant(8); + + // Check if depth or stencil testing is needed. + spv::Id depth_stencil_enabled = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_FSIDepthStencil)), + const_uint_0_); + spv::Block& block_depth_stencil_enabled_head = *builder_->getBuildPoint(); + spv::Block& block_depth_stencil_enabled = builder_->makeNewBlock(); + spv::Block& block_depth_stencil_enabled_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_depth_stencil_enabled_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(depth_stencil_enabled, + &block_depth_stencil_enabled, + &block_depth_stencil_enabled_merge); + builder_->setBuildPoint(&block_depth_stencil_enabled); + + // Load the depth in the center of the pixel and calculate the derivatives of + // the depth outside non-uniform control flow. 
+ assert_true(input_fragment_coordinates_ != spv::NoResult); + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeIntConstant(2)); + spv::Id center_depth32_unbiased = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassInput, + input_fragment_coordinates_, id_vector_temp_), + spv::NoPrecision); + builder_->addCapability(spv::CapabilityDerivativeControl); + std::array depth_dxy; + depth_dxy[0] = builder_->createUnaryOp(spv::OpDPdxCoarse, type_float_, + center_depth32_unbiased); + depth_dxy[1] = builder_->createUnaryOp(spv::OpDPdyCoarse, type_float_, + center_depth32_unbiased); + + // Skip everything if potentially discarded all the samples previously in the + // shader. + spv::Block* block_any_sample_covered_head = nullptr; + spv::Block* block_any_sample_covered = nullptr; + spv::Block* block_any_sample_covered_merge = nullptr; + if (sample_mask_potentially_narrowed_previouly) { + spv::Id any_sample_covered = builder_->createBinOp( + spv::OpINotEqual, type_bool_, main_fsi_sample_mask_, const_uint_0_); + block_any_sample_covered_head = builder_->getBuildPoint(); + block_any_sample_covered = &builder_->makeNewBlock(); + block_any_sample_covered_merge = &builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_any_sample_covered_merge->getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(any_sample_covered, + block_any_sample_covered, + block_any_sample_covered_merge); + builder_->setBuildPoint(block_any_sample_covered); + } + + // Load values involved in depth and stencil testing. 
+ spv::Id msaa_is_2x_4x = builder_->createBinOp( + spv::OpUGreaterThanEqual, type_bool_, msaa_samples, + builder_->makeUintConstant(uint32_t(xenos::MsaaSamples::k2X))); + spv::Id msaa_is_4x = builder_->createBinOp( + spv::OpUGreaterThanEqual, type_bool_, msaa_samples, + builder_->makeUintConstant(uint32_t(xenos::MsaaSamples::k4X))); + spv::Id depth_is_float24 = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_DepthFloat24)), + const_uint_0_); + spv::Id depth_pass_if_less = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_FSIDepthPassIfLess)), + const_uint_0_); + spv::Id depth_pass_if_equal = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_FSIDepthPassIfEqual)), + const_uint_0_); + spv::Id depth_pass_if_greater = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_FSIDepthPassIfGreater)), + const_uint_0_); + spv::Id depth_write = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_FSIDepthWrite)), + const_uint_0_); + spv::Id stencil_enabled = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_FSIStencilTest)), + const_uint_0_); + spv::Id early_write = + (is_early && implicit_early_z_write_allowed) + ? 
builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + main_system_constant_flags_, + builder_->makeUintConstant( + kSysFlag_FSIDepthStencilEarlyWrite)), + const_uint_0_) + : spv::NoResult; + spv::Id not_early_write = + (is_early && implicit_early_z_write_allowed) + ? builder_->createUnaryOp(spv::OpLogicalNot, type_bool_, early_write) + : spv::NoResult; + assert_true(input_front_facing_ != spv::NoResult); + spv::Id front_facing = + builder_->createLoad(input_front_facing_, spv::NoPrecision); + spv::Id poly_offset_scale, poly_offset_offset, stencil_parameters; + { + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramPolyOffsetFrontScale)); + spv::Id poly_offset_front_scale = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision); + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramPolyOffsetBackScale)); + spv::Id poly_offset_back_scale = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision); + poly_offset_scale = + builder_->createTriOp(spv::OpSelect, type_float_, front_facing, + poly_offset_front_scale, poly_offset_back_scale); + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramPolyOffsetFrontOffset)); + spv::Id poly_offset_front_offset = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision); + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramPolyOffsetBackOffset)); + spv::Id poly_offset_back_offset = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + 
spv::NoPrecision); + poly_offset_offset = builder_->createTriOp( + spv::OpSelect, type_float_, front_facing, poly_offset_front_offset, + poly_offset_back_offset); + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramStencilFront)); + spv::Id stencil_parameters_front = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision); + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramStencilBack)); + spv::Id stencil_parameters_back = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision); + stencil_parameters = builder_->createTriOp( + spv::OpSelect, type_uint2_, + builder_->smearScalar(spv::NoPrecision, front_facing, type_bool2_), + stencil_parameters_front, stencil_parameters_back); + } + spv::Id stencil_reference_masks = + builder_->createCompositeExtract(stencil_parameters, type_uint_, 0); + spv::Id stencil_reference = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, stencil_reference_masks, + const_uint_0_, const_uint_8); + spv::Id stencil_read_mask = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, stencil_reference_masks, + const_uint_8, const_uint_8); + spv::Id stencil_reference_read_masked = builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, stencil_reference, stencil_read_mask); + spv::Id stencil_write_mask = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, stencil_reference_masks, + builder_->makeUintConstant(16), const_uint_8); + spv::Id stencil_write_keep_mask = + builder_->createUnaryOp(spv::OpNot, type_uint_, stencil_write_mask); + spv::Id stencil_func_ops = + builder_->createCompositeExtract(stencil_parameters, type_uint_, 1); + spv::Id stencil_pass_if_less = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + 
builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, stencil_func_ops, + builder_->makeUintConstant(uint32_t(1) << 0)), + const_uint_0_); + spv::Id stencil_pass_if_equal = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, stencil_func_ops, + builder_->makeUintConstant(uint32_t(1) << 1)), + const_uint_0_); + spv::Id stencil_pass_if_greater = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, stencil_func_ops, + builder_->makeUintConstant(uint32_t(1) << 2)), + const_uint_0_); + + // Get the maximum depth slope for the polygon offset. + // https://docs.microsoft.com/en-us/windows/desktop/direct3d9/depth-bias + std::array depth_dxy_abs; + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(depth_dxy[i]); + depth_dxy_abs[i] = builder_->createBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450FAbs, id_vector_temp_); + } + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(depth_dxy_abs[0]); + id_vector_temp_.push_back(depth_dxy_abs[1]); + spv::Id depth_max_slope = builder_->createBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450FMax, id_vector_temp_); + // Calculate the polygon offset. + spv::Id slope_scaled_poly_offset = builder_->createBinOp( + spv::OpFMul, type_float_, poly_offset_scale, depth_max_slope); + builder_->addDecoration(slope_scaled_poly_offset, + spv::DecorationNoContraction); + spv::Id poly_offset = builder_->createBinOp( + spv::OpFAdd, type_float_, slope_scaled_poly_offset, poly_offset_offset); + builder_->addDecoration(poly_offset, spv::DecorationNoContraction); + // Apply the post-clip and post-viewport polygon offset to the fragment's + // depth. 
Not clamping yet as this is at the center, which is not necessarily + // covered and not necessarily inside the bounds - derivatives scaled by + // sample locations will be added to this value, and it must be linear. + spv::Id center_depth32_biased = builder_->createBinOp( + spv::OpFAdd, type_float_, center_depth32_unbiased, poly_offset); + builder_->addDecoration(center_depth32_biased, spv::DecorationNoContraction); + + // Perform depth and stencil testing for each covered sample. + spv::Id new_sample_mask = main_fsi_sample_mask_; + std::array late_write_depth_stencil{}; + for (uint32_t i = 0; i < 4; ++i) { + spv::Id sample_covered = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, new_sample_mask, + builder_->makeUintConstant(uint32_t(1) << i)), + const_uint_0_); + spv::Block& block_sample_covered_head = *builder_->getBuildPoint(); + spv::Block& block_sample_covered = builder_->makeNewBlock(); + spv::Block& block_sample_covered_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_sample_covered_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(sample_covered, &block_sample_covered, + &block_sample_covered_merge); + builder_->setBuildPoint(&block_sample_covered); + + // Load the original depth and stencil for the sample. + spv::Id sample_address = FSI_AddSampleOffset(main_fsi_address_depth_, i); id_vector_temp_.clear(); id_vector_temp_.reserve(2); - id_vector_temp_.push_back( - builder_->makeIntConstant(kSystemConstantColorExpBias)); - id_vector_temp_.push_back( - builder_->makeIntConstant(int32_t(color_target_index))); - color = builder_->createBinOp( - spv::OpVectorTimesScalar, type_float4_, color, - builder_->createLoad(builder_->createAccessChain( - spv::StorageClassUniform, - uniform_system_constants_, id_vector_temp_), - spv::NoPrecision)); - builder_->addDecoration(color, spv::DecorationNoContraction); + // First SSBO structure element. 
+ id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(sample_address); + spv::Id sample_access_chain = builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 ? spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_); + spv::Id old_depth_stencil = + builder_->createLoad(sample_access_chain, spv::NoPrecision); - // Convert to gamma space - this is incorrect, since it must be done after - // blending on the Xbox 360, but this is just one of many blending issues in - // the host render target path. - // TODO(Triang3l): Gamma as sRGB check. - spv::Id color_rgb; - { - std::unique_ptr color_rgb_shuffle_op = - std::make_unique( - builder_->getUniqueId(), type_float3_, spv::OpVectorShuffle); - color_rgb_shuffle_op->addIdOperand(color); - color_rgb_shuffle_op->addIdOperand(color); - color_rgb_shuffle_op->addImmediateOperand(0); - color_rgb_shuffle_op->addImmediateOperand(1); - color_rgb_shuffle_op->addImmediateOperand(2); - color_rgb = color_rgb_shuffle_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(color_rgb_shuffle_op)); + // Calculate the new depth at the sample. + // interpolateAtSample(gl_FragCoord) is not valid in GLSL because + // gl_FragCoord is not an interpolator, calculating the depths at the + // samples manually. + std::array sample_location; + switch (i) { + case 0: { + // Center sample for no MSAA. + // Top-left sample for native 2x (top - 1 in Vulkan), 2x as 4x, 4x + // (0 in Vulkan). + // 4x on the host case. + for (uint32_t j = 0; j < 2; ++j) { + sample_location[j] = builder_->makeFloatConstant( + draw_util::kD3D10StandardSamplePositions4x[0][j] * + (1.0f / 16.0f)); + } + if (native_2x_msaa_no_attachments_) { + // 2x on the host case. 
+ for (uint32_t j = 0; j < 2; ++j) { + sample_location[j] = builder_->createTriOp( + spv::OpSelect, type_float_, msaa_is_4x, sample_location[j], + builder_->makeFloatConstant( + draw_util::kD3D10StandardSamplePositions2x[1][j] * + (1.0f / 16.0f))); + } + } + // 1x case. + for (uint32_t j = 0; j < 2; ++j) { + sample_location[j] = + builder_->createTriOp(spv::OpSelect, type_float_, msaa_is_2x_4x, + sample_location[j], const_float_0_); + } + } break; + case 1: { + // For guest 2x: bottom-right sample (bottom - 0 in Vulkan - for native + // 2x, bottom-right - 3 in Vulkan - for 2x as 4x). + // For guest 4x: bottom-left sample (2 in Vulkan). + for (uint32_t j = 0; j < 2; ++j) { + sample_location[j] = builder_->createTriOp( + spv::OpSelect, type_float_, msaa_is_4x, + builder_->makeFloatConstant( + draw_util::kD3D10StandardSamplePositions4x[2][j] * + (1.0f / 16.0f)), + builder_->makeFloatConstant( + (native_2x_msaa_no_attachments_ + ? draw_util::kD3D10StandardSamplePositions2x[0][j] + : draw_util::kD3D10StandardSamplePositions4x[3][j]) * + (1.0f / 16.0f))); + } + } break; + default: { + // Xenia samples 2 and 3 (top-right and bottom-right) -> Vulkan samples + // 1 and 3. 
+ const int8_t* sample_location_int = draw_util:: + kD3D10StandardSamplePositions4x[i ^ (((i & 1) ^ (i >> 1)) * 0b11)]; + for (uint32_t j = 0; j < 2; ++j) { + sample_location[j] = builder_->makeFloatConstant( + sample_location_int[j] * (1.0f / 16.0f)); + } + } break; } - spv::Id is_gamma = builder_->createBinOp( - spv::OpINotEqual, type_bool_, + std::array sample_depth_dxy; + for (uint32_t j = 0; j < 2; ++j) { + sample_depth_dxy[j] = builder_->createBinOp( + spv::OpFMul, type_float_, sample_location[j], depth_dxy[j]); + builder_->addDecoration(sample_depth_dxy[j], + spv::DecorationNoContraction); + } + spv::Id sample_depth32 = builder_->createBinOp( + spv::OpFAdd, type_float_, sample_depth_dxy[0], sample_depth_dxy[1]); + builder_->addDecoration(sample_depth32, spv::DecorationNoContraction); + sample_depth32 = builder_->createBinOp( + spv::OpFAdd, type_float_, center_depth32_biased, sample_depth32); + builder_->addDecoration(sample_depth32, spv::DecorationNoContraction); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back(sample_depth32); + id_vector_temp_.push_back(const_float_0_); + id_vector_temp_.push_back(const_float_1_); + sample_depth32 = builder_->createBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450NClamp, id_vector_temp_); + + // Convert the new depth to 24-bit. + spv::Block& block_depth_format_float = builder_->makeNewBlock(); + spv::Block& block_depth_format_unorm = builder_->makeNewBlock(); + spv::Block& block_depth_format_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_depth_format_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch( + depth_is_float24, &block_depth_format_float, &block_depth_format_unorm); + // Float24 case. 
+ builder_->setBuildPoint(&block_depth_format_float); + spv::Id sample_depth_float24 = SpirvShaderTranslator::PreClampedDepthTo20e4( + *builder_, sample_depth32, true, false, ext_inst_glsl_std_450_); + builder_->createBranch(&block_depth_format_merge); + spv::Block& block_depth_format_float_end = *builder_->getBuildPoint(); + // Unorm24 case. + builder_->setBuildPoint(&block_depth_format_unorm); + // Round to the nearest even integer. This seems to be the correct + // conversion, adding +0.5 and rounding towards zero results in red instead + // of black in the 4D5307E6 clear shader. + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->createBinOp(spv::OpFMul, type_float_, sample_depth32, + builder_->makeFloatConstant(float(0xFFFFFF)))); + builder_->addDecoration(id_vector_temp_.back(), + spv::DecorationNoContraction); + spv::Id sample_depth_unorm24 = builder_->createUnaryOp( + spv::OpConvertFToU, type_uint_, + builder_->createBuiltinCall(type_float_, ext_inst_glsl_std_450_, + GLSLstd450RoundEven, id_vector_temp_)); + builder_->createBranch(&block_depth_format_merge); + spv::Block& block_depth_format_unorm_end = *builder_->getBuildPoint(); + // Merge between the two formats. + builder_->setBuildPoint(&block_depth_format_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(sample_depth_float24); + id_vector_temp_.push_back(block_depth_format_float_end.getId()); + id_vector_temp_.push_back(sample_depth_unorm24); + id_vector_temp_.push_back(block_depth_format_unorm_end.getId()); + spv::Id sample_depth24 = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + + // Perform the depth test. 
+ spv::Id old_depth = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, old_depth_stencil, const_uint_8); + spv::Id depth_passed = builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, depth_pass_if_less, + builder_->createBinOp(spv::OpULessThan, type_bool_, sample_depth24, + old_depth)); + depth_passed = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, depth_passed, builder_->createBinOp( - spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, - builder_->makeUintConstant(kSysFlag_ConvertColor0ToGamma - << color_target_index)), - const_uint_0_); - spv::Block& block_gamma_head = *builder_->getBuildPoint(); - spv::Block& block_gamma = builder_->makeNewBlock(); - spv::Block& block_gamma_merge = builder_->makeNewBlock(); - SpirvCreateSelectionMerge(block_gamma_merge.getId()); - builder_->createConditionalBranch(is_gamma, &block_gamma, - &block_gamma_merge); - builder_->setBuildPoint(&block_gamma); - spv::Id color_rgb_gamma = LinearToPWLGamma(color_rgb, false); - builder_->createBranch(&block_gamma_merge); - builder_->setBuildPoint(&block_gamma_merge); + spv::OpLogicalAnd, type_bool_, depth_pass_if_equal, + builder_->createBinOp(spv::OpIEqual, type_bool_, sample_depth24, + old_depth))); + depth_passed = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, depth_passed, + builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, depth_pass_if_greater, + builder_->createBinOp(spv::OpUGreaterThan, type_bool_, + sample_depth24, old_depth))); + + // Begin the stencil test. 
+ spv::Block& block_stencil_enabled_head = *builder_->getBuildPoint(); + spv::Block& block_stencil_enabled = builder_->makeNewBlock(); + spv::Block& block_stencil_enabled_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_stencil_enabled_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(stencil_enabled, &block_stencil_enabled, + &block_stencil_enabled_merge); + builder_->setBuildPoint(&block_stencil_enabled); + + // Perform the stencil test. + // The read mask has zeros in the upper bits, applying it to the combined + // stencil and depth will remove the depth part. + spv::Id old_stencil_read_masked = builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, old_depth_stencil, stencil_read_mask); + spv::Id stencil_passed_if_enabled = builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, stencil_pass_if_less, + builder_->createBinOp(spv::OpULessThan, type_bool_, + stencil_reference_read_masked, + old_stencil_read_masked)); + stencil_passed_if_enabled = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, stencil_passed_if_enabled, + builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, stencil_pass_if_equal, + builder_->createBinOp(spv::OpIEqual, type_bool_, + stencil_reference_read_masked, + old_stencil_read_masked))); + stencil_passed_if_enabled = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, stencil_passed_if_enabled, + builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, stencil_pass_if_greater, + builder_->createBinOp(spv::OpUGreaterThan, type_bool_, + stencil_reference_read_masked, + old_stencil_read_masked))); + spv::Id stencil_op = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, stencil_func_ops, + builder_->createTriOp( + spv::OpSelect, type_uint_, stencil_passed_if_enabled, + builder_->createTriOp(spv::OpSelect, type_uint_, depth_passed, + builder_->makeUintConstant(6), + builder_->makeUintConstant(9)), + builder_->makeUintConstant(3)), + 
builder_->makeUintConstant(3)); + spv::Block& block_stencil_op_head = *builder_->getBuildPoint(); + spv::Block& block_stencil_op_keep = builder_->makeNewBlock(); + spv::Block& block_stencil_op_zero = builder_->makeNewBlock(); + spv::Block& block_stencil_op_replace = builder_->makeNewBlock(); + spv::Block& block_stencil_op_increment_clamp = builder_->makeNewBlock(); + spv::Block& block_stencil_op_decrement_clamp = builder_->makeNewBlock(); + spv::Block& block_stencil_op_invert = builder_->makeNewBlock(); + spv::Block& block_stencil_op_increment_wrap = builder_->makeNewBlock(); + spv::Block& block_stencil_op_decrement_wrap = builder_->makeNewBlock(); + spv::Block& block_stencil_op_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_stencil_op_merge.getId(), + spv::SelectionControlDontFlattenMask); { - std::unique_ptr gamma_phi_op = - std::make_unique(builder_->getUniqueId(), - type_float3_, spv::OpPhi); - gamma_phi_op->addIdOperand(color_rgb_gamma); - gamma_phi_op->addIdOperand(block_gamma.getId()); - gamma_phi_op->addIdOperand(color_rgb); - gamma_phi_op->addIdOperand(block_gamma_head.getId()); - color_rgb = gamma_phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction(std::move(gamma_phi_op)); - } - { - std::unique_ptr color_rgba_shuffle_op = - std::make_unique( - builder_->getUniqueId(), type_float4_, spv::OpVectorShuffle); - color_rgba_shuffle_op->addIdOperand(color_rgb); - color_rgba_shuffle_op->addIdOperand(color); - color_rgba_shuffle_op->addImmediateOperand(0); - color_rgba_shuffle_op->addImmediateOperand(1); - color_rgba_shuffle_op->addImmediateOperand(2); - color_rgba_shuffle_op->addImmediateOperand(3 + 3); - color = color_rgba_shuffle_op->getResultId(); + std::unique_ptr stencil_op_switch_op = + std::make_unique(spv::OpSwitch); + stencil_op_switch_op->addIdOperand(stencil_op); + // Make keep the default. 
+ stencil_op_switch_op->addIdOperand(block_stencil_op_keep.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kZero)); + stencil_op_switch_op->addIdOperand(block_stencil_op_zero.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kReplace)); + stencil_op_switch_op->addIdOperand(block_stencil_op_replace.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kIncrementClamp)); + stencil_op_switch_op->addIdOperand( + block_stencil_op_increment_clamp.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kDecrementClamp)); + stencil_op_switch_op->addIdOperand( + block_stencil_op_decrement_clamp.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kInvert)); + stencil_op_switch_op->addIdOperand(block_stencil_op_invert.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kIncrementWrap)); + stencil_op_switch_op->addIdOperand( + block_stencil_op_increment_wrap.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kDecrementWrap)); + stencil_op_switch_op->addIdOperand( + block_stencil_op_decrement_wrap.getId()); builder_->getBuildPoint()->addInstruction( - std::move(color_rgba_shuffle_op)); + std::move(stencil_op_switch_op)); + } + block_stencil_op_keep.addPredecessor(&block_stencil_op_head); + block_stencil_op_zero.addPredecessor(&block_stencil_op_head); + block_stencil_op_replace.addPredecessor(&block_stencil_op_head); + block_stencil_op_increment_clamp.addPredecessor(&block_stencil_op_head); + block_stencil_op_decrement_clamp.addPredecessor(&block_stencil_op_head); + block_stencil_op_invert.addPredecessor(&block_stencil_op_head); + block_stencil_op_increment_wrap.addPredecessor(&block_stencil_op_head); + block_stencil_op_decrement_wrap.addPredecessor(&block_stencil_op_head); + // Keep - will use the old stencil in the phi. 
+ builder_->setBuildPoint(&block_stencil_op_keep); + builder_->createBranch(&block_stencil_op_merge); + // Zero - will use the zero constant in the phi. + builder_->setBuildPoint(&block_stencil_op_zero); + builder_->createBranch(&block_stencil_op_merge); + // Replace - will use the stencil reference in the phi. + builder_->setBuildPoint(&block_stencil_op_replace); + builder_->createBranch(&block_stencil_op_merge); + // Increment and clamp. + builder_->setBuildPoint(&block_stencil_op_increment_clamp); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(builder_->makeUintConstant(UINT8_MAX - 1)); + id_vector_temp_.push_back( + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, old_depth_stencil, + builder_->makeUintConstant(UINT8_MAX))); + spv::Id new_stencil_in_low_bits_increment_clamp = builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBuiltinCall(type_uint_, ext_inst_glsl_std_450_, + GLSLstd450UMin, id_vector_temp_), + const_uint_1); + builder_->createBranch(&block_stencil_op_merge); + // Decrement and clamp. + builder_->setBuildPoint(&block_stencil_op_decrement_clamp); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(const_uint_1); + id_vector_temp_.push_back( + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, old_depth_stencil, + builder_->makeUintConstant(UINT8_MAX))); + spv::Id new_stencil_in_low_bits_decrement_clamp = builder_->createBinOp( + spv::OpISub, type_uint_, + builder_->createBuiltinCall(type_uint_, ext_inst_glsl_std_450_, + GLSLstd450UMax, id_vector_temp_), + const_uint_1); + builder_->createBranch(&block_stencil_op_merge); + // Invert. + builder_->setBuildPoint(&block_stencil_op_invert); + spv::Id new_stencil_in_low_bits_invert = + builder_->createUnaryOp(spv::OpNot, type_uint_, old_depth_stencil); + builder_->createBranch(&block_stencil_op_merge); + // Increment and wrap. + // The upper bits containing the old depth have no effect on the behavior. 
+ builder_->setBuildPoint(&block_stencil_op_increment_wrap); + spv::Id new_stencil_in_low_bits_increment_wrap = builder_->createBinOp( + spv::OpIAdd, type_uint_, old_depth_stencil, const_uint_1); + builder_->createBranch(&block_stencil_op_merge); + // Decrement and wrap. + // The upper bits containing the old depth have no effect on the behavior. + builder_->setBuildPoint(&block_stencil_op_decrement_wrap); + spv::Id new_stencil_in_low_bits_decrement_wrap = builder_->createBinOp( + spv::OpISub, type_uint_, old_depth_stencil, const_uint_1); + builder_->createBranch(&block_stencil_op_merge); + // Select the new stencil (with undefined data in bits starting from 8) + // based on the stencil operation. + builder_->setBuildPoint(&block_stencil_op_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 8); + id_vector_temp_.push_back(old_depth_stencil); + id_vector_temp_.push_back(block_stencil_op_keep.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_stencil_op_zero.getId()); + id_vector_temp_.push_back(stencil_reference); + id_vector_temp_.push_back(block_stencil_op_replace.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_increment_clamp); + id_vector_temp_.push_back(block_stencil_op_increment_clamp.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_decrement_clamp); + id_vector_temp_.push_back(block_stencil_op_decrement_clamp.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_invert); + id_vector_temp_.push_back(block_stencil_op_invert.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_increment_wrap); + id_vector_temp_.push_back(block_stencil_op_increment_wrap.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_decrement_wrap); + id_vector_temp_.push_back(block_stencil_op_decrement_wrap.getId()); + spv::Id new_stencil_in_low_bits_if_enabled = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + // Merge the old depth / stencil (old depth kept from the old 
depth / + // stencil so the separate old depth register is not needed anymore after + // the depth test) and the new stencil based on the write mask. + spv::Id new_stencil_and_old_depth_if_stencil_enabled = + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + old_depth_stencil, stencil_write_keep_mask), + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + new_stencil_in_low_bits_if_enabled, + stencil_write_mask)); + + // Choose the result based on whether the stencil test was done. + // All phi operations must be the first in the block. + builder_->createBranch(&block_stencil_enabled_merge); + spv::Block& block_stencil_enabled_end = *builder_->getBuildPoint(); + builder_->setBuildPoint(&block_stencil_enabled_merge); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.clear(); + id_vector_temp_.push_back(stencil_passed_if_enabled); + id_vector_temp_.push_back(block_stencil_enabled_end.getId()); + id_vector_temp_.push_back(builder_->makeBoolConstant(true)); + id_vector_temp_.push_back(block_stencil_enabled_head.getId()); + spv::Id stencil_passed = + builder_->createOp(spv::OpPhi, type_bool_, id_vector_temp_); + id_vector_temp_.clear(); + id_vector_temp_.push_back(new_stencil_and_old_depth_if_stencil_enabled); + id_vector_temp_.push_back(block_stencil_enabled_end.getId()); + id_vector_temp_.push_back(old_depth_stencil); + id_vector_temp_.push_back(block_stencil_enabled_head.getId()); + spv::Id new_stencil_and_old_depth = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + + // Check whether the tests have passed, and exclude the bit from the + // coverage if not. 
+ spv::Id depth_stencil_passed = builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, depth_passed, stencil_passed); + spv::Id new_sample_mask_after_sample = builder_->createTriOp( + spv::OpSelect, type_uint_, depth_stencil_passed, new_sample_mask, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, new_sample_mask, + builder_->makeUintConstant(~(uint32_t(1) << i)))); + + // Combine the new depth and the new stencil taking into account whether the + // new depth should be written. + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(new_stencil_and_old_depth); + id_vector_temp_.push_back(sample_depth24); + id_vector_temp_.push_back(const_uint_8); + id_vector_temp_.push_back(builder_->makeUintConstant(24)); + spv::Id new_stencil_and_unconditional_new_depth = + builder_->createOp(spv::OpBitFieldInsert, type_uint_, id_vector_temp_); + spv::Id new_depth_stencil = builder_->createTriOp( + spv::OpSelect, type_uint_, + builder_->createBinOp(spv::OpLogicalAnd, type_bool_, + depth_stencil_passed, depth_write), + new_stencil_and_unconditional_new_depth, new_stencil_and_old_depth); + + // Write (or defer writing if the test is early, but may discard samples + // later still) the new depth and stencil if they're different. 
+ spv::Id new_depth_stencil_different = builder_->createBinOp( + spv::OpINotEqual, type_bool_, new_depth_stencil, old_depth_stencil); + spv::Id new_depth_stencil_write_condition = spv::NoResult; + if (is_early) { + if (implicit_early_z_write_allowed) { + new_sample_mask_after_sample = builder_->createTriOp( + spv::OpSelect, type_uint_, + builder_->createBinOp(spv::OpLogicalAnd, type_bool_, + new_depth_stencil_different, not_early_write), + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, new_sample_mask_after_sample, + builder_->makeUintConstant(uint32_t(1) << (4 + i))), + new_sample_mask_after_sample); + new_depth_stencil_write_condition = + builder_->createBinOp(spv::OpLogicalAnd, type_bool_, + new_depth_stencil_different, early_write); + } else { + // Always need to write late in this shader, as it may do something like + // explicitly killing pixels. + new_sample_mask_after_sample = builder_->createTriOp( + spv::OpSelect, type_uint_, new_depth_stencil_different, + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, new_sample_mask_after_sample, + builder_->makeUintConstant(uint32_t(1) << (4 + i))), + new_sample_mask_after_sample); + } + } else { + new_depth_stencil_write_condition = new_depth_stencil_different; + } + if (new_depth_stencil_write_condition != spv::NoResult) { + spv::Block& block_depth_stencil_write = builder_->makeNewBlock(); + spv::Block& block_depth_stencil_write_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_depth_stencil_write_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(new_depth_stencil_write_condition, + &block_depth_stencil_write, + &block_depth_stencil_write_merge); + builder_->setBuildPoint(&block_depth_stencil_write); + builder_->createStore(new_depth_stencil, sample_access_chain); + builder_->createBranch(&block_depth_stencil_write_merge); + builder_->setBuildPoint(&block_depth_stencil_write_merge); } - builder_->createStore(color, color_variable); + 
builder_->createBranch(&block_sample_covered_merge); + spv::Block& block_sample_covered_end = *builder_->getBuildPoint(); + builder_->setBuildPoint(&block_sample_covered_merge); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.clear(); + id_vector_temp_.push_back(new_sample_mask_after_sample); + id_vector_temp_.push_back(block_sample_covered_end.getId()); + id_vector_temp_.push_back(new_sample_mask); + id_vector_temp_.push_back(block_sample_covered_head.getId()); + new_sample_mask = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + if (is_early) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(new_depth_stencil); + id_vector_temp_.push_back(block_sample_covered_end.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_sample_covered_head.getId()); + late_write_depth_stencil[i] = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + } } + + // Close the conditionals for whether depth / stencil testing is needed. + if (block_any_sample_covered_merge) { + builder_->createBranch(block_any_sample_covered_merge); + spv::Block& block_any_sample_covered_end = *builder_->getBuildPoint(); + builder_->setBuildPoint(block_any_sample_covered_merge); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.clear(); + id_vector_temp_.push_back(new_sample_mask); + id_vector_temp_.push_back(block_any_sample_covered_end.getId()); + id_vector_temp_.push_back(main_fsi_sample_mask_); + id_vector_temp_.push_back(block_any_sample_covered_head->getId()); + new_sample_mask = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + if (is_early) { + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(late_write_depth_stencil[i]); + id_vector_temp_.push_back(block_any_sample_covered_end.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_any_sample_covered_head->getId()); + late_write_depth_stencil[i] = + builder_->createOp(spv::OpPhi, 
type_uint_, id_vector_temp_); + } + } + } + builder_->createBranch(&block_depth_stencil_enabled_merge); + spv::Block& block_depth_stencil_enabled_end = *builder_->getBuildPoint(); + builder_->setBuildPoint(&block_depth_stencil_enabled_merge); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.clear(); + id_vector_temp_.push_back(new_sample_mask); + id_vector_temp_.push_back(block_depth_stencil_enabled_end.getId()); + id_vector_temp_.push_back(main_fsi_sample_mask_); + id_vector_temp_.push_back(block_depth_stencil_enabled_head.getId()); + main_fsi_sample_mask_ = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + if (is_early) { + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(late_write_depth_stencil[i]); + id_vector_temp_.push_back(block_depth_stencil_enabled_end.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_depth_stencil_enabled_head.getId()); + main_fsi_late_write_depth_stencil_[i] = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + } + } +} + +std::array SpirvShaderTranslator::FSI_ClampAndPackColor( + spv::Id color_float4, spv::Id format_with_flags) { + spv::Block& block_format_head = *builder_->getBuildPoint(); + spv::Block& block_format_8_8_8_8 = builder_->makeNewBlock(); + spv::Block& block_format_8_8_8_8_gamma = builder_->makeNewBlock(); + spv::Block& block_format_2_10_10_10 = builder_->makeNewBlock(); + spv::Block& block_format_2_10_10_10_float = builder_->makeNewBlock(); + spv::Block& block_format_16 = builder_->makeNewBlock(); + spv::Block& block_format_16_float = builder_->makeNewBlock(); + spv::Block& block_format_32_float = builder_->makeNewBlock(); + spv::Block& block_format_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_format_merge.getId()); + { + std::unique_ptr format_switch_op = + std::make_unique(spv::OpSwitch); + format_switch_op->addIdOperand(format_with_flags); + // Make k_32_FLOAT or k_32_32_FLOAT the 
default. + format_switch_op->addIdOperand(block_format_32_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_8_8_8_8))); + format_switch_op->addIdOperand(block_format_8_8_8_8.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA))); + format_switch_op->addIdOperand(block_format_8_8_8_8_gamma.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10))); + format_switch_op->addIdOperand(block_format_2_10_10_10.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10))); + format_switch_op->addIdOperand(block_format_2_10_10_10.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT))); + format_switch_op->addIdOperand(block_format_2_10_10_10_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat :: + k_2_10_10_10_FLOAT_AS_16_16_16_16))); + format_switch_op->addIdOperand(block_format_2_10_10_10_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16))); + format_switch_op->addIdOperand(block_format_16.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16_16_16))); + format_switch_op->addIdOperand(block_format_16.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16_FLOAT))); + 
format_switch_op->addIdOperand(block_format_16_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT))); + format_switch_op->addIdOperand(block_format_16_float.getId()); + builder_->getBuildPoint()->addInstruction(std::move(format_switch_op)); + } + block_format_8_8_8_8.addPredecessor(&block_format_head); + block_format_8_8_8_8_gamma.addPredecessor(&block_format_head); + block_format_2_10_10_10.addPredecessor(&block_format_head); + block_format_2_10_10_10_float.addPredecessor(&block_format_head); + block_format_16.addPredecessor(&block_format_head); + block_format_16_float.addPredecessor(&block_format_head); + block_format_32_float.addPredecessor(&block_format_head); + + spv::Id unorm_round_offset_float = builder_->makeFloatConstant(0.5f); + id_vector_temp_.clear(); + id_vector_temp_.resize(4, unorm_round_offset_float); + spv::Id unorm_round_offset_float4 = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + + // *************************************************************************** + // k_8_8_8_8 + // *************************************************************************** + spv::Id packed_8_8_8_8; + { + builder_->setBuildPoint(&block_format_8_8_8_8); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back(color_float4); + id_vector_temp_.push_back(const_float4_0_); + id_vector_temp_.push_back(const_float4_1_); + spv::Id color_scaled = builder_->createBinOp( + spv::OpVectorTimesScalar, type_float4_, + builder_->createBuiltinCall(type_float4_, ext_inst_glsl_std_450_, + GLSLstd450NClamp, id_vector_temp_), + builder_->makeFloatConstant(255.0f)); + builder_->addDecoration(color_scaled, spv::DecorationNoContraction); + spv::Id color_offset = builder_->createBinOp( + spv::OpFAdd, type_float4_, color_scaled, unorm_round_offset_float4); + builder_->addDecoration(color_offset, spv::DecorationNoContraction); + 
spv::Id color_uint4 = + builder_->createUnaryOp(spv::OpConvertFToU, type_uint4_, color_offset); + packed_8_8_8_8 = + builder_->createCompositeExtract(color_uint4, type_uint_, 0); + spv::Id component_width = builder_->makeUintConstant(8); + for (uint32_t i = 1; i < 4; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(packed_8_8_8_8); + id_vector_temp_.push_back( + builder_->createCompositeExtract(color_uint4, type_uint_, i)); + id_vector_temp_.push_back(builder_->makeUintConstant(8 * i)); + id_vector_temp_.push_back(component_width); + packed_8_8_8_8 = builder_->createOp(spv::OpBitFieldInsert, type_uint_, + id_vector_temp_); + } + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_8_8_8_8_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // k_8_8_8_8_GAMMA + // *************************************************************************** + spv::Id packed_8_8_8_8_gamma; + { + builder_->setBuildPoint(&block_format_8_8_8_8_gamma); + uint_vector_temp_.clear(); + uint_vector_temp_.reserve(3); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + uint_vector_temp_.push_back(2); + spv::Id color_rgb = builder_->createRvalueSwizzle( + spv::NoPrecision, type_float3_, color_float4, uint_vector_temp_); + spv::Id rgb_gamma = LinearToPWLGamma( + builder_->createRvalueSwizzle(spv::NoPrecision, type_float3_, + color_float4, uint_vector_temp_), + false); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back( + builder_->createCompositeExtract(color_float4, type_float_, 3)); + id_vector_temp_.push_back(const_float_0_); + id_vector_temp_.push_back(const_float_1_); + spv::Id alpha_clamped = builder_->createBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450NClamp, id_vector_temp_); + // Bypass the `getNumTypeConstituents(typeId) == (int)constituents.size()` + // assertion in 
createCompositeConstruct, OpCompositeConstruct can + // construct vectors not only from scalars, but also from other vectors. + spv::Id color_gamma; + { + std::unique_ptr color_gamma_composite_construct_op = + std::make_unique( + builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct); + color_gamma_composite_construct_op->addIdOperand(rgb_gamma); + color_gamma_composite_construct_op->addIdOperand(alpha_clamped); + color_gamma = color_gamma_composite_construct_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(color_gamma_composite_construct_op)); + } + spv::Id color_scaled = + builder_->createBinOp(spv::OpVectorTimesScalar, type_float4_, + color_gamma, builder_->makeFloatConstant(255.0f)); + builder_->addDecoration(color_scaled, spv::DecorationNoContraction); + spv::Id color_offset = builder_->createBinOp( + spv::OpFAdd, type_float4_, color_scaled, unorm_round_offset_float4); + builder_->addDecoration(color_offset, spv::DecorationNoContraction); + spv::Id color_uint4 = + builder_->createUnaryOp(spv::OpConvertFToU, type_uint4_, color_offset); + packed_8_8_8_8_gamma = + builder_->createCompositeExtract(color_uint4, type_uint_, 0); + spv::Id component_width = builder_->makeUintConstant(8); + for (uint32_t i = 1; i < 4; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(packed_8_8_8_8_gamma); + id_vector_temp_.push_back( + builder_->createCompositeExtract(color_uint4, type_uint_, i)); + id_vector_temp_.push_back(builder_->makeUintConstant(8 * i)); + id_vector_temp_.push_back(component_width); + packed_8_8_8_8_gamma = builder_->createOp(spv::OpBitFieldInsert, + type_uint_, id_vector_temp_); + } + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_8_8_8_8_gamma_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // k_2_10_10_10 + // k_2_10_10_10_AS_10_10_10_10 + // 
*************************************************************************** + spv::Id packed_2_10_10_10; + { + builder_->setBuildPoint(&block_format_2_10_10_10); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back(color_float4); + id_vector_temp_.push_back(const_float4_0_); + id_vector_temp_.push_back(const_float4_1_); + spv::Id color_clamped = + builder_->createBuiltinCall(type_float4_, ext_inst_glsl_std_450_, + GLSLstd450NClamp, id_vector_temp_); + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.resize(3, builder_->makeFloatConstant(1023.0f)); + id_vector_temp_.push_back(builder_->makeFloatConstant(3.0f)); + spv::Id color_scaled = builder_->createBinOp( + spv::OpFMul, type_float4_, color_clamped, + builder_->makeCompositeConstant(type_float4_, id_vector_temp_)); + builder_->addDecoration(color_scaled, spv::DecorationNoContraction); + spv::Id color_offset = builder_->createBinOp( + spv::OpFAdd, type_float4_, color_scaled, unorm_round_offset_float4); + builder_->addDecoration(color_offset, spv::DecorationNoContraction); + spv::Id color_uint4 = + builder_->createUnaryOp(spv::OpConvertFToU, type_uint4_, color_offset); + packed_2_10_10_10 = + builder_->createCompositeExtract(color_uint4, type_uint_, 0); + spv::Id rgb_width = builder_->makeUintConstant(10); + spv::Id alpha_width = builder_->makeUintConstant(2); + for (uint32_t i = 1; i < 4; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(packed_2_10_10_10); + id_vector_temp_.push_back( + builder_->createCompositeExtract(color_uint4, type_uint_, i)); + id_vector_temp_.push_back(builder_->makeUintConstant(10 * i)); + id_vector_temp_.push_back(i == 3 ? 
alpha_width : rgb_width); + packed_2_10_10_10 = builder_->createOp(spv::OpBitFieldInsert, type_uint_, + id_vector_temp_); + } + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_2_10_10_10_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // k_2_10_10_10_FLOAT + // k_2_10_10_10_FLOAT_AS_16_16_16_16 + // *************************************************************************** + spv::Id packed_2_10_10_10_float; + { + builder_->setBuildPoint(&block_format_2_10_10_10_float); + std::array color_components; + // RGB. + for (uint32_t i = 0; i < 3; ++i) { + color_components[i] = UnclampedFloat32To7e3( + *builder_, + builder_->createCompositeExtract(color_float4, type_float_, i), + ext_inst_glsl_std_450_); + } + // Alpha. + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back( + builder_->createCompositeExtract(color_float4, type_float_, 3)); + id_vector_temp_.push_back(const_float_0_); + id_vector_temp_.push_back(const_float_1_); + spv::Id alpha_scaled = builder_->createBinOp( + spv::OpFMul, type_float_, + builder_->createBuiltinCall(type_float_, ext_inst_glsl_std_450_, + GLSLstd450NClamp, id_vector_temp_), + builder_->makeFloatConstant(3.0f)); + builder_->addDecoration(alpha_scaled, spv::DecorationNoContraction); + spv::Id alpha_offset = builder_->createBinOp( + spv::OpFAdd, type_float_, alpha_scaled, unorm_round_offset_float); + builder_->addDecoration(alpha_offset, spv::DecorationNoContraction); + color_components[3] = + builder_->createUnaryOp(spv::OpConvertFToU, type_uint_, alpha_offset); + // Pack. 
+ packed_2_10_10_10_float = color_components[0]; + spv::Id rgb_width = builder_->makeUintConstant(10); + for (uint32_t i = 1; i < 3; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(packed_2_10_10_10_float); + id_vector_temp_.push_back(color_components[i]); + id_vector_temp_.push_back(builder_->makeUintConstant(10 * i)); + id_vector_temp_.push_back(rgb_width); + packed_2_10_10_10_float = builder_->createOp(spv::OpBitFieldInsert, + type_uint_, id_vector_temp_); + } + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(packed_2_10_10_10_float); + id_vector_temp_.push_back(color_components[3]); + id_vector_temp_.push_back(builder_->makeUintConstant(30)); + id_vector_temp_.push_back(builder_->makeUintConstant(2)); + packed_2_10_10_10_float = + builder_->createOp(spv::OpBitFieldInsert, type_uint_, id_vector_temp_); + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_2_10_10_10_float_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // k_16_16 + // k_16_16_16_16 + // *************************************************************************** + std::array packed_16; + { + builder_->setBuildPoint(&block_format_16); + id_vector_temp_.clear(); + id_vector_temp_.resize(4, builder_->makeFloatConstant(-32.0f)); + spv::Id const_float4_minus_32 = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + id_vector_temp_.clear(); + id_vector_temp_.resize(4, builder_->makeFloatConstant(32.0f)); + spv::Id const_float4_32 = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + // NaN to 0, not to -32. 
+ id_vector_temp_.push_back(builder_->createTriOp( + spv::OpSelect, type_float4_, + builder_->createUnaryOp(spv::OpIsNan, type_bool4_, color_float4), + const_float4_0_, color_float4)); + id_vector_temp_.push_back(const_float4_minus_32); + id_vector_temp_.push_back(const_float4_32); + spv::Id color_scaled = builder_->createBinOp( + spv::OpVectorTimesScalar, type_float4_, + builder_->createBuiltinCall(type_float4_, ext_inst_glsl_std_450_, + GLSLstd450FClamp, id_vector_temp_), + builder_->makeFloatConstant(32767.0f / 32.0f)); + builder_->addDecoration(color_scaled, spv::DecorationNoContraction); + id_vector_temp_.clear(); + id_vector_temp_.resize(4, builder_->makeFloatConstant(-0.5f)); + spv::Id unorm_round_offset_negative_float4 = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + spv::Id color_offset = builder_->createBinOp( + spv::OpFAdd, type_float4_, color_scaled, + builder_->createTriOp( + spv::OpSelect, type_float4_, + builder_->createBinOp(spv::OpFOrdLessThan, type_bool4_, + color_scaled, const_float4_0_), + unorm_round_offset_negative_float4, unorm_round_offset_float4)); + builder_->addDecoration(color_offset, spv::DecorationNoContraction); + spv::Id color_uint4 = builder_->createUnaryOp( + spv::OpBitcast, type_uint4_, + builder_->createUnaryOp(spv::OpConvertFToS, type_int4_, color_offset)); + spv::Id component_offset_width = builder_->makeUintConstant(16); + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back( + builder_->createCompositeExtract(color_uint4, type_uint_, 2 * i)); + id_vector_temp_.push_back( + builder_->createCompositeExtract(color_uint4, type_uint_, 2 * i + 1)); + id_vector_temp_.push_back(component_offset_width); + id_vector_temp_.push_back(component_offset_width); + packed_16[i] = builder_->createOp(spv::OpBitFieldInsert, type_uint_, + id_vector_temp_); + } + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_16_end = 
*builder_->getBuildPoint(); + + // *************************************************************************** + // k_16_16_FLOAT + // k_16_16_16_16_FLOAT + // *************************************************************************** + std::array packed_16_float; + { + builder_->setBuildPoint(&block_format_16_float); + // TODO(Triang3l): Xenos extended-range float16. + id_vector_temp_.clear(); + id_vector_temp_.resize(4, builder_->makeFloatConstant(-65504.0f)); + spv::Id const_float4_minus_float16_max = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + id_vector_temp_.clear(); + id_vector_temp_.resize(4, builder_->makeFloatConstant(65504.0f)); + spv::Id const_float4_float16_max = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + // NaN to 0, not to -max. + id_vector_temp_.push_back(builder_->createTriOp( + spv::OpSelect, type_float4_, + builder_->createUnaryOp(spv::OpIsNan, type_bool4_, color_float4), + const_float4_0_, color_float4)); + id_vector_temp_.push_back(const_float4_minus_float16_max); + id_vector_temp_.push_back(const_float4_float16_max); + spv::Id color_clamped = + builder_->createBuiltinCall(type_float4_, ext_inst_glsl_std_450_, + GLSLstd450FClamp, id_vector_temp_); + for (uint32_t i = 0; i < 2; ++i) { + uint_vector_temp_.clear(); + uint_vector_temp_.reserve(2); + uint_vector_temp_.push_back(2 * i); + uint_vector_temp_.push_back(2 * i + 1); + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->createRvalueSwizzle( + spv::NoPrecision, type_float2_, color_clamped, uint_vector_temp_)); + packed_16_float[i] = + builder_->createBuiltinCall(type_uint_, ext_inst_glsl_std_450_, + GLSLstd450PackHalf2x16, id_vector_temp_); + } + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_16_float_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // k_32_FLOAT + // 
k_32_32_FLOAT + // *************************************************************************** + std::array packed_32_float; + { + builder_->setBuildPoint(&block_format_32_float); + for (uint32_t i = 0; i < 2; ++i) { + packed_32_float[i] = builder_->createUnaryOp( + spv::OpBitcast, type_uint_, + builder_->createCompositeExtract(color_float4, type_float_, i)); + } + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_32_float_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // Selection of the result depending on the format. + // *************************************************************************** + + builder_->setBuildPoint(&block_format_merge); + std::array packed; + id_vector_temp_.reserve(2 * 7); + // Low 32 bits. + id_vector_temp_.clear(); + id_vector_temp_.push_back(packed_8_8_8_8); + id_vector_temp_.push_back(block_format_8_8_8_8_end.getId()); + id_vector_temp_.push_back(packed_8_8_8_8_gamma); + id_vector_temp_.push_back(block_format_8_8_8_8_gamma_end.getId()); + id_vector_temp_.push_back(packed_2_10_10_10); + id_vector_temp_.push_back(block_format_2_10_10_10_end.getId()); + id_vector_temp_.push_back(packed_2_10_10_10_float); + id_vector_temp_.push_back(block_format_2_10_10_10_float_end.getId()); + id_vector_temp_.push_back(packed_16[0]); + id_vector_temp_.push_back(block_format_16_end.getId()); + id_vector_temp_.push_back(packed_16_float[0]); + id_vector_temp_.push_back(block_format_16_float_end.getId()); + id_vector_temp_.push_back(packed_32_float[0]); + id_vector_temp_.push_back(block_format_32_float_end.getId()); + packed[0] = builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + // High 32 bits. 
+ id_vector_temp_.clear(); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_format_8_8_8_8_end.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_format_8_8_8_8_gamma_end.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_format_2_10_10_10_end.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_format_2_10_10_10_float_end.getId()); + id_vector_temp_.push_back(packed_16[1]); + id_vector_temp_.push_back(block_format_16_end.getId()); + id_vector_temp_.push_back(packed_16_float[1]); + id_vector_temp_.push_back(block_format_16_float_end.getId()); + id_vector_temp_.push_back(packed_32_float[1]); + id_vector_temp_.push_back(block_format_32_float_end.getId()); + packed[1] = builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + return packed; +} + +std::array SpirvShaderTranslator::FSI_UnpackColor( + std::array color_packed, spv::Id format_with_flags) { + spv::Block& block_format_head = *builder_->getBuildPoint(); + spv::Block& block_format_8_8_8_8 = builder_->makeNewBlock(); + spv::Block& block_format_8_8_8_8_gamma = builder_->makeNewBlock(); + spv::Block& block_format_2_10_10_10 = builder_->makeNewBlock(); + spv::Block& block_format_2_10_10_10_float = builder_->makeNewBlock(); + spv::Block& block_format_16_16 = builder_->makeNewBlock(); + spv::Block& block_format_16_16_16_16 = builder_->makeNewBlock(); + spv::Block& block_format_16_16_float = builder_->makeNewBlock(); + spv::Block& block_format_16_16_16_16_float = builder_->makeNewBlock(); + spv::Block& block_format_32_float = builder_->makeNewBlock(); + spv::Block& block_format_32_32_float = builder_->makeNewBlock(); + spv::Block& block_format_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_format_merge.getId()); + { + std::unique_ptr format_switch_op = + std::make_unique(spv::OpSwitch); + format_switch_op->addIdOperand(format_with_flags); + // Make 
k_32_FLOAT the default. + format_switch_op->addIdOperand(block_format_32_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_8_8_8_8))); + format_switch_op->addIdOperand(block_format_8_8_8_8.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA))); + format_switch_op->addIdOperand(block_format_8_8_8_8_gamma.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10))); + format_switch_op->addIdOperand(block_format_2_10_10_10.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10))); + format_switch_op->addIdOperand(block_format_2_10_10_10.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT))); + format_switch_op->addIdOperand(block_format_2_10_10_10_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat :: + k_2_10_10_10_FLOAT_AS_16_16_16_16))); + format_switch_op->addIdOperand(block_format_2_10_10_10_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16))); + format_switch_op->addIdOperand(block_format_16_16.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16_16_16))); + format_switch_op->addIdOperand(block_format_16_16_16_16.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16_FLOAT))); + 
format_switch_op->addIdOperand(block_format_16_16_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT))); + format_switch_op->addIdOperand(block_format_16_16_16_16_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_32_32_FLOAT))); + format_switch_op->addIdOperand(block_format_32_32_float.getId()); + builder_->getBuildPoint()->addInstruction(std::move(format_switch_op)); + } + block_format_8_8_8_8.addPredecessor(&block_format_head); + block_format_8_8_8_8_gamma.addPredecessor(&block_format_head); + block_format_2_10_10_10.addPredecessor(&block_format_head); + block_format_2_10_10_10_float.addPredecessor(&block_format_head); + block_format_16_16.addPredecessor(&block_format_head); + block_format_16_16_16_16.addPredecessor(&block_format_head); + block_format_16_16_float.addPredecessor(&block_format_head); + block_format_16_16_16_16_float.addPredecessor(&block_format_head); + block_format_32_float.addPredecessor(&block_format_head); + block_format_32_32_float.addPredecessor(&block_format_head); + + // *************************************************************************** + // k_8_8_8_8 + // k_8_8_8_8_GAMMA + // *************************************************************************** + + std::array, 2> unpacked_8_8_8_8_and_gamma; + std::array block_format_8_8_8_8_and_gamma_end; + { + spv::Id component_width = builder_->makeUintConstant(8); + spv::Id component_scale = builder_->makeFloatConstant(1.0f / 255.0f); + for (uint32_t i = 0; i < 2; ++i) { + builder_->setBuildPoint(i ? 
&block_format_8_8_8_8_gamma + : &block_format_8_8_8_8); + for (uint32_t j = 0; j < 4; ++j) { + spv::Id component = builder_->createBinOp( + spv::OpFMul, type_float_, + builder_->createUnaryOp( + spv::OpConvertUToF, type_float_, + builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, color_packed[0], + builder_->makeUintConstant(8 * j), component_width)), + component_scale); + builder_->addDecoration(component, spv::DecorationNoContraction); + if (i && j <= 2) { + component = PWLGammaToLinear(component, true); + } + unpacked_8_8_8_8_and_gamma[i][j] = component; + } + builder_->createBranch(&block_format_merge); + block_format_8_8_8_8_and_gamma_end[i] = builder_->getBuildPoint(); + } + } + + // *************************************************************************** + // k_2_10_10_10 + // k_2_10_10_10_AS_10_10_10_10 + // *************************************************************************** + + std::array unpacked_2_10_10_10; + { + builder_->setBuildPoint(&block_format_2_10_10_10); + spv::Id rgb_width = builder_->makeUintConstant(10); + spv::Id alpha_width = builder_->makeUintConstant(2); + spv::Id rgb_scale = builder_->makeFloatConstant(1.0f / 1023.0f); + spv::Id alpha_scale = builder_->makeFloatConstant(1.0f / 3.0f); + for (uint32_t i = 0; i < 4; ++i) { + spv::Id component = builder_->createBinOp( + spv::OpFMul, type_float_, + builder_->createUnaryOp( + spv::OpConvertUToF, type_float_, + builder_->createTriOp(spv::OpBitFieldUExtract, type_uint_, + color_packed[0], + builder_->makeUintConstant(10 * i), + i == 3 ? alpha_width : rgb_width)), + i == 3 ? 
alpha_scale : rgb_scale); + builder_->addDecoration(component, spv::DecorationNoContraction); + unpacked_2_10_10_10[i] = component; + } + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_2_10_10_10_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // k_2_10_10_10_FLOAT + // k_2_10_10_10_FLOAT_AS_16_16_16_16 + // *************************************************************************** + + std::array unpacked_2_10_10_10_float; + { + builder_->setBuildPoint(&block_format_2_10_10_10_float); + spv::Id rgb_width = builder_->makeUintConstant(10); + for (uint32_t i = 0; i < 3; ++i) { + unpacked_2_10_10_10_float[i] = + Float7e3To32(*builder_, + builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, color_packed[0], + builder_->makeUintConstant(10 * i), rgb_width), + 0, false, ext_inst_glsl_std_450_); + } + spv::Id alpha = builder_->createBinOp( + spv::OpFMul, type_float_, + builder_->createUnaryOp( + spv::OpConvertUToF, type_float_, + builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, color_packed[0], + builder_->makeUintConstant(30), builder_->makeUintConstant(2))), + builder_->makeFloatConstant(1.0f / 3.0f)); + builder_->addDecoration(alpha, spv::DecorationNoContraction); + unpacked_2_10_10_10_float[3] = alpha; + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_2_10_10_10_float_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // k_16_16 + // k_16_16_16_16 + // *************************************************************************** + + std::array, 2> unpacked_16; + unpacked_16[0][2] = const_float_0_; + unpacked_16[0][3] = const_float_1_; + std::array block_format_16_end; + { + spv::Id component_width = builder_->makeUintConstant(16); + spv::Id component_scale = builder_->makeFloatConstant(32.0f / 32767.0f); + spv::Id component_min = 
builder_->makeFloatConstant(-1.0f); + for (uint32_t i = 0; i < 2; ++i) { + builder_->setBuildPoint(i ? &block_format_16_16_16_16 + : &block_format_16_16); + std::array color_packed_signed; + for (uint32_t j = 0; j <= i; ++j) { + color_packed_signed[j] = + builder_->createUnaryOp(spv::OpBitcast, type_int_, color_packed[j]); + } + for (uint32_t j = 0; j < uint32_t(i ? 4 : 2); ++j) { + spv::Id component = builder_->createBinOp( + spv::OpFMul, type_float_, + builder_->createUnaryOp( + spv::OpConvertSToF, type_float_, + builder_->createTriOp(spv::OpBitFieldSExtract, type_int_, + color_packed_signed[j >> 1], + builder_->makeUintConstant(16 * (j & 1)), + component_width)), + component_scale); + builder_->addDecoration(component, spv::DecorationNoContraction); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(component_min); + id_vector_temp_.push_back(component); + component = + builder_->createBuiltinCall(type_float_, ext_inst_glsl_std_450_, + GLSLstd450FMax, id_vector_temp_); + unpacked_16[i][j] = component; + } + builder_->createBranch(&block_format_merge); + block_format_16_end[i] = builder_->getBuildPoint(); + } + } + + // *************************************************************************** + // k_16_16_FLOAT + // k_16_16_16_16_FLOAT + // *************************************************************************** + + std::array, 2> unpacked_16_float; + unpacked_16_float[0][2] = const_float_0_; + unpacked_16_float[0][3] = const_float_1_; + std::array block_format_16_float_end; + { + for (uint32_t i = 0; i < 2; ++i) { + builder_->setBuildPoint(i ? &block_format_16_16_16_16_float + : &block_format_16_16_float); + // TODO(Triang3l): Xenos extended-range float16. 
+ for (uint32_t j = 0; j <= i; ++j) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(color_packed[j]); + spv::Id components_float2 = builder_->createBuiltinCall( + type_float2_, ext_inst_glsl_std_450_, GLSLstd450UnpackHalf2x16, + id_vector_temp_); + for (uint32_t k = 0; k < 2; ++k) { + unpacked_16_float[i][2 * j + k] = builder_->createCompositeExtract( + components_float2, type_float_, k); + } + } + builder_->createBranch(&block_format_merge); + block_format_16_float_end[i] = builder_->getBuildPoint(); + } + } + + // *************************************************************************** + // k_32_FLOAT + // k_32_32_FLOAT + // *************************************************************************** + + std::array, 2> unpacked_32_float; + unpacked_32_float[0][1] = const_float_0_; + unpacked_32_float[0][2] = const_float_0_; + unpacked_32_float[0][3] = const_float_1_; + unpacked_32_float[1][2] = const_float_0_; + unpacked_32_float[1][3] = const_float_1_; + std::array block_format_32_float_end; + { + for (uint32_t i = 0; i < 2; ++i) { + builder_->setBuildPoint(i ? &block_format_32_32_float + : &block_format_32_float); + for (uint32_t j = 0; j <= i; ++j) { + unpacked_32_float[i][j] = builder_->createUnaryOp( + spv::OpBitcast, type_float_, color_packed[j]); + } + builder_->createBranch(&block_format_merge); + block_format_32_float_end[i] = builder_->getBuildPoint(); + } + } + + // *************************************************************************** + // Selection of the result depending on the format. 
+ // *************************************************************************** + + builder_->setBuildPoint(&block_format_merge); + std::array unpacked; + id_vector_temp_.reserve(2 * 10); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(unpacked_8_8_8_8_and_gamma[0][i]); + id_vector_temp_.push_back(block_format_8_8_8_8_and_gamma_end[0]->getId()); + id_vector_temp_.push_back(unpacked_8_8_8_8_and_gamma[1][i]); + id_vector_temp_.push_back(block_format_8_8_8_8_and_gamma_end[1]->getId()); + id_vector_temp_.push_back(unpacked_2_10_10_10[i]); + id_vector_temp_.push_back(block_format_2_10_10_10_end.getId()); + id_vector_temp_.push_back(unpacked_2_10_10_10_float[i]); + id_vector_temp_.push_back(block_format_2_10_10_10_float_end.getId()); + id_vector_temp_.push_back(unpacked_16[0][i]); + id_vector_temp_.push_back(block_format_16_end[0]->getId()); + id_vector_temp_.push_back(unpacked_16[1][i]); + id_vector_temp_.push_back(block_format_16_end[1]->getId()); + id_vector_temp_.push_back(unpacked_16_float[0][i]); + id_vector_temp_.push_back(block_format_16_float_end[0]->getId()); + id_vector_temp_.push_back(unpacked_16_float[1][i]); + id_vector_temp_.push_back(block_format_16_float_end[1]->getId()); + id_vector_temp_.push_back(unpacked_32_float[0][i]); + id_vector_temp_.push_back(block_format_32_float_end[0]->getId()); + id_vector_temp_.push_back(unpacked_32_float[1][i]); + id_vector_temp_.push_back(block_format_32_float_end[1]->getId()); + unpacked[i] = builder_->createOp(spv::OpPhi, type_float_, id_vector_temp_); + } + return unpacked; +} + +spv::Id SpirvShaderTranslator::FSI_FlushNaNClampAndInBlending( + spv::Id color_or_alpha, spv::Id is_fixed_point, spv::Id min_value, + spv::Id max_value) { + spv::Id color_or_alpha_type = builder_->getTypeId(color_or_alpha); + uint32_t component_count = + uint32_t(builder_->getNumTypeConstituents(color_or_alpha_type)); + assert_true(builder_->isScalarType(color_or_alpha_type) || + 
builder_->isVectorType(color_or_alpha_type)); + assert_true( + builder_->isFloatType(builder_->getScalarTypeId(color_or_alpha_type))); + assert_true(builder_->getTypeId(min_value) == color_or_alpha_type); + assert_true(builder_->getTypeId(max_value) == color_or_alpha_type); + + spv::Block& block_is_fixed_point_head = *builder_->getBuildPoint(); + spv::Block& block_is_fixed_point_if = builder_->makeNewBlock(); + spv::Block& block_is_fixed_point_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_is_fixed_point_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(is_fixed_point, &block_is_fixed_point_if, + &block_is_fixed_point_merge); + builder_->setBuildPoint(&block_is_fixed_point_if); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + // Flush NaN to 0 even for signed (NMax would flush it to the minimum value). + id_vector_temp_.push_back(builder_->createTriOp( + spv::OpSelect, color_or_alpha_type, + builder_->createUnaryOp(spv::OpIsNan, + type_bool_vectors_[component_count - 1], + color_or_alpha), + const_float_vectors_0_[component_count - 1], color_or_alpha)); + id_vector_temp_.push_back(min_value); + id_vector_temp_.push_back(max_value); + spv::Id color_or_alpha_clamped = + builder_->createBuiltinCall(color_or_alpha_type, ext_inst_glsl_std_450_, + GLSLstd450FClamp, id_vector_temp_); + builder_->createBranch(&block_is_fixed_point_merge); + builder_->setBuildPoint(&block_is_fixed_point_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(color_or_alpha_clamped); + id_vector_temp_.push_back(block_is_fixed_point_if.getId()); + id_vector_temp_.push_back(color_or_alpha); + id_vector_temp_.push_back(block_is_fixed_point_head.getId()); + return builder_->createOp(spv::OpPhi, color_or_alpha_type, id_vector_temp_); +} + +spv::Id SpirvShaderTranslator::FSI_ApplyColorBlendFactor( + spv::Id value, spv::Id is_fixed_point, spv::Id clamp_min_value, + spv::Id 
clamp_max_value, spv::Id factor, spv::Id source_color, + spv::Id source_alpha, spv::Id dest_color, spv::Id dest_alpha, + spv::Id constant_color, spv::Id constant_alpha) { + // If the factor is zero, don't use it in the multiplication at all, so that + // infinity and NaN are not potentially involved in the multiplication. + // Calculate the condition before the selection merge, which must be the + // penultimate instruction in the block. + spv::Id factor_not_zero = builder_->createBinOp( + spv::OpINotEqual, type_bool_, factor, + builder_->makeUintConstant(uint32_t(xenos::BlendFactor::kZero))); + spv::Block& block_not_zero_head = *builder_->getBuildPoint(); + spv::Block& block_not_zero_if = builder_->makeNewBlock(); + spv::Block& block_not_zero_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_not_zero_merge.getId()); + builder_->createConditionalBranch(factor_not_zero, &block_not_zero_if, + &block_not_zero_merge); + + // Non-zero factor case. + + builder_->setBuildPoint(&block_not_zero_if); + + spv::Block& block_factor_head = *builder_->getBuildPoint(); + spv::Block& block_factor_one = builder_->makeNewBlock(); + std::array color_factor_blocks; + std::array one_minus_color_factor_blocks; + std::array alpha_factor_blocks; + std::array one_minus_alpha_factor_blocks; + color_factor_blocks[0] = &builder_->makeNewBlock(); + one_minus_color_factor_blocks[0] = &builder_->makeNewBlock(); + alpha_factor_blocks[0] = &builder_->makeNewBlock(); + one_minus_alpha_factor_blocks[0] = &builder_->makeNewBlock(); + color_factor_blocks[1] = &builder_->makeNewBlock(); + one_minus_color_factor_blocks[1] = &builder_->makeNewBlock(); + alpha_factor_blocks[1] = &builder_->makeNewBlock(); + one_minus_alpha_factor_blocks[1] = &builder_->makeNewBlock(); + color_factor_blocks[2] = &builder_->makeNewBlock(); + one_minus_color_factor_blocks[2] = &builder_->makeNewBlock(); + alpha_factor_blocks[2] = &builder_->makeNewBlock(); + one_minus_alpha_factor_blocks[2] = 
&builder_->makeNewBlock(); + spv::Block& block_factor_source_alpha_saturate = builder_->makeNewBlock(); + spv::Block& block_factor_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_factor_merge.getId(), + spv::SelectionControlDontFlattenMask); + { + std::unique_ptr factor_switch_op = + std::make_unique(spv::OpSwitch); + factor_switch_op->addIdOperand(factor); + // Make one the default factor. + factor_switch_op->addIdOperand(block_factor_one.getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kSrcColor)); + factor_switch_op->addIdOperand(color_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusSrcColor)); + factor_switch_op->addIdOperand(one_minus_color_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kSrcAlpha)); + factor_switch_op->addIdOperand(alpha_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusSrcAlpha)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kDstColor)); + factor_switch_op->addIdOperand(color_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusDstColor)); + factor_switch_op->addIdOperand(one_minus_color_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kDstAlpha)); + factor_switch_op->addIdOperand(alpha_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusDstAlpha)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kConstantColor)); + factor_switch_op->addIdOperand(color_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + 
int32_t(xenos::BlendFactor::kOneMinusConstantColor)); + factor_switch_op->addIdOperand(one_minus_color_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kConstantAlpha)); + factor_switch_op->addIdOperand(alpha_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusConstantAlpha)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kSrcAlphaSaturate)); + factor_switch_op->addIdOperand(block_factor_source_alpha_saturate.getId()); + builder_->getBuildPoint()->addInstruction(std::move(factor_switch_op)); + } + block_factor_one.addPredecessor(&block_factor_head); + for (uint32_t i = 0; i < 3; ++i) { + color_factor_blocks[i]->addPredecessor(&block_factor_head); + one_minus_color_factor_blocks[i]->addPredecessor(&block_factor_head); + alpha_factor_blocks[i]->addPredecessor(&block_factor_head); + one_minus_alpha_factor_blocks[i]->addPredecessor(&block_factor_head); + } + block_factor_source_alpha_saturate.addPredecessor(&block_factor_head); + + // kOne + builder_->setBuildPoint(&block_factor_one); + // The result is the value itself. 
+ builder_->createBranch(&block_factor_merge); + + // k[OneMinus]Src/Dest/ConstantColor/Alpha + std::array color_factors = { + source_color, + dest_color, + constant_color, + }; + std::array alpha_factors = { + source_alpha, + dest_alpha, + constant_alpha, + }; + std::array color_factor_results; + std::array one_minus_color_factor_results; + std::array alpha_factor_results; + std::array one_minus_alpha_factor_results; + for (uint32_t i = 0; i < 3; ++i) { + spv::Id color_factor = color_factors[i]; + spv::Id alpha_factor = alpha_factors[i]; + + // kSrc/Dst/ConstantColor + { + builder_->setBuildPoint(color_factor_blocks[i]); + spv::Id result_color = + builder_->createBinOp(spv::OpFMul, type_float3_, value, color_factor); + builder_->addDecoration(result_color, spv::DecorationNoContraction); + color_factor_results[i] = result_color; + builder_->createBranch(&block_factor_merge); + } + + // kOneMinusSrc/Dst/ConstantColor + { + builder_->setBuildPoint(one_minus_color_factor_blocks[i]); + spv::Id one_minus_color_factor = builder_->createBinOp( + spv::OpFSub, type_float3_, const_float3_1_, color_factor); + builder_->addDecoration(one_minus_color_factor, + spv::DecorationNoContraction); + spv::Id result_one_minus_color = builder_->createBinOp( + spv::OpFMul, type_float3_, value, one_minus_color_factor); + builder_->addDecoration(result_one_minus_color, + spv::DecorationNoContraction); + one_minus_color_factor_results[i] = result_one_minus_color; + builder_->createBranch(&block_factor_merge); + } + + // kSrc/Dst/ConstantAlpha + { + builder_->setBuildPoint(alpha_factor_blocks[i]); + spv::Id result_alpha = builder_->createBinOp( + spv::OpVectorTimesScalar, type_float3_, value, alpha_factor); + builder_->addDecoration(result_alpha, spv::DecorationNoContraction); + alpha_factor_results[i] = result_alpha; + builder_->createBranch(&block_factor_merge); + } + + // kOneMinusSrc/Dst/ConstantAlpha + { + builder_->setBuildPoint(one_minus_alpha_factor_blocks[i]); + spv::Id 
one_minus_alpha_factor = builder_->createBinOp( + spv::OpFSub, type_float_, const_float_1_, alpha_factor); + builder_->addDecoration(one_minus_alpha_factor, + spv::DecorationNoContraction); + spv::Id result_one_minus_alpha = + builder_->createBinOp(spv::OpVectorTimesScalar, type_float3_, value, + one_minus_alpha_factor); + builder_->addDecoration(result_one_minus_alpha, + spv::DecorationNoContraction); + one_minus_alpha_factor_results[i] = result_one_minus_alpha; + builder_->createBranch(&block_factor_merge); + } + } + + // kSrcAlphaSaturate + spv::Id result_source_alpha_saturate; + { + builder_->setBuildPoint(&block_factor_source_alpha_saturate); + spv::Id one_minus_dest_alpha = builder_->createBinOp( + spv::OpFSub, type_float_, const_float_1_, dest_alpha); + builder_->addDecoration(one_minus_dest_alpha, spv::DecorationNoContraction); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(source_alpha); + id_vector_temp_.push_back(one_minus_dest_alpha); + spv::Id factor_source_alpha_saturate = builder_->createBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450NMin, id_vector_temp_); + result_source_alpha_saturate = + builder_->createBinOp(spv::OpVectorTimesScalar, type_float3_, value, + factor_source_alpha_saturate); + builder_->addDecoration(result_source_alpha_saturate, + spv::DecorationNoContraction); + builder_->createBranch(&block_factor_merge); + } + + // Select the term for the non-zero factor. 
+ builder_->setBuildPoint(&block_factor_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 14); + id_vector_temp_.push_back(value); + id_vector_temp_.push_back(block_factor_one.getId()); + for (uint32_t i = 0; i < 3; ++i) { + id_vector_temp_.push_back(color_factor_results[i]); + id_vector_temp_.push_back(color_factor_blocks[i]->getId()); + id_vector_temp_.push_back(one_minus_color_factor_results[i]); + id_vector_temp_.push_back(one_minus_color_factor_blocks[i]->getId()); + id_vector_temp_.push_back(alpha_factor_results[i]); + id_vector_temp_.push_back(alpha_factor_blocks[i]->getId()); + id_vector_temp_.push_back(one_minus_alpha_factor_results[i]); + id_vector_temp_.push_back(one_minus_alpha_factor_blocks[i]->getId()); + } + id_vector_temp_.push_back(result_source_alpha_saturate); + id_vector_temp_.push_back(block_factor_source_alpha_saturate.getId()); + spv::Id result_unclamped = + builder_->createOp(spv::OpPhi, type_float3_, id_vector_temp_); + spv::Id result = FSI_FlushNaNClampAndInBlending( + result_unclamped, is_fixed_point, clamp_min_value, clamp_max_value); + builder_->createBranch(&block_not_zero_merge); + // Get the latest block for a non-zero factor after all the control flow. + spv::Block& block_not_zero_if_end = *builder_->getBuildPoint(); + + // Make the result zero if the factor is zero. 
+ builder_->setBuildPoint(&block_not_zero_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(result); + id_vector_temp_.push_back(block_not_zero_if_end.getId()); + id_vector_temp_.push_back(const_float3_0_); + id_vector_temp_.push_back(block_not_zero_head.getId()); + return builder_->createOp(spv::OpPhi, type_float3_, id_vector_temp_); +} + +spv::Id SpirvShaderTranslator::FSI_ApplyAlphaBlendFactor( + spv::Id value, spv::Id is_fixed_point, spv::Id clamp_min_value, + spv::Id clamp_max_value, spv::Id factor, spv::Id source_alpha, + spv::Id dest_alpha, spv::Id constant_alpha) { + // If the factor is zero, don't use it in the multiplication at all, so that + // infinity and NaN are not potentially involved in the multiplication. + // Calculate the condition before the selection merge, which must be the + // penultimate instruction in the block. + spv::Id factor_not_zero = builder_->createBinOp( + spv::OpINotEqual, type_bool_, factor, + builder_->makeUintConstant(uint32_t(xenos::BlendFactor::kZero))); + spv::Block& block_not_zero_head = *builder_->getBuildPoint(); + spv::Block& block_not_zero_if = builder_->makeNewBlock(); + spv::Block& block_not_zero_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_not_zero_merge.getId()); + builder_->createConditionalBranch(factor_not_zero, &block_not_zero_if, + &block_not_zero_merge); + + // Non-zero factor case. 
+ + builder_->setBuildPoint(&block_not_zero_if); + + spv::Block& block_factor_head = *builder_->getBuildPoint(); + spv::Block& block_factor_one = builder_->makeNewBlock(); + std::array alpha_factor_blocks; + std::array one_minus_alpha_factor_blocks; + alpha_factor_blocks[0] = &builder_->makeNewBlock(); + one_minus_alpha_factor_blocks[0] = &builder_->makeNewBlock(); + alpha_factor_blocks[1] = &builder_->makeNewBlock(); + one_minus_alpha_factor_blocks[1] = &builder_->makeNewBlock(); + alpha_factor_blocks[2] = &builder_->makeNewBlock(); + one_minus_alpha_factor_blocks[2] = &builder_->makeNewBlock(); + spv::Block& block_factor_source_alpha_saturate = builder_->makeNewBlock(); + spv::Block& block_factor_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_factor_merge.getId(), + spv::SelectionControlDontFlattenMask); + { + std::unique_ptr factor_switch_op = + std::make_unique(spv::OpSwitch); + factor_switch_op->addIdOperand(factor); + // Make one the default factor. + factor_switch_op->addIdOperand(block_factor_one.getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kSrcColor)); + factor_switch_op->addIdOperand(alpha_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusSrcColor)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kSrcAlpha)); + factor_switch_op->addIdOperand(alpha_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusSrcAlpha)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kDstColor)); + factor_switch_op->addIdOperand(alpha_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusDstColor)); + 
factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kDstAlpha)); + factor_switch_op->addIdOperand(alpha_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusDstAlpha)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kConstantColor)); + factor_switch_op->addIdOperand(alpha_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusConstantColor)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kConstantAlpha)); + factor_switch_op->addIdOperand(alpha_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusConstantAlpha)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kSrcAlphaSaturate)); + factor_switch_op->addIdOperand(block_factor_source_alpha_saturate.getId()); + builder_->getBuildPoint()->addInstruction(std::move(factor_switch_op)); + } + block_factor_one.addPredecessor(&block_factor_head); + for (uint32_t i = 0; i < 3; ++i) { + alpha_factor_blocks[i]->addPredecessor(&block_factor_head); + one_minus_alpha_factor_blocks[i]->addPredecessor(&block_factor_head); + } + block_factor_source_alpha_saturate.addPredecessor(&block_factor_head); + + // kOne + builder_->setBuildPoint(&block_factor_one); + // The result is the value itself. 
+ builder_->createBranch(&block_factor_merge); + + // k[OneMinus]Src/Dest/ConstantColor/Alpha + std::array alpha_factors = { + source_alpha, + dest_alpha, + constant_alpha, + }; + std::array alpha_factor_results; + std::array one_minus_alpha_factor_results; + for (uint32_t i = 0; i < 3; ++i) { + spv::Id alpha_factor = alpha_factors[i]; + + // kSrc/Dst/ConstantColor/Alpha + { + builder_->setBuildPoint(alpha_factor_blocks[i]); + spv::Id result_alpha = + builder_->createBinOp(spv::OpFMul, type_float_, value, alpha_factor); + builder_->addDecoration(result_alpha, spv::DecorationNoContraction); + alpha_factor_results[i] = result_alpha; + builder_->createBranch(&block_factor_merge); + } + + // kOneMinusSrc/Dst/ConstantColor/Alpha + { + builder_->setBuildPoint(one_minus_alpha_factor_blocks[i]); + spv::Id one_minus_alpha_factor = builder_->createBinOp( + spv::OpFSub, type_float_, const_float_1_, alpha_factor); + builder_->addDecoration(one_minus_alpha_factor, + spv::DecorationNoContraction); + spv::Id result_one_minus_alpha = builder_->createBinOp( + spv::OpFMul, type_float_, value, one_minus_alpha_factor); + builder_->addDecoration(result_one_minus_alpha, + spv::DecorationNoContraction); + one_minus_alpha_factor_results[i] = result_one_minus_alpha; + builder_->createBranch(&block_factor_merge); + } + } + + // kSrcAlphaSaturate + spv::Id result_source_alpha_saturate; + { + builder_->setBuildPoint(&block_factor_source_alpha_saturate); + spv::Id one_minus_dest_alpha = builder_->createBinOp( + spv::OpFSub, type_float_, const_float_1_, dest_alpha); + builder_->addDecoration(one_minus_dest_alpha, spv::DecorationNoContraction); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(source_alpha); + id_vector_temp_.push_back(one_minus_dest_alpha); + spv::Id factor_source_alpha_saturate = builder_->createBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450NMin, id_vector_temp_); + result_source_alpha_saturate = builder_->createBinOp( + 
spv::OpFMul, type_float_, value, factor_source_alpha_saturate); + builder_->addDecoration(result_source_alpha_saturate, + spv::DecorationNoContraction); + builder_->createBranch(&block_factor_merge); + } + + // Select the term for the non-zero factor. + builder_->setBuildPoint(&block_factor_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 8); + id_vector_temp_.push_back(value); + id_vector_temp_.push_back(block_factor_one.getId()); + for (uint32_t i = 0; i < 3; ++i) { + id_vector_temp_.push_back(alpha_factor_results[i]); + id_vector_temp_.push_back(alpha_factor_blocks[i]->getId()); + id_vector_temp_.push_back(one_minus_alpha_factor_results[i]); + id_vector_temp_.push_back(one_minus_alpha_factor_blocks[i]->getId()); + } + id_vector_temp_.push_back(result_source_alpha_saturate); + id_vector_temp_.push_back(block_factor_source_alpha_saturate.getId()); + spv::Id result_unclamped = + builder_->createOp(spv::OpPhi, type_float_, id_vector_temp_); + spv::Id result = FSI_FlushNaNClampAndInBlending( + result_unclamped, is_fixed_point, clamp_min_value, clamp_max_value); + builder_->createBranch(&block_not_zero_merge); + // Get the latest block for a non-zero factor after all the control flow. + spv::Block& block_not_zero_if_end = *builder_->getBuildPoint(); + + // Make the result zero if the factor is zero. 
+ builder_->setBuildPoint(&block_not_zero_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(result); + id_vector_temp_.push_back(block_not_zero_if_end.getId()); + id_vector_temp_.push_back(const_float_0_); + id_vector_temp_.push_back(block_not_zero_head.getId()); + return builder_->createOp(spv::OpPhi, type_float_, id_vector_temp_); +} + +spv::Id SpirvShaderTranslator::FSI_BlendColorOrAlphaWithUnclampedResult( + spv::Id is_fixed_point, spv::Id clamp_min_value, spv::Id clamp_max_value, + spv::Id source_color_clamped, spv::Id source_alpha_clamped, + spv::Id dest_color, spv::Id dest_alpha, spv::Id constant_color_clamped, + spv::Id constant_alpha_clamped, spv::Id equation, spv::Id source_factor, + spv::Id dest_factor) { + bool is_alpha = source_color_clamped == spv::NoResult; + assert_false(!is_alpha && (dest_color == spv::NoResult || + constant_color_clamped == spv::NoResult)); + assert_false(is_alpha && (dest_color != spv::NoResult || + constant_color_clamped != spv::NoResult)); + spv::Id value_type = is_alpha ? type_float_ : type_float3_; + + // Handle min and max blend operations, which don't involve the factors. 
+ spv::Block& block_min_max_head = *builder_->getBuildPoint(); + spv::Block& block_min_max_min = builder_->makeNewBlock(); + spv::Block& block_min_max_max = builder_->makeNewBlock(); + spv::Block& block_min_max_default = builder_->makeNewBlock(); + spv::Block& block_min_max_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_min_max_merge.getId(), + spv::SelectionControlDontFlattenMask); + { + std::unique_ptr min_max_switch_op = + std::make_unique(spv::OpSwitch); + min_max_switch_op->addIdOperand(equation); + min_max_switch_op->addIdOperand(block_min_max_default.getId()); + min_max_switch_op->addImmediateOperand(int32_t(xenos::BlendOp::kMin)); + min_max_switch_op->addIdOperand(block_min_max_min.getId()); + min_max_switch_op->addImmediateOperand(int32_t(xenos::BlendOp::kMax)); + min_max_switch_op->addIdOperand(block_min_max_max.getId()); + builder_->getBuildPoint()->addInstruction(std::move(min_max_switch_op)); + } + block_min_max_default.addPredecessor(&block_min_max_head); + block_min_max_min.addPredecessor(&block_min_max_head); + block_min_max_max.addPredecessor(&block_min_max_head); + + // Min case. + builder_->setBuildPoint(&block_min_max_min); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(is_alpha ? source_alpha_clamped + : source_color_clamped); + id_vector_temp_.push_back(is_alpha ? dest_alpha : dest_color); + spv::Id result_min = builder_->createBuiltinCall( + value_type, ext_inst_glsl_std_450_, GLSLstd450FMin, id_vector_temp_); + builder_->createBranch(&block_min_max_merge); + + // Max case. + builder_->setBuildPoint(&block_min_max_max); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(is_alpha ? source_alpha_clamped + : source_color_clamped); + id_vector_temp_.push_back(is_alpha ? 
dest_alpha : dest_color); + spv::Id result_max = builder_->createBuiltinCall( + value_type, ext_inst_glsl_std_450_, GLSLstd450FMax, id_vector_temp_); + builder_->createBranch(&block_min_max_merge); + + // Blending with factors. + spv::Id result_factors; + { + builder_->setBuildPoint(&block_min_max_default); + + spv::Id term_source, term_dest; + if (is_alpha) { + term_source = FSI_ApplyAlphaBlendFactor( + source_alpha_clamped, is_fixed_point, clamp_min_value, + clamp_max_value, source_factor, source_alpha_clamped, dest_alpha, + constant_alpha_clamped); + term_dest = FSI_ApplyAlphaBlendFactor(dest_alpha, is_fixed_point, + clamp_min_value, clamp_max_value, + dest_factor, source_alpha_clamped, + dest_alpha, constant_alpha_clamped); + } else { + term_source = FSI_ApplyColorBlendFactor( + source_color_clamped, is_fixed_point, clamp_min_value, + clamp_max_value, source_factor, source_color_clamped, + source_alpha_clamped, dest_color, dest_alpha, constant_color_clamped, + constant_alpha_clamped); + term_dest = FSI_ApplyColorBlendFactor( + dest_color, is_fixed_point, clamp_min_value, clamp_max_value, + dest_factor, source_color_clamped, source_alpha_clamped, dest_color, + dest_alpha, constant_color_clamped, constant_alpha_clamped); + } + + spv::Block& block_signs_head = *builder_->getBuildPoint(); + spv::Block& block_signs_add = builder_->makeNewBlock(); + spv::Block& block_signs_subtract = builder_->makeNewBlock(); + spv::Block& block_signs_reverse_subtract = builder_->makeNewBlock(); + spv::Block& block_signs_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_signs_merge.getId(), + spv::SelectionControlDontFlattenMask); + { + std::unique_ptr signs_switch_op = + std::make_unique(spv::OpSwitch); + signs_switch_op->addIdOperand(equation); + // Make addition the default. 
+ signs_switch_op->addIdOperand(block_signs_add.getId()); + signs_switch_op->addImmediateOperand(int32_t(xenos::BlendOp::kSubtract)); + signs_switch_op->addIdOperand(block_signs_subtract.getId()); + signs_switch_op->addImmediateOperand( + int32_t(xenos::BlendOp::kRevSubtract)); + signs_switch_op->addIdOperand(block_signs_reverse_subtract.getId()); + builder_->getBuildPoint()->addInstruction(std::move(signs_switch_op)); + } + block_signs_add.addPredecessor(&block_signs_head); + block_signs_subtract.addPredecessor(&block_signs_head); + block_signs_reverse_subtract.addPredecessor(&block_signs_head); + + // Addition case. + builder_->setBuildPoint(&block_signs_add); + spv::Id result_add = + builder_->createBinOp(spv::OpFAdd, value_type, term_source, term_dest); + builder_->addDecoration(result_add, spv::DecorationNoContraction); + builder_->createBranch(&block_signs_merge); + + // Subtraction case. + builder_->setBuildPoint(&block_signs_subtract); + spv::Id result_subtract = + builder_->createBinOp(spv::OpFSub, value_type, term_source, term_dest); + builder_->addDecoration(result_subtract, spv::DecorationNoContraction); + builder_->createBranch(&block_signs_merge); + + // Reverse subtraction case. + builder_->setBuildPoint(&block_signs_reverse_subtract); + spv::Id result_reverse_subtract = + builder_->createBinOp(spv::OpFSub, value_type, term_dest, term_source); + builder_->addDecoration(result_reverse_subtract, + spv::DecorationNoContraction); + builder_->createBranch(&block_signs_merge); + + // Selection between the signs involved in the addition. 
+ builder_->setBuildPoint(&block_signs_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 3); + id_vector_temp_.push_back(result_add); + id_vector_temp_.push_back(block_signs_add.getId()); + id_vector_temp_.push_back(result_subtract); + id_vector_temp_.push_back(block_signs_subtract.getId()); + id_vector_temp_.push_back(result_reverse_subtract); + id_vector_temp_.push_back(block_signs_reverse_subtract.getId()); + result_factors = + builder_->createOp(spv::OpPhi, value_type, id_vector_temp_); + builder_->createBranch(&block_min_max_merge); + } + // Get the latest block for blending with factors after all the control flow. + spv::Block& block_min_max_default_end = *builder_->getBuildPoint(); + + builder_->setBuildPoint(&block_min_max_merge); + // Choose out of min, max, and blending with factors. + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 3); + id_vector_temp_.push_back(result_min); + id_vector_temp_.push_back(block_min_max_min.getId()); + id_vector_temp_.push_back(result_max); + id_vector_temp_.push_back(block_min_max_max.getId()); + id_vector_temp_.push_back(result_factors); + id_vector_temp_.push_back(block_min_max_default_end.getId()); + return builder_->createOp(spv::OpPhi, value_type, id_vector_temp_); } } // namespace gpu diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 68a00cbe8..e48115894 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -67,9 +67,6 @@ const VkDescriptorPoolSize {VK_DESCRIPTOR_TYPE_SAMPLER, kLinkedTypeDescriptorPoolSetCount}, }; -// No specific reason for 32768 descriptors, just the "too much" amount from -// Direct3D 12 PIX warnings. 2x descriptors for textures because of unsigned and -// signed bindings. 
VulkanCommandProcessor::VulkanCommandProcessor( VulkanGraphicsSystem* graphics_system, kernel::KernelState* kernel_state) : CommandProcessor(graphics_system, kernel_state), @@ -106,6 +103,32 @@ void VulkanCommandProcessor::TracePlaybackWroteMemory(uint32_t base_ptr, void VulkanCommandProcessor::RestoreEdramSnapshot(const void* snapshot) {} +std::string VulkanCommandProcessor::GetWindowTitleText() const { + std::ostringstream title; + title << "Vulkan"; + if (render_target_cache_) { + switch (render_target_cache_->GetPath()) { + case RenderTargetCache::Path::kHostRenderTargets: + title << " - FBO"; + break; + case RenderTargetCache::Path::kPixelShaderInterlock: + title << " - FSI"; + break; + default: + break; + } + uint32_t draw_resolution_scale_x = + texture_cache_ ? texture_cache_->draw_resolution_scale_x() : 1; + uint32_t draw_resolution_scale_y = + texture_cache_ ? texture_cache_->draw_resolution_scale_y() : 1; + if (draw_resolution_scale_x > 1 || draw_resolution_scale_y > 1) { + title << ' ' << draw_resolution_scale_x << 'x' << draw_resolution_scale_y; + } + } + title << " - HEAVILY INCOMPLETE, early development"; + return title.str(); +} + bool VulkanCommandProcessor::SetupContext() { if (!CommandProcessor::SetupContext()) { XELOGE("Failed to initialize base command processor context"); @@ -146,7 +169,7 @@ bool VulkanCommandProcessor::SetupContext() { size_t(16384)), size_t(uniform_buffer_alignment))); - // Descriptor set layouts. + // Descriptor set layouts that don't depend on the setup of other subsystems. VkShaderStageFlags guest_shader_stages = guest_shader_vertex_stages_ | VK_SHADER_STAGE_FRAGMENT_BIT; // Empty. @@ -163,37 +186,6 @@ bool VulkanCommandProcessor::SetupContext() { XELOGE("Failed to create an empty Vulkan descriptor set layout"); return false; } - // Shared memory and EDRAM. 
- uint32_t shared_memory_binding_count_log2 = - SpirvShaderTranslator::GetSharedMemoryStorageBufferCountLog2( - provider.device_properties().limits.maxStorageBufferRange); - uint32_t shared_memory_binding_count = UINT32_C(1) - << shared_memory_binding_count_log2; - VkDescriptorSetLayoutBinding - descriptor_set_layout_bindings_shared_memory_and_edram[1]; - descriptor_set_layout_bindings_shared_memory_and_edram[0].binding = 0; - descriptor_set_layout_bindings_shared_memory_and_edram[0].descriptorType = - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - descriptor_set_layout_bindings_shared_memory_and_edram[0].descriptorCount = - shared_memory_binding_count; - descriptor_set_layout_bindings_shared_memory_and_edram[0].stageFlags = - guest_shader_stages; - descriptor_set_layout_bindings_shared_memory_and_edram[0].pImmutableSamplers = - nullptr; - // TODO(Triang3l): EDRAM storage image binding for the fragment shader - // interlocks case. - descriptor_set_layout_create_info.bindingCount = uint32_t( - xe::countof(descriptor_set_layout_bindings_shared_memory_and_edram)); - descriptor_set_layout_create_info.pBindings = - descriptor_set_layout_bindings_shared_memory_and_edram; - if (dfn.vkCreateDescriptorSetLayout( - device, &descriptor_set_layout_create_info, nullptr, - &descriptor_set_layout_shared_memory_and_edram_) != VK_SUCCESS) { - XELOGE( - "Failed to create a Vulkan descriptor set layout for the shared memory " - "and the EDRAM"); - return false; - } // Guest draw constants. 
VkDescriptorSetLayoutBinding descriptor_set_layout_bindings_constants [SpirvShaderTranslator::kConstantBufferCount] = {}; @@ -289,16 +281,70 @@ bool VulkanCommandProcessor::SetupContext() { return false; } + uint32_t shared_memory_binding_count_log2 = + SpirvShaderTranslator::GetSharedMemoryStorageBufferCountLog2( + provider.device_properties().limits.maxStorageBufferRange); + uint32_t shared_memory_binding_count = UINT32_C(1) + << shared_memory_binding_count_log2; + // Requires the transient descriptor set layouts. // TODO(Triang3l): Get the actual draw resolution scale when the texture cache // supports resolution scaling. render_target_cache_ = std::make_unique( *register_file_, *memory_, trace_writer_, 1, 1, *this); - if (!render_target_cache_->Initialize()) { + if (!render_target_cache_->Initialize(shared_memory_binding_count)) { XELOGE("Failed to initialize the render target cache"); return false; } + // Shared memory and EDRAM descriptor set layout. + bool edram_fragment_shader_interlock = + render_target_cache_->GetPath() == + RenderTargetCache::Path::kPixelShaderInterlock; + VkDescriptorSetLayoutBinding + shared_memory_and_edram_descriptor_set_layout_bindings[2]; + shared_memory_and_edram_descriptor_set_layout_bindings[0].binding = 0; + shared_memory_and_edram_descriptor_set_layout_bindings[0].descriptorType = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + shared_memory_and_edram_descriptor_set_layout_bindings[0].descriptorCount = + shared_memory_binding_count; + shared_memory_and_edram_descriptor_set_layout_bindings[0].stageFlags = + guest_shader_stages; + shared_memory_and_edram_descriptor_set_layout_bindings[0].pImmutableSamplers = + nullptr; + VkDescriptorSetLayoutCreateInfo + shared_memory_and_edram_descriptor_set_layout_create_info; + shared_memory_and_edram_descriptor_set_layout_create_info.sType = + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + shared_memory_and_edram_descriptor_set_layout_create_info.pNext = nullptr; + 
shared_memory_and_edram_descriptor_set_layout_create_info.flags = 0; + shared_memory_and_edram_descriptor_set_layout_create_info.pBindings = + shared_memory_and_edram_descriptor_set_layout_bindings; + if (edram_fragment_shader_interlock) { + // EDRAM. + shared_memory_and_edram_descriptor_set_layout_bindings[1].binding = 1; + shared_memory_and_edram_descriptor_set_layout_bindings[1].descriptorType = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + shared_memory_and_edram_descriptor_set_layout_bindings[1].descriptorCount = + 1; + shared_memory_and_edram_descriptor_set_layout_bindings[1].stageFlags = + VK_SHADER_STAGE_FRAGMENT_BIT; + shared_memory_and_edram_descriptor_set_layout_bindings[1] + .pImmutableSamplers = nullptr; + shared_memory_and_edram_descriptor_set_layout_create_info.bindingCount = 2; + } else { + shared_memory_and_edram_descriptor_set_layout_create_info.bindingCount = 1; + } + if (dfn.vkCreateDescriptorSetLayout( + device, &shared_memory_and_edram_descriptor_set_layout_create_info, + nullptr, + &descriptor_set_layout_shared_memory_and_edram_) != VK_SUCCESS) { + XELOGE( + "Failed to create a Vulkan descriptor set layout for the shared memory " + "and the EDRAM"); + return false; + } + pipeline_cache_ = std::make_unique( *this, *register_file_, *render_target_cache_, guest_shader_vertex_stages_); @@ -320,9 +366,8 @@ bool VulkanCommandProcessor::SetupContext() { // Shared memory and EDRAM common bindings. VkDescriptorPoolSize descriptor_pool_sizes[1]; descriptor_pool_sizes[0].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - descriptor_pool_sizes[0].descriptorCount = shared_memory_binding_count; - // TODO(Triang3l): EDRAM storage image binding for the fragment shader - // interlocks case. 
+ descriptor_pool_sizes[0].descriptorCount = + shared_memory_binding_count + uint32_t(edram_fragment_shader_interlock); VkDescriptorPoolCreateInfo descriptor_pool_create_info; descriptor_pool_create_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; @@ -369,20 +414,45 @@ bool VulkanCommandProcessor::SetupContext() { shared_memory_binding_range * i; shared_memory_descriptor_buffer_info.range = shared_memory_binding_range; } - VkWriteDescriptorSet write_descriptor_sets[1]; - write_descriptor_sets[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - write_descriptor_sets[0].pNext = nullptr; - write_descriptor_sets[0].dstSet = shared_memory_and_edram_descriptor_set_; - write_descriptor_sets[0].dstBinding = 0; - write_descriptor_sets[0].dstArrayElement = 0; - write_descriptor_sets[0].descriptorCount = shared_memory_binding_count; - write_descriptor_sets[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - write_descriptor_sets[0].pImageInfo = nullptr; - write_descriptor_sets[0].pBufferInfo = shared_memory_descriptor_buffers_info; - write_descriptor_sets[0].pTexelBufferView = nullptr; - // TODO(Triang3l): EDRAM storage image binding for the fragment shader - // interlocks case. 
- dfn.vkUpdateDescriptorSets(device, 1, write_descriptor_sets, 0, nullptr); + VkWriteDescriptorSet write_descriptor_sets[2]; + VkWriteDescriptorSet& write_descriptor_set_shared_memory = + write_descriptor_sets[0]; + write_descriptor_set_shared_memory.sType = + VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + write_descriptor_set_shared_memory.pNext = nullptr; + write_descriptor_set_shared_memory.dstSet = + shared_memory_and_edram_descriptor_set_; + write_descriptor_set_shared_memory.dstBinding = 0; + write_descriptor_set_shared_memory.dstArrayElement = 0; + write_descriptor_set_shared_memory.descriptorCount = + shared_memory_binding_count; + write_descriptor_set_shared_memory.descriptorType = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + write_descriptor_set_shared_memory.pImageInfo = nullptr; + write_descriptor_set_shared_memory.pBufferInfo = + shared_memory_descriptor_buffers_info; + write_descriptor_set_shared_memory.pTexelBufferView = nullptr; + VkDescriptorBufferInfo edram_descriptor_buffer_info; + if (edram_fragment_shader_interlock) { + edram_descriptor_buffer_info.buffer = render_target_cache_->edram_buffer(); + edram_descriptor_buffer_info.offset = 0; + edram_descriptor_buffer_info.range = VK_WHOLE_SIZE; + VkWriteDescriptorSet& write_descriptor_set_edram = write_descriptor_sets[1]; + write_descriptor_set_edram.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + write_descriptor_set_edram.pNext = nullptr; + write_descriptor_set_edram.dstSet = shared_memory_and_edram_descriptor_set_; + write_descriptor_set_edram.dstBinding = 1; + write_descriptor_set_edram.dstArrayElement = 0; + write_descriptor_set_edram.descriptorCount = 1; + write_descriptor_set_edram.descriptorType = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + write_descriptor_set_edram.pImageInfo = nullptr; + write_descriptor_set_edram.pBufferInfo = &edram_descriptor_buffer_info; + write_descriptor_set_edram.pTexelBufferView = nullptr; + } + dfn.vkUpdateDescriptorSets(device, + 1 + uint32_t(edram_fragment_shader_interlock), 
+ write_descriptor_sets, 0, nullptr); // Swap objects. @@ -1041,6 +1111,9 @@ void VulkanCommandProcessor::ShutdownContext() { } descriptor_set_layouts_textures_.clear(); + ui::vulkan::util::DestroyAndNullHandle( + dfn.vkDestroyDescriptorSetLayout, device, + descriptor_set_layout_shared_memory_and_edram_); for (VkDescriptorSetLayout& descriptor_set_layout_single_transient : descriptor_set_layouts_single_transient_) { ui::vulkan::util::DestroyAndNullHandle( @@ -1050,9 +1123,6 @@ void VulkanCommandProcessor::ShutdownContext() { ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorSetLayout, device, descriptor_set_layout_constants_); - ui::vulkan::util::DestroyAndNullHandle( - dfn.vkDestroyDescriptorSetLayout, device, - descriptor_set_layout_shared_memory_and_edram_); ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorSetLayout, device, descriptor_set_layout_empty_); @@ -2401,7 +2471,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // Update system constants before uploading them. UpdateSystemConstantValues(primitive_polygonal, primitive_processing_result, shader_32bit_index_dma, viewport_info, - used_texture_mask); + used_texture_mask, normalized_depth_control, + normalized_color_mask); // Update uniform buffers and descriptor sets after binding the pipeline with // the new layout. @@ -2461,6 +2532,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // After all commands that may dispatch, copy or insert barriers, submit the // barriers (may end the render pass), and (re)enter the render pass before // drawing. + // TODO(Triang3l): Handle disabled variableMultisampleRate by restarting the + // render pass with no attachments if the sample count becomes different. 
SubmitBarriersAndEnterRenderTargetCacheRenderPass( render_target_cache_->last_update_render_pass(), render_target_cache_->last_update_framebuffer()); @@ -3180,175 +3253,180 @@ void VulkanCommandProcessor::UpdateDynamicState( scissor_rect.extent.height = scissor.extent[1]; SetScissor(scissor_rect); - // Depth bias. - // TODO(Triang3l): Disable the depth bias for the fragment shader interlock RB - // implementation. - float depth_bias_constant_factor, depth_bias_slope_factor; - draw_util::GetPreferredFacePolygonOffset(regs, primitive_polygonal, - depth_bias_slope_factor, - depth_bias_constant_factor); - depth_bias_constant_factor *= - regs.Get().depth_format == - xenos::DepthRenderTargetFormat::kD24S8 - ? draw_util::kD3D10PolygonOffsetFactorUnorm24 - : draw_util::kD3D10PolygonOffsetFactorFloat24; - // With non-square resolution scaling, make sure the worst-case impact is - // reverted (slope only along the scaled axis), thus max. More bias is better - // than less bias, because less bias means Z fighting with the background is - // more likely. - depth_bias_slope_factor *= - xenos::kPolygonOffsetScaleSubpixelUnit * - float(std::max(render_target_cache_->draw_resolution_scale_x(), - render_target_cache_->draw_resolution_scale_y())); - // std::memcmp instead of != so in case of NaN, every draw won't be - // invalidating it. 
- dynamic_depth_bias_update_needed_ |= - std::memcmp(&dynamic_depth_bias_constant_factor_, - &depth_bias_constant_factor, sizeof(float)) != 0; - dynamic_depth_bias_update_needed_ |= - std::memcmp(&dynamic_depth_bias_slope_factor_, &depth_bias_slope_factor, - sizeof(float)) != 0; - if (dynamic_depth_bias_update_needed_) { - dynamic_depth_bias_constant_factor_ = depth_bias_constant_factor; - dynamic_depth_bias_slope_factor_ = depth_bias_slope_factor; - deferred_command_buffer_.CmdVkSetDepthBias( - dynamic_depth_bias_constant_factor_, 0.0f, - dynamic_depth_bias_slope_factor_); - dynamic_depth_bias_update_needed_ = false; - } + if (render_target_cache_->GetPath() == + RenderTargetCache::Path::kHostRenderTargets) { + // Depth bias. + float depth_bias_constant_factor, depth_bias_slope_factor; + draw_util::GetPreferredFacePolygonOffset(regs, primitive_polygonal, + depth_bias_slope_factor, + depth_bias_constant_factor); + depth_bias_constant_factor *= + regs.Get().depth_format == + xenos::DepthRenderTargetFormat::kD24S8 + ? draw_util::kD3D10PolygonOffsetFactorUnorm24 + : draw_util::kD3D10PolygonOffsetFactorFloat24; + // With non-square resolution scaling, make sure the worst-case impact is + // reverted (slope only along the scaled axis), thus max. More bias is + // better than less bias, because less bias means Z fighting with the + // background is more likely. + depth_bias_slope_factor *= + xenos::kPolygonOffsetScaleSubpixelUnit * + float(std::max(render_target_cache_->draw_resolution_scale_x(), + render_target_cache_->draw_resolution_scale_y())); + // std::memcmp instead of != so in case of NaN, every draw won't be + // invalidating it. 
+ dynamic_depth_bias_update_needed_ |= + std::memcmp(&dynamic_depth_bias_constant_factor_, + &depth_bias_constant_factor, sizeof(float)) != 0; + dynamic_depth_bias_update_needed_ |= + std::memcmp(&dynamic_depth_bias_slope_factor_, &depth_bias_slope_factor, + sizeof(float)) != 0; + if (dynamic_depth_bias_update_needed_) { + dynamic_depth_bias_constant_factor_ = depth_bias_constant_factor; + dynamic_depth_bias_slope_factor_ = depth_bias_slope_factor; + deferred_command_buffer_.CmdVkSetDepthBias( + dynamic_depth_bias_constant_factor_, 0.0f, + dynamic_depth_bias_slope_factor_); + dynamic_depth_bias_update_needed_ = false; + } - // Blend constants. - float blend_constants[] = { - regs[XE_GPU_REG_RB_BLEND_RED].f32, - regs[XE_GPU_REG_RB_BLEND_GREEN].f32, - regs[XE_GPU_REG_RB_BLEND_BLUE].f32, - regs[XE_GPU_REG_RB_BLEND_ALPHA].f32, - }; - dynamic_blend_constants_update_needed_ |= - std::memcmp(dynamic_blend_constants_, blend_constants, - sizeof(float) * 4) != 0; - if (dynamic_blend_constants_update_needed_) { - std::memcpy(dynamic_blend_constants_, blend_constants, sizeof(float) * 4); - deferred_command_buffer_.CmdVkSetBlendConstants(dynamic_blend_constants_); - dynamic_blend_constants_update_needed_ = false; - } + // Blend constants. + float blend_constants[] = { + regs[XE_GPU_REG_RB_BLEND_RED].f32, + regs[XE_GPU_REG_RB_BLEND_GREEN].f32, + regs[XE_GPU_REG_RB_BLEND_BLUE].f32, + regs[XE_GPU_REG_RB_BLEND_ALPHA].f32, + }; + dynamic_blend_constants_update_needed_ |= + std::memcmp(dynamic_blend_constants_, blend_constants, + sizeof(float) * 4) != 0; + if (dynamic_blend_constants_update_needed_) { + std::memcpy(dynamic_blend_constants_, blend_constants, sizeof(float) * 4); + deferred_command_buffer_.CmdVkSetBlendConstants(dynamic_blend_constants_); + dynamic_blend_constants_update_needed_ = false; + } - // Stencil masks and references. 
- // Due to pretty complex conditions involving registers not directly related - // to stencil (primitive type, culling), changing the values only when stencil - // is actually needed. However, due to the way dynamic state needs to be set - // in Vulkan, which doesn't take into account whether the state actually has - // effect on drawing, and because the masks and the references are always - // dynamic in Xenia guest pipelines, they must be set in the command buffer - // before any draw. - if (normalized_depth_control.stencil_enable) { - Register stencil_ref_mask_front_reg, stencil_ref_mask_back_reg; - if (primitive_polygonal && normalized_depth_control.backface_enable) { - const ui::vulkan::VulkanProvider& provider = GetVulkanProvider(); - const VkPhysicalDevicePortabilitySubsetFeaturesKHR* - device_portability_subset_features = - provider.device_portability_subset_features(); - if (!device_portability_subset_features || - device_portability_subset_features->separateStencilMaskRef) { - // Choose the back face values only if drawing only back faces. - stencil_ref_mask_front_reg = - regs.Get().cull_front - ? XE_GPU_REG_RB_STENCILREFMASK_BF - : XE_GPU_REG_RB_STENCILREFMASK; - stencil_ref_mask_back_reg = stencil_ref_mask_front_reg; + // Stencil masks and references. + // Due to pretty complex conditions involving registers not directly related + // to stencil (primitive type, culling), changing the values only when + // stencil is actually needed. However, due to the way dynamic state needs + // to be set in Vulkan, which doesn't take into account whether the state + // actually has effect on drawing, and because the masks and the references + // are always dynamic in Xenia guest pipelines, they must be set in the + // command buffer before any draw. 
+ if (normalized_depth_control.stencil_enable) { + Register stencil_ref_mask_front_reg, stencil_ref_mask_back_reg; + if (primitive_polygonal && normalized_depth_control.backface_enable) { + const ui::vulkan::VulkanProvider& provider = GetVulkanProvider(); + const VkPhysicalDevicePortabilitySubsetFeaturesKHR* + device_portability_subset_features = + provider.device_portability_subset_features(); + if (!device_portability_subset_features || + device_portability_subset_features->separateStencilMaskRef) { + // Choose the back face values only if drawing only back faces. + stencil_ref_mask_front_reg = + regs.Get().cull_front + ? XE_GPU_REG_RB_STENCILREFMASK_BF + : XE_GPU_REG_RB_STENCILREFMASK; + stencil_ref_mask_back_reg = stencil_ref_mask_front_reg; + } else { + stencil_ref_mask_front_reg = XE_GPU_REG_RB_STENCILREFMASK; + stencil_ref_mask_back_reg = XE_GPU_REG_RB_STENCILREFMASK_BF; + } } else { stencil_ref_mask_front_reg = XE_GPU_REG_RB_STENCILREFMASK; - stencil_ref_mask_back_reg = XE_GPU_REG_RB_STENCILREFMASK_BF; + stencil_ref_mask_back_reg = XE_GPU_REG_RB_STENCILREFMASK; } - } else { - stencil_ref_mask_front_reg = XE_GPU_REG_RB_STENCILREFMASK; - stencil_ref_mask_back_reg = XE_GPU_REG_RB_STENCILREFMASK; + auto stencil_ref_mask_front = + regs.Get(stencil_ref_mask_front_reg); + auto stencil_ref_mask_back = + regs.Get(stencil_ref_mask_back_reg); + // Compare mask. + dynamic_stencil_compare_mask_front_update_needed_ |= + dynamic_stencil_compare_mask_front_ != + stencil_ref_mask_front.stencilmask; + dynamic_stencil_compare_mask_front_ = stencil_ref_mask_front.stencilmask; + dynamic_stencil_compare_mask_back_update_needed_ |= + dynamic_stencil_compare_mask_back_ != + stencil_ref_mask_back.stencilmask; + dynamic_stencil_compare_mask_back_ = stencil_ref_mask_back.stencilmask; + // Write mask. 
+ dynamic_stencil_write_mask_front_update_needed_ |= + dynamic_stencil_write_mask_front_ != + stencil_ref_mask_front.stencilwritemask; + dynamic_stencil_write_mask_front_ = + stencil_ref_mask_front.stencilwritemask; + dynamic_stencil_write_mask_back_update_needed_ |= + dynamic_stencil_write_mask_back_ != + stencil_ref_mask_back.stencilwritemask; + dynamic_stencil_write_mask_back_ = stencil_ref_mask_back.stencilwritemask; + // Reference. + dynamic_stencil_reference_front_update_needed_ |= + dynamic_stencil_reference_front_ != stencil_ref_mask_front.stencilref; + dynamic_stencil_reference_front_ = stencil_ref_mask_front.stencilref; + dynamic_stencil_reference_back_update_needed_ |= + dynamic_stencil_reference_back_ != stencil_ref_mask_back.stencilref; + dynamic_stencil_reference_back_ = stencil_ref_mask_back.stencilref; } - auto stencil_ref_mask_front = - regs.Get(stencil_ref_mask_front_reg); - auto stencil_ref_mask_back = - regs.Get(stencil_ref_mask_back_reg); - // Compare mask. - dynamic_stencil_compare_mask_front_update_needed_ |= - dynamic_stencil_compare_mask_front_ != - stencil_ref_mask_front.stencilmask; - dynamic_stencil_compare_mask_front_ = stencil_ref_mask_front.stencilmask; - dynamic_stencil_compare_mask_back_update_needed_ |= - dynamic_stencil_compare_mask_back_ != stencil_ref_mask_back.stencilmask; - dynamic_stencil_compare_mask_back_ = stencil_ref_mask_back.stencilmask; - // Write mask. - dynamic_stencil_write_mask_front_update_needed_ |= - dynamic_stencil_write_mask_front_ != - stencil_ref_mask_front.stencilwritemask; - dynamic_stencil_write_mask_front_ = stencil_ref_mask_front.stencilwritemask; - dynamic_stencil_write_mask_back_update_needed_ |= - dynamic_stencil_write_mask_back_ != - stencil_ref_mask_back.stencilwritemask; - dynamic_stencil_write_mask_back_ = stencil_ref_mask_back.stencilwritemask; - // Reference. 
- dynamic_stencil_reference_front_update_needed_ |= - dynamic_stencil_reference_front_ != stencil_ref_mask_front.stencilref; - dynamic_stencil_reference_front_ = stencil_ref_mask_front.stencilref; - dynamic_stencil_reference_back_update_needed_ |= - dynamic_stencil_reference_back_ != stencil_ref_mask_back.stencilref; - dynamic_stencil_reference_back_ = stencil_ref_mask_back.stencilref; - } - // Using VK_STENCIL_FACE_FRONT_AND_BACK for higher safety when running on the - // Vulkan portability subset without separateStencilMaskRef. - if (dynamic_stencil_compare_mask_front_update_needed_ || - dynamic_stencil_compare_mask_back_update_needed_) { - if (dynamic_stencil_compare_mask_front_ == - dynamic_stencil_compare_mask_back_) { - deferred_command_buffer_.CmdVkSetStencilCompareMask( - VK_STENCIL_FACE_FRONT_AND_BACK, dynamic_stencil_compare_mask_front_); - } else { - if (dynamic_stencil_compare_mask_front_update_needed_) { + // Using VK_STENCIL_FACE_FRONT_AND_BACK for higher safety when running on + // the Vulkan portability subset without separateStencilMaskRef. 
+ if (dynamic_stencil_compare_mask_front_update_needed_ || + dynamic_stencil_compare_mask_back_update_needed_) { + if (dynamic_stencil_compare_mask_front_ == + dynamic_stencil_compare_mask_back_) { deferred_command_buffer_.CmdVkSetStencilCompareMask( - VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_compare_mask_front_); - } - if (dynamic_stencil_compare_mask_back_update_needed_) { - deferred_command_buffer_.CmdVkSetStencilCompareMask( - VK_STENCIL_FACE_BACK_BIT, dynamic_stencil_compare_mask_back_); + VK_STENCIL_FACE_FRONT_AND_BACK, + dynamic_stencil_compare_mask_front_); + } else { + if (dynamic_stencil_compare_mask_front_update_needed_) { + deferred_command_buffer_.CmdVkSetStencilCompareMask( + VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_compare_mask_front_); + } + if (dynamic_stencil_compare_mask_back_update_needed_) { + deferred_command_buffer_.CmdVkSetStencilCompareMask( + VK_STENCIL_FACE_BACK_BIT, dynamic_stencil_compare_mask_back_); + } } + dynamic_stencil_compare_mask_front_update_needed_ = false; + dynamic_stencil_compare_mask_back_update_needed_ = false; } - dynamic_stencil_compare_mask_front_update_needed_ = false; - dynamic_stencil_compare_mask_back_update_needed_ = false; - } - if (dynamic_stencil_write_mask_front_update_needed_ || - dynamic_stencil_write_mask_back_update_needed_) { - if (dynamic_stencil_write_mask_front_ == dynamic_stencil_write_mask_back_) { - deferred_command_buffer_.CmdVkSetStencilWriteMask( - VK_STENCIL_FACE_FRONT_AND_BACK, dynamic_stencil_write_mask_front_); - } else { - if (dynamic_stencil_write_mask_front_update_needed_) { + if (dynamic_stencil_write_mask_front_update_needed_ || + dynamic_stencil_write_mask_back_update_needed_) { + if (dynamic_stencil_write_mask_front_ == + dynamic_stencil_write_mask_back_) { deferred_command_buffer_.CmdVkSetStencilWriteMask( - VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_write_mask_front_); - } - if (dynamic_stencil_write_mask_back_update_needed_) { - deferred_command_buffer_.CmdVkSetStencilWriteMask( 
- VK_STENCIL_FACE_BACK_BIT, dynamic_stencil_write_mask_back_); + VK_STENCIL_FACE_FRONT_AND_BACK, dynamic_stencil_write_mask_front_); + } else { + if (dynamic_stencil_write_mask_front_update_needed_) { + deferred_command_buffer_.CmdVkSetStencilWriteMask( + VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_write_mask_front_); + } + if (dynamic_stencil_write_mask_back_update_needed_) { + deferred_command_buffer_.CmdVkSetStencilWriteMask( + VK_STENCIL_FACE_BACK_BIT, dynamic_stencil_write_mask_back_); + } } + dynamic_stencil_write_mask_front_update_needed_ = false; + dynamic_stencil_write_mask_back_update_needed_ = false; } - dynamic_stencil_write_mask_front_update_needed_ = false; - dynamic_stencil_write_mask_back_update_needed_ = false; - } - if (dynamic_stencil_reference_front_update_needed_ || - dynamic_stencil_reference_back_update_needed_) { - if (dynamic_stencil_reference_front_ == dynamic_stencil_reference_back_) { - deferred_command_buffer_.CmdVkSetStencilReference( - VK_STENCIL_FACE_FRONT_AND_BACK, dynamic_stencil_reference_front_); - } else { - if (dynamic_stencil_reference_front_update_needed_) { + if (dynamic_stencil_reference_front_update_needed_ || + dynamic_stencil_reference_back_update_needed_) { + if (dynamic_stencil_reference_front_ == dynamic_stencil_reference_back_) { deferred_command_buffer_.CmdVkSetStencilReference( - VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_reference_front_); - } - if (dynamic_stencil_reference_back_update_needed_) { - deferred_command_buffer_.CmdVkSetStencilReference( - VK_STENCIL_FACE_BACK_BIT, dynamic_stencil_reference_back_); + VK_STENCIL_FACE_FRONT_AND_BACK, dynamic_stencil_reference_front_); + } else { + if (dynamic_stencil_reference_front_update_needed_) { + deferred_command_buffer_.CmdVkSetStencilReference( + VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_reference_front_); + } + if (dynamic_stencil_reference_back_update_needed_) { + deferred_command_buffer_.CmdVkSetStencilReference( + VK_STENCIL_FACE_BACK_BIT, 
dynamic_stencil_reference_back_); + } } + dynamic_stencil_reference_front_update_needed_ = false; + dynamic_stencil_reference_back_update_needed_ = false; } - dynamic_stencil_reference_front_update_needed_ = false; - dynamic_stencil_reference_back_update_needed_ = false; } // TODO(Triang3l): VK_EXT_extended_dynamic_state and @@ -3359,23 +3437,67 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( bool primitive_polygonal, const PrimitiveProcessor::ProcessingResult& primitive_processing_result, bool shader_32bit_index_dma, const draw_util::ViewportInfo& viewport_info, - uint32_t used_texture_mask) { + uint32_t used_texture_mask, reg::RB_DEPTHCONTROL normalized_depth_control, + uint32_t normalized_color_mask) { #if XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES const RegisterFile& regs = *register_file_; auto pa_cl_vte_cntl = regs.Get(); + auto pa_su_sc_mode_cntl = regs.Get(); float rb_alpha_ref = regs[XE_GPU_REG_RB_ALPHA_REF].f32; auto rb_colorcontrol = regs.Get(); + auto rb_depth_info = regs.Get(); + auto rb_stencilrefmask = regs.Get(); + auto rb_stencilrefmask_bf = + regs.Get(XE_GPU_REG_RB_STENCILREFMASK_BF); + auto rb_surface_info = regs.Get(); auto vgt_draw_initiator = regs.Get(); int32_t vgt_indx_offset = int32_t(regs[XE_GPU_REG_VGT_INDX_OFFSET].u32); - // Get the color info register values for each render target. + bool edram_fragment_shader_interlock = + render_target_cache_->GetPath() == + RenderTargetCache::Path::kPixelShaderInterlock; + uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x(); + uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y(); + + // Get the color info register values for each render target. Also, for FSI, + // exclude components that don't exist in the format from the write mask. 
+ // Don't exclude fully overlapping render targets, however - two render + // targets with the same base address are used in the lighting pass of + // 4D5307E6, for example, with the needed one picked with dynamic control + // flow. reg::RB_COLOR_INFO color_infos[xenos::kMaxColorRenderTargets]; + float rt_clamp[4][4]; + // Two UINT32_MAX if no components actually existing in the RT are written. + uint32_t rt_keep_masks[4][2]; for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) { - color_infos[i] = regs.Get( + auto color_info = regs.Get( reg::RB_COLOR_INFO::rt_register_indices[i]); + color_infos[i] = color_info; + if (edram_fragment_shader_interlock) { + RenderTargetCache::GetPSIColorFormatInfo( + color_info.color_format, (normalized_color_mask >> (i * 4)) & 0b1111, + rt_clamp[i][0], rt_clamp[i][1], rt_clamp[i][2], rt_clamp[i][3], + rt_keep_masks[i][0], rt_keep_masks[i][1]); + } + } + + // Disable depth and stencil if it aliases a color render target (for + // instance, during the XBLA logo in 58410954, though depth writing is already + // disabled there). + bool depth_stencil_enabled = normalized_depth_control.stencil_enable || + normalized_depth_control.z_enable; + if (edram_fragment_shader_interlock && depth_stencil_enabled) { + for (uint32_t i = 0; i < 4; ++i) { + if (rb_depth_info.depth_base == color_infos[i].color_base && + (rt_keep_masks[i][0] != UINT32_MAX || + rt_keep_masks[i][1] != UINT32_MAX)) { + depth_stencil_enabled = false; + break; + } + } } bool dirty = false; @@ -3419,6 +3541,13 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( if (draw_util::IsPrimitiveLine(regs)) { flags |= SpirvShaderTranslator::kSysFlag_PrimitiveLine; } + // MSAA sample count. + flags |= uint32_t(rb_surface_info.msaa_samples) + << SpirvShaderTranslator::kSysFlag_MsaaSamples_Shift; + // Depth format. + if (rb_depth_info.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) { + flags |= SpirvShaderTranslator::kSysFlag_DepthFloat24; + } // Alpha test. 
xenos::CompareFunction alpha_test_function = rb_colorcontrol.alpha_test_enable ? rb_colorcontrol.alpha_func @@ -3433,6 +3562,30 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( flags |= SpirvShaderTranslator::kSysFlag_ConvertColor0ToGamma << i; } } + if (edram_fragment_shader_interlock && depth_stencil_enabled) { + flags |= SpirvShaderTranslator::kSysFlag_FSIDepthStencil; + if (normalized_depth_control.z_enable) { + flags |= uint32_t(normalized_depth_control.zfunc) + << SpirvShaderTranslator::kSysFlag_FSIDepthPassIfLess_Shift; + if (normalized_depth_control.z_write_enable) { + flags |= SpirvShaderTranslator::kSysFlag_FSIDepthWrite; + } + } else { + // In case stencil is used without depth testing - always pass, and + // don't modify the stored depth. + flags |= SpirvShaderTranslator::kSysFlag_FSIDepthPassIfLess | + SpirvShaderTranslator::kSysFlag_FSIDepthPassIfEqual | + SpirvShaderTranslator::kSysFlag_FSIDepthPassIfGreater; + } + if (normalized_depth_control.stencil_enable) { + flags |= SpirvShaderTranslator::kSysFlag_FSIStencilTest; + } + // Hint - if not applicable to the shader, will not have effect. + if (alpha_test_function == xenos::CompareFunction::kAlways && + !rb_colorcontrol.alpha_to_mask_enable) { + flags |= SpirvShaderTranslator::kSysFlag_FSIDepthStencilEarlyWrite; + } + } dirty |= system_constants_.flags != flags; system_constants_.flags = flags; @@ -3492,10 +3645,10 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( // to radius conversion to avoid multiplying the per-vertex diameter by an // additional constant in the shader. 
float point_screen_diameter_to_ndc_radius_x = - (/* 0.5f * 2.0f * */ float(texture_cache_->draw_resolution_scale_x())) / + (/* 0.5f * 2.0f * */ float(draw_resolution_scale_x)) / std::max(viewport_info.xy_extent[0], uint32_t(1)); float point_screen_diameter_to_ndc_radius_y = - (/* 0.5f * 2.0f * */ float(texture_cache_->draw_resolution_scale_y())) / + (/* 0.5f * 2.0f * */ float(draw_resolution_scale_y)) / std::max(viewport_info.xy_extent[1], uint32_t(1)); dirty |= system_constants_.point_screen_diameter_to_ndc_radius[0] != point_screen_diameter_to_ndc_radius_x; @@ -3560,7 +3713,25 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( dirty |= system_constants_.alpha_test_reference != rb_alpha_ref; system_constants_.alpha_test_reference = rb_alpha_ref; - // Color exponent bias. + uint32_t edram_tile_dwords_scaled = + xenos::kEdramTileWidthSamples * xenos::kEdramTileHeightSamples * + (draw_resolution_scale_x * draw_resolution_scale_y); + + // EDRAM pitch for FSI render target writing. + if (edram_fragment_shader_interlock) { + // Align, then multiply by 32bpp tile size in dwords. + uint32_t edram_32bpp_tile_pitch_dwords_scaled = + ((rb_surface_info.surface_pitch * + (rb_surface_info.msaa_samples >= xenos::MsaaSamples::k4X ? 2 : 1)) + + (xenos::kEdramTileWidthSamples - 1)) / + xenos::kEdramTileWidthSamples * edram_tile_dwords_scaled; + dirty |= system_constants_.edram_32bpp_tile_pitch_dwords_scaled != + edram_32bpp_tile_pitch_dwords_scaled; + system_constants_.edram_32bpp_tile_pitch_dwords_scaled = + edram_32bpp_tile_pitch_dwords_scaled; + } + + // Color exponent bias and FSI render target writing. for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) { reg::RB_COLOR_INFO color_info = color_infos[i]; // Exponent bias is in bits 20:25 of RB_COLOR_INFO. 
@@ -3581,6 +3752,148 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( UINT32_C(0x3F800000) + (color_exp_bias << 23); dirty |= system_constants_.color_exp_bias[i] != color_exp_bias_scale; system_constants_.color_exp_bias[i] = color_exp_bias_scale; + if (edram_fragment_shader_interlock) { + dirty |= + system_constants_.edram_rt_keep_mask[i][0] != rt_keep_masks[i][0]; + system_constants_.edram_rt_keep_mask[i][0] = rt_keep_masks[i][0]; + dirty |= + system_constants_.edram_rt_keep_mask[i][1] != rt_keep_masks[i][1]; + system_constants_.edram_rt_keep_mask[i][1] = rt_keep_masks[i][1]; + if (rt_keep_masks[i][0] != UINT32_MAX || + rt_keep_masks[i][1] != UINT32_MAX) { + uint32_t rt_base_dwords_scaled = + color_info.color_base * edram_tile_dwords_scaled; + dirty |= system_constants_.edram_rt_base_dwords_scaled[i] != + rt_base_dwords_scaled; + system_constants_.edram_rt_base_dwords_scaled[i] = + rt_base_dwords_scaled; + uint32_t format_flags = + RenderTargetCache::AddPSIColorFormatFlags(color_info.color_format); + dirty |= system_constants_.edram_rt_format_flags[i] != format_flags; + system_constants_.edram_rt_format_flags[i] = format_flags; + uint32_t blend_factors_ops = + regs[reg::RB_BLENDCONTROL::rt_register_indices[i]].u32 & 0x1FFF1FFF; + dirty |= system_constants_.edram_rt_blend_factors_ops[i] != + blend_factors_ops; + system_constants_.edram_rt_blend_factors_ops[i] = blend_factors_ops; + // Can't do float comparisons here because NaNs would result in always + // setting the dirty flag. 
+ dirty |= std::memcmp(system_constants_.edram_rt_clamp[i], rt_clamp[i], + 4 * sizeof(float)) != 0; + std::memcpy(system_constants_.edram_rt_clamp[i], rt_clamp[i], + 4 * sizeof(float)); + } + } + } + + if (edram_fragment_shader_interlock) { + uint32_t depth_base_dwords_scaled = + rb_depth_info.depth_base * edram_tile_dwords_scaled; + dirty |= system_constants_.edram_depth_base_dwords_scaled != + depth_base_dwords_scaled; + system_constants_.edram_depth_base_dwords_scaled = depth_base_dwords_scaled; + + // For non-polygons, front polygon offset is used, and it's enabled if + // POLY_OFFSET_PARA_ENABLED is set, for polygons, separate front and back + // are used. + float poly_offset_front_scale = 0.0f, poly_offset_front_offset = 0.0f; + float poly_offset_back_scale = 0.0f, poly_offset_back_offset = 0.0f; + if (primitive_polygonal) { + if (pa_su_sc_mode_cntl.poly_offset_front_enable) { + poly_offset_front_scale = + regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE].f32; + poly_offset_front_offset = + regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET].f32; + } + if (pa_su_sc_mode_cntl.poly_offset_back_enable) { + poly_offset_back_scale = + regs[XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_SCALE].f32; + poly_offset_back_offset = + regs[XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_OFFSET].f32; + } + } else { + if (pa_su_sc_mode_cntl.poly_offset_para_enable) { + poly_offset_front_scale = + regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE].f32; + poly_offset_front_offset = + regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET].f32; + poly_offset_back_scale = poly_offset_front_scale; + poly_offset_back_offset = poly_offset_front_offset; + } + } + // With non-square resolution scaling, make sure the worst-case impact is + // reverted (slope only along the scaled axis), thus max. More bias is + // better than less bias, because less bias means Z fighting with the + // background is more likely. 
+ float poly_offset_scale_factor = + xenos::kPolygonOffsetScaleSubpixelUnit * + std::max(draw_resolution_scale_x, draw_resolution_scale_y); + poly_offset_front_scale *= poly_offset_scale_factor; + poly_offset_back_scale *= poly_offset_scale_factor; + dirty |= system_constants_.edram_poly_offset_front_scale != + poly_offset_front_scale; + system_constants_.edram_poly_offset_front_scale = poly_offset_front_scale; + dirty |= system_constants_.edram_poly_offset_front_offset != + poly_offset_front_offset; + system_constants_.edram_poly_offset_front_offset = poly_offset_front_offset; + dirty |= system_constants_.edram_poly_offset_back_scale != + poly_offset_back_scale; + system_constants_.edram_poly_offset_back_scale = poly_offset_back_scale; + dirty |= system_constants_.edram_poly_offset_back_offset != + poly_offset_back_offset; + system_constants_.edram_poly_offset_back_offset = poly_offset_back_offset; + + if (depth_stencil_enabled && normalized_depth_control.stencil_enable) { + uint32_t stencil_front_reference_masks = + rb_stencilrefmask.value & 0xFFFFFF; + dirty |= system_constants_.edram_stencil_front_reference_masks != + stencil_front_reference_masks; + system_constants_.edram_stencil_front_reference_masks = + stencil_front_reference_masks; + uint32_t stencil_func_ops = + (normalized_depth_control.value >> 8) & ((1 << 12) - 1); + dirty |= + system_constants_.edram_stencil_front_func_ops != stencil_func_ops; + system_constants_.edram_stencil_front_func_ops = stencil_func_ops; + + if (primitive_polygonal && normalized_depth_control.backface_enable) { + uint32_t stencil_back_reference_masks = + rb_stencilrefmask_bf.value & 0xFFFFFF; + dirty |= system_constants_.edram_stencil_back_reference_masks != + stencil_back_reference_masks; + system_constants_.edram_stencil_back_reference_masks = + stencil_back_reference_masks; + uint32_t stencil_func_ops_bf = + (normalized_depth_control.value >> 20) & ((1 << 12) - 1); + dirty |= system_constants_.edram_stencil_back_func_ops != 
+ stencil_func_ops_bf; + system_constants_.edram_stencil_back_func_ops = stencil_func_ops_bf; + } else { + dirty |= std::memcmp(system_constants_.edram_stencil_back, + system_constants_.edram_stencil_front, + 2 * sizeof(uint32_t)) != 0; + std::memcpy(system_constants_.edram_stencil_back, + system_constants_.edram_stencil_front, + 2 * sizeof(uint32_t)); + } + } + + dirty |= system_constants_.edram_blend_constant[0] != + regs[XE_GPU_REG_RB_BLEND_RED].f32; + system_constants_.edram_blend_constant[0] = + regs[XE_GPU_REG_RB_BLEND_RED].f32; + dirty |= system_constants_.edram_blend_constant[1] != + regs[XE_GPU_REG_RB_BLEND_GREEN].f32; + system_constants_.edram_blend_constant[1] = + regs[XE_GPU_REG_RB_BLEND_GREEN].f32; + dirty |= system_constants_.edram_blend_constant[2] != + regs[XE_GPU_REG_RB_BLEND_BLUE].f32; + system_constants_.edram_blend_constant[2] = + regs[XE_GPU_REG_RB_BLEND_BLUE].f32; + dirty |= system_constants_.edram_blend_constant[3] != + regs[XE_GPU_REG_RB_BLEND_ALPHA].f32; + system_constants_.edram_blend_constant[3] = + regs[XE_GPU_REG_RB_BLEND_ALPHA].f32; } if (dirty) { diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 7920981fb..8e1df02ef 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -256,6 +257,9 @@ class VulkanCommandProcessor : public CommandProcessor { void SetViewport(const VkViewport& viewport); void SetScissor(const VkRect2D& scissor); + // Returns the text to display in the GPU backend name in the window title. 
+ std::string GetWindowTitleText() const; + protected: bool SetupContext() override; void ShutdownContext() override; @@ -437,7 +441,8 @@ class VulkanCommandProcessor : public CommandProcessor { bool primitive_polygonal, const PrimitiveProcessor::ProcessingResult& primitive_processing_result, bool shader_32bit_index_dma, const draw_util::ViewportInfo& viewport_info, - uint32_t used_texture_mask); + uint32_t used_texture_mask, reg::RB_DEPTHCONTROL normalized_depth_control, + uint32_t normalized_color_mask); bool UpdateBindings(const VulkanShader* vertex_shader, const VulkanShader* pixel_shader); // Allocates a descriptor set and fills one or two VkWriteDescriptorSet @@ -514,12 +519,12 @@ class VulkanCommandProcessor : public CommandProcessor { // Descriptor set layouts used by different shaders. VkDescriptorSetLayout descriptor_set_layout_empty_ = VK_NULL_HANDLE; - VkDescriptorSetLayout descriptor_set_layout_shared_memory_and_edram_ = - VK_NULL_HANDLE; VkDescriptorSetLayout descriptor_set_layout_constants_ = VK_NULL_HANDLE; std::array descriptor_set_layouts_single_transient_{}; + VkDescriptorSetLayout descriptor_set_layout_shared_memory_and_edram_ = + VK_NULL_HANDLE; // Descriptor set layouts are referenced by pipeline_layouts_. 
std::unordered_map(command_processor()); + if (vulkan_command_processor != nullptr) { + return vulkan_command_processor->GetWindowTitleText(); + } + return "Vulkan - HEAVILY INCOMPLETE, early development"; +} + X_STATUS VulkanGraphicsSystem::Setup(cpu::Processor* processor, kernel::KernelState* kernel_state, ui::WindowedAppContext* app_context, diff --git a/src/xenia/gpu/vulkan/vulkan_graphics_system.h b/src/xenia/gpu/vulkan/vulkan_graphics_system.h index ae81e144c..e06892aa1 100644 --- a/src/xenia/gpu/vulkan/vulkan_graphics_system.h +++ b/src/xenia/gpu/vulkan/vulkan_graphics_system.h @@ -26,9 +26,7 @@ class VulkanGraphicsSystem : public GraphicsSystem { static bool IsAvailable() { return true; } - std::string name() const override { - return "Vulkan - HEAVILY INCOMPLETE, early development"; - } + std::string name() const override; X_STATUS Setup(cpu::Processor* processor, kernel::KernelState* kernel_state, ui::WindowedAppContext* app_context, diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc index aff800c1a..f1af57a23 100644 --- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include "third_party/fmt/include/fmt/format.h" #include "third_party/glslang/SPIRV/SpvBuilder.h" @@ -53,8 +54,32 @@ bool VulkanPipelineCache::Initialize() { const ui::vulkan::VulkanProvider& provider = command_processor_.GetVulkanProvider(); + bool edram_fragment_shader_interlock = + render_target_cache_.GetPath() == + RenderTargetCache::Path::kPixelShaderInterlock; + shader_translator_ = std::make_unique( - SpirvShaderTranslator::Features(provider)); + SpirvShaderTranslator::Features(provider), + render_target_cache_.msaa_2x_attachments_supported(), + render_target_cache_.msaa_2x_no_attachments_supported(), + edram_fragment_shader_interlock); + + if (edram_fragment_shader_interlock) { + std::vector depth_only_fragment_shader_code 
= + shader_translator_->CreateDepthOnlyFragmentShader(); + depth_only_fragment_shader_ = ui::vulkan::util::CreateShaderModule( + provider, + reinterpret_cast( + depth_only_fragment_shader_code.data()), + depth_only_fragment_shader_code.size()); + if (depth_only_fragment_shader_ == VK_NULL_HANDLE) { + XELOGE( + "VulkanPipelineCache: Failed to create the depth/stencil-only " + "fragment shader for the fragment shader interlock render backend " + "implementation"); + return false; + } + } return true; } @@ -75,6 +100,8 @@ void VulkanPipelineCache::Shutdown() { pipelines_.clear(); // Destroy all internal shaders. + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyShaderModule, device, + depth_only_fragment_shader_); for (const auto& geometry_shader_pair : geometry_shaders_) { if (geometry_shader_pair.second != VK_NULL_HANDLE) { dfn.vkDestroyShaderModule(device, geometry_shader_pair.second, nullptr); @@ -179,15 +206,18 @@ VulkanPipelineCache::GetCurrentPixelShaderModification( modification.pixel.param_gen_point = 0; } - using DepthStencilMode = - SpirvShaderTranslator::Modification::DepthStencilMode; - if (shader.implicit_early_z_write_allowed() && - (!shader.writes_color_target(0) || - !draw_util::DoesCoverageDependOnAlpha( - regs.Get()))) { - modification.pixel.depth_stencil_mode = DepthStencilMode::kEarlyHint; - } else { - modification.pixel.depth_stencil_mode = DepthStencilMode::kNoModifiers; + if (render_target_cache_.GetPath() == + RenderTargetCache::Path::kHostRenderTargets) { + using DepthStencilMode = + SpirvShaderTranslator::Modification::DepthStencilMode; + if (shader.implicit_early_z_write_allowed() && + (!shader.writes_color_target(0) || + !draw_util::DoesCoverageDependOnAlpha( + regs.Get()))) { + modification.pixel.depth_stencil_mode = DepthStencilMode::kEarlyHint; + } else { + modification.pixel.depth_stencil_mode = DepthStencilMode::kNoModifiers; + } } return modification; @@ -303,7 +333,11 @@ bool VulkanPipelineCache::ConfigurePipeline( } } 
VkRenderPass render_pass = - render_target_cache_.GetRenderPass(render_pass_key); + render_target_cache_.GetPath() == + RenderTargetCache::Path::kPixelShaderInterlock + ? render_target_cache_.GetFragmentShaderInterlockRenderPass() + : render_target_cache_.GetHostRenderTargetsRenderPass( + render_pass_key); if (render_pass == VK_NULL_HANDLE) { return false; } @@ -603,123 +637,127 @@ bool VulkanPipelineCache::GetCurrentStateDescription( description_out.polygon_mode = PipelinePolygonMode::kFill; } - // TODO(Triang3l): Skip depth / stencil and color state for the fragment - // shader interlock RB implementation. - - if (render_pass_key.depth_and_color_used & 1) { - if (normalized_depth_control.z_enable) { - description_out.depth_write_enable = - normalized_depth_control.z_write_enable; - description_out.depth_compare_op = normalized_depth_control.zfunc; - } else { - description_out.depth_compare_op = xenos::CompareFunction::kAlways; - } - if (normalized_depth_control.stencil_enable) { - description_out.stencil_test_enable = 1; - description_out.stencil_front_fail_op = - normalized_depth_control.stencilfail; - description_out.stencil_front_pass_op = - normalized_depth_control.stencilzpass; - description_out.stencil_front_depth_fail_op = - normalized_depth_control.stencilzfail; - description_out.stencil_front_compare_op = - normalized_depth_control.stencilfunc; - if (primitive_polygonal && normalized_depth_control.backface_enable) { - description_out.stencil_back_fail_op = - normalized_depth_control.stencilfail_bf; - description_out.stencil_back_pass_op = - normalized_depth_control.stencilzpass_bf; - description_out.stencil_back_depth_fail_op = - normalized_depth_control.stencilzfail_bf; - description_out.stencil_back_compare_op = - normalized_depth_control.stencilfunc_bf; + if (render_target_cache_.GetPath() == + RenderTargetCache::Path::kHostRenderTargets) { + if (render_pass_key.depth_and_color_used & 1) { + if (normalized_depth_control.z_enable) { + 
description_out.depth_write_enable = + normalized_depth_control.z_write_enable; + description_out.depth_compare_op = normalized_depth_control.zfunc; } else { - description_out.stencil_back_fail_op = - description_out.stencil_front_fail_op; - description_out.stencil_back_pass_op = - description_out.stencil_front_pass_op; - description_out.stencil_back_depth_fail_op = - description_out.stencil_front_depth_fail_op; - description_out.stencil_back_compare_op = - description_out.stencil_front_compare_op; + description_out.depth_compare_op = xenos::CompareFunction::kAlways; + } + if (normalized_depth_control.stencil_enable) { + description_out.stencil_test_enable = 1; + description_out.stencil_front_fail_op = + normalized_depth_control.stencilfail; + description_out.stencil_front_pass_op = + normalized_depth_control.stencilzpass; + description_out.stencil_front_depth_fail_op = + normalized_depth_control.stencilzfail; + description_out.stencil_front_compare_op = + normalized_depth_control.stencilfunc; + if (primitive_polygonal && normalized_depth_control.backface_enable) { + description_out.stencil_back_fail_op = + normalized_depth_control.stencilfail_bf; + description_out.stencil_back_pass_op = + normalized_depth_control.stencilzpass_bf; + description_out.stencil_back_depth_fail_op = + normalized_depth_control.stencilzfail_bf; + description_out.stencil_back_compare_op = + normalized_depth_control.stencilfunc_bf; + } else { + description_out.stencil_back_fail_op = + description_out.stencil_front_fail_op; + description_out.stencil_back_pass_op = + description_out.stencil_front_pass_op; + description_out.stencil_back_depth_fail_op = + description_out.stencil_front_depth_fail_op; + description_out.stencil_back_compare_op = + description_out.stencil_front_compare_op; + } } } - } - // Color blending and write masks (filled only for the attachments present in - // the render pass object). 
- uint32_t render_pass_color_rts = render_pass_key.depth_and_color_used >> 1; - if (device_features.independentBlend) { - uint32_t render_pass_color_rts_remaining = render_pass_color_rts; - uint32_t color_rt_index; - while (xe::bit_scan_forward(render_pass_color_rts_remaining, - &color_rt_index)) { - render_pass_color_rts_remaining &= ~(uint32_t(1) << color_rt_index); - WritePipelineRenderTargetDescription( - regs.Get( - reg::RB_BLENDCONTROL::rt_register_indices[color_rt_index]), - (normalized_color_mask >> (color_rt_index * 4)) & 0b1111, - description_out.render_targets[color_rt_index]); - } - } else { - // Take the blend control for the first render target that the guest wants - // to write to (consider it the most important) and use it for all render - // targets, if any. - // TODO(Triang3l): Implement an option for independent blending via multiple - // draw calls with different pipelines maybe? Though independent blending - // support is pretty wide, with a quite prominent exception of Adreno 4xx - // apparently. - uint32_t render_pass_color_rts_remaining = render_pass_color_rts; - uint32_t render_pass_first_color_rt_index; - if (xe::bit_scan_forward(render_pass_color_rts_remaining, - &render_pass_first_color_rt_index)) { - render_pass_color_rts_remaining &= - ~(uint32_t(1) << render_pass_first_color_rt_index); - PipelineRenderTarget& render_pass_first_color_rt = - description_out.render_targets[render_pass_first_color_rt_index]; - uint32_t common_blend_rt_index; - if (xe::bit_scan_forward(normalized_color_mask, &common_blend_rt_index)) { - common_blend_rt_index >>= 2; - // If a common write mask will be used for multiple render targets, use - // the original RB_COLOR_MASK instead of the normalized color mask as - // the normalized color mask has non-existent components forced to - // written (don't need reading to be preserved), while the number of - // components may vary between render targets. 
The attachments in the - // pass that must not be written to at all will be excluded via a shader - // modification. - WritePipelineRenderTargetDescription( - regs.Get( - reg::RB_BLENDCONTROL::rt_register_indices - [common_blend_rt_index]), - (((normalized_color_mask & - ~(uint32_t(0b1111) << (4 * common_blend_rt_index))) - ? regs[XE_GPU_REG_RB_COLOR_MASK].u32 - : normalized_color_mask) >> - (4 * common_blend_rt_index)) & - 0b1111, - render_pass_first_color_rt); - } else { - // No render targets are written to, though the render pass still may - // contain color attachments - set them to not written and not blending. - render_pass_first_color_rt.src_color_blend_factor = - PipelineBlendFactor::kOne; - render_pass_first_color_rt.dst_color_blend_factor = - PipelineBlendFactor::kZero; - render_pass_first_color_rt.color_blend_op = xenos::BlendOp::kAdd; - render_pass_first_color_rt.src_alpha_blend_factor = - PipelineBlendFactor::kOne; - render_pass_first_color_rt.dst_alpha_blend_factor = - PipelineBlendFactor::kZero; - render_pass_first_color_rt.alpha_blend_op = xenos::BlendOp::kAdd; - } - // Reuse the same blending settings for all render targets in the pass, - // for description consistency. + // Color blending and write masks (filled only for the attachments present + // in the render pass object). 
+ uint32_t render_pass_color_rts = render_pass_key.depth_and_color_used >> 1; + if (device_features.independentBlend) { + uint32_t render_pass_color_rts_remaining = render_pass_color_rts; uint32_t color_rt_index; while (xe::bit_scan_forward(render_pass_color_rts_remaining, &color_rt_index)) { render_pass_color_rts_remaining &= ~(uint32_t(1) << color_rt_index); - description_out.render_targets[color_rt_index] = - render_pass_first_color_rt; + WritePipelineRenderTargetDescription( + regs.Get( + reg::RB_BLENDCONTROL::rt_register_indices[color_rt_index]), + (normalized_color_mask >> (color_rt_index * 4)) & 0b1111, + description_out.render_targets[color_rt_index]); + } + } else { + // Take the blend control for the first render target that the guest wants + // to write to (consider it the most important) and use it for all render + // targets, if any. + // TODO(Triang3l): Implement an option for independent blending via + // replaying the render pass for each set of render targets with unique + // blending parameters, with depth / stencil saved before the first and + // restored before each of the rest maybe? Though independent blending + // support is pretty wide, with a quite prominent exception of Adreno 4xx + // apparently. 
+ uint32_t render_pass_color_rts_remaining = render_pass_color_rts; + uint32_t render_pass_first_color_rt_index; + if (xe::bit_scan_forward(render_pass_color_rts_remaining, + &render_pass_first_color_rt_index)) { + render_pass_color_rts_remaining &= + ~(uint32_t(1) << render_pass_first_color_rt_index); + PipelineRenderTarget& render_pass_first_color_rt = + description_out.render_targets[render_pass_first_color_rt_index]; + uint32_t common_blend_rt_index; + if (xe::bit_scan_forward(normalized_color_mask, + &common_blend_rt_index)) { + common_blend_rt_index >>= 2; + // If a common write mask will be used for multiple render targets, + // use the original RB_COLOR_MASK instead of the normalized color mask + // as the normalized color mask has non-existent components forced to + // written (don't need reading to be preserved), while the number of + // components may vary between render targets. The attachments in the + // pass that must not be written to at all will be excluded via a + // shader modification. + WritePipelineRenderTargetDescription( + regs.Get( + reg::RB_BLENDCONTROL::rt_register_indices + [common_blend_rt_index]), + (((normalized_color_mask & + ~(uint32_t(0b1111) << (4 * common_blend_rt_index))) + ? regs[XE_GPU_REG_RB_COLOR_MASK].u32 + : normalized_color_mask) >> + (4 * common_blend_rt_index)) & + 0b1111, + render_pass_first_color_rt); + } else { + // No render targets are written to, though the render pass still may + // contain color attachments - set them to not written and not + // blending. 
+ render_pass_first_color_rt.src_color_blend_factor = + PipelineBlendFactor::kOne; + render_pass_first_color_rt.dst_color_blend_factor = + PipelineBlendFactor::kZero; + render_pass_first_color_rt.color_blend_op = xenos::BlendOp::kAdd; + render_pass_first_color_rt.src_alpha_blend_factor = + PipelineBlendFactor::kOne; + render_pass_first_color_rt.dst_alpha_blend_factor = + PipelineBlendFactor::kZero; + render_pass_first_color_rt.alpha_blend_op = xenos::BlendOp::kAdd; + } + // Reuse the same blending settings for all render targets in the pass, + // for description consistency. + uint32_t color_rt_index; + while (xe::bit_scan_forward(render_pass_color_rts_remaining, + &color_rt_index)) { + render_pass_color_rts_remaining &= ~(uint32_t(1) << color_rt_index); + description_out.render_targets[color_rt_index] = + render_pass_first_color_rt; + } } } } @@ -1929,6 +1967,10 @@ bool VulkanPipelineCache::EnsurePipelineCreated( command_processor_.GetVulkanProvider(); const VkPhysicalDeviceFeatures& device_features = provider.device_features(); + bool edram_fragment_shader_interlock = + render_target_cache_.GetPath() == + RenderTargetCache::Path::kPixelShaderInterlock; + std::array shader_stages; uint32_t shader_stage_count = 0; @@ -1962,24 +2004,32 @@ bool VulkanPipelineCache::EnsurePipelineCreated( shader_stage_geometry.pName = "main"; shader_stage_geometry.pSpecializationInfo = nullptr; } - // Pixel shader. + // Fragment shader. 
+ VkPipelineShaderStageCreateInfo& shader_stage_fragment = + shader_stages[shader_stage_count++]; + shader_stage_fragment.sType = + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shader_stage_fragment.pNext = nullptr; + shader_stage_fragment.flags = 0; + shader_stage_fragment.stage = VK_SHADER_STAGE_FRAGMENT_BIT; + shader_stage_fragment.module = VK_NULL_HANDLE; + shader_stage_fragment.pName = "main"; + shader_stage_fragment.pSpecializationInfo = nullptr; if (creation_arguments.pixel_shader) { assert_true(creation_arguments.pixel_shader->is_translated()); if (!creation_arguments.pixel_shader->is_valid()) { return false; } - VkPipelineShaderStageCreateInfo& shader_stage_fragment = - shader_stages[shader_stage_count++]; - shader_stage_fragment.sType = - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shader_stage_fragment.pNext = nullptr; - shader_stage_fragment.flags = 0; - shader_stage_fragment.stage = VK_SHADER_STAGE_FRAGMENT_BIT; shader_stage_fragment.module = creation_arguments.pixel_shader->shader_module(); assert_true(shader_stage_fragment.module != VK_NULL_HANDLE); - shader_stage_fragment.pName = "main"; - shader_stage_fragment.pSpecializationInfo = nullptr; + } else { + if (edram_fragment_shader_interlock) { + shader_stage_fragment.module = depth_only_fragment_shader_; + } + } + if (shader_stage_fragment.module == VK_NULL_HANDLE) { + --shader_stage_count; } VkPipelineVertexInputStateCreateInfo vertex_input_state = {}; @@ -2087,11 +2137,11 @@ bool VulkanPipelineCache::EnsurePipelineCreated( // formula, though Z has no effect on anything if a depth attachment is not // used (the guest shader can't access Z), enabling only when there's a // depth / stencil attachment for correctness. - // TODO(Triang3l): Disable the depth bias for the fragment shader interlock RB - // implementation. rasterization_state.depthBiasEnable = - (description.render_pass_key.depth_and_color_used & 0b1) ? 
VK_TRUE - : VK_FALSE; + (!edram_fragment_shader_interlock && + (description.render_pass_key.depth_and_color_used & 0b1)) + ? VK_TRUE + : VK_FALSE; // TODO(Triang3l): Wide lines. rasterization_state.lineWidth = 1.0f; @@ -2101,6 +2151,7 @@ bool VulkanPipelineCache::EnsurePipelineCreated( VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; if (description.render_pass_key.msaa_samples == xenos::MsaaSamples::k2X && !render_target_cache_.IsMsaa2xSupported( + !edram_fragment_shader_interlock && description.render_pass_key.depth_and_color_used != 0)) { // Using sample 0 as 0 and 3 as 1 for 2x instead (not exactly the same // sample locations, but still top-left and bottom-right - however, this can @@ -2119,126 +2170,131 @@ bool VulkanPipelineCache::EnsurePipelineCreated( depth_stencil_state.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; depth_stencil_state.pNext = nullptr; - if (description.depth_write_enable || - description.depth_compare_op != xenos::CompareFunction::kAlways) { - depth_stencil_state.depthTestEnable = VK_TRUE; - depth_stencil_state.depthWriteEnable = - description.depth_write_enable ? 
VK_TRUE : VK_FALSE; - depth_stencil_state.depthCompareOp = VkCompareOp( - uint32_t(VK_COMPARE_OP_NEVER) + uint32_t(description.depth_compare_op)); - } - if (description.stencil_test_enable) { - depth_stencil_state.stencilTestEnable = VK_TRUE; - depth_stencil_state.front.failOp = - VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + - uint32_t(description.stencil_front_fail_op)); - depth_stencil_state.front.passOp = - VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + - uint32_t(description.stencil_front_pass_op)); - depth_stencil_state.front.depthFailOp = - VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + - uint32_t(description.stencil_front_depth_fail_op)); - depth_stencil_state.front.compareOp = - VkCompareOp(uint32_t(VK_COMPARE_OP_NEVER) + - uint32_t(description.stencil_front_compare_op)); - depth_stencil_state.back.failOp = - VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + - uint32_t(description.stencil_back_fail_op)); - depth_stencil_state.back.passOp = - VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + - uint32_t(description.stencil_back_pass_op)); - depth_stencil_state.back.depthFailOp = - VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + - uint32_t(description.stencil_back_depth_fail_op)); - depth_stencil_state.back.compareOp = - VkCompareOp(uint32_t(VK_COMPARE_OP_NEVER) + - uint32_t(description.stencil_back_compare_op)); - } - - VkPipelineColorBlendAttachmentState - color_blend_attachments[xenos::kMaxColorRenderTargets] = {}; - uint32_t color_rts_used = - description.render_pass_key.depth_and_color_used >> 1; - { - static const VkBlendFactor kBlendFactorMap[] = { - VK_BLEND_FACTOR_ZERO, - VK_BLEND_FACTOR_ONE, - VK_BLEND_FACTOR_SRC_COLOR, - VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR, - VK_BLEND_FACTOR_DST_COLOR, - VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR, - VK_BLEND_FACTOR_SRC_ALPHA, - VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, - VK_BLEND_FACTOR_DST_ALPHA, - VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA, - VK_BLEND_FACTOR_CONSTANT_COLOR, - VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR, - VK_BLEND_FACTOR_CONSTANT_ALPHA, - 
VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA, - VK_BLEND_FACTOR_SRC_ALPHA_SATURATE, - }; - // 8 entries for safety since 3 bits from the guest are passed directly. - static const VkBlendOp kBlendOpMap[] = {VK_BLEND_OP_ADD, - VK_BLEND_OP_SUBTRACT, - VK_BLEND_OP_MIN, - VK_BLEND_OP_MAX, - VK_BLEND_OP_REVERSE_SUBTRACT, - VK_BLEND_OP_ADD, - VK_BLEND_OP_ADD, - VK_BLEND_OP_ADD}; - uint32_t color_rts_remaining = color_rts_used; - uint32_t color_rt_index; - while (xe::bit_scan_forward(color_rts_remaining, &color_rt_index)) { - color_rts_remaining &= ~(uint32_t(1) << color_rt_index); - VkPipelineColorBlendAttachmentState& color_blend_attachment = - color_blend_attachments[color_rt_index]; - const PipelineRenderTarget& color_rt = - description.render_targets[color_rt_index]; - if (color_rt.src_color_blend_factor != PipelineBlendFactor::kOne || - color_rt.dst_color_blend_factor != PipelineBlendFactor::kZero || - color_rt.color_blend_op != xenos::BlendOp::kAdd || - color_rt.src_alpha_blend_factor != PipelineBlendFactor::kOne || - color_rt.dst_alpha_blend_factor != PipelineBlendFactor::kZero || - color_rt.alpha_blend_op != xenos::BlendOp::kAdd) { - color_blend_attachment.blendEnable = VK_TRUE; - color_blend_attachment.srcColorBlendFactor = - kBlendFactorMap[uint32_t(color_rt.src_color_blend_factor)]; - color_blend_attachment.dstColorBlendFactor = - kBlendFactorMap[uint32_t(color_rt.dst_color_blend_factor)]; - color_blend_attachment.colorBlendOp = - kBlendOpMap[uint32_t(color_rt.color_blend_op)]; - color_blend_attachment.srcAlphaBlendFactor = - kBlendFactorMap[uint32_t(color_rt.src_alpha_blend_factor)]; - color_blend_attachment.dstAlphaBlendFactor = - kBlendFactorMap[uint32_t(color_rt.dst_alpha_blend_factor)]; - color_blend_attachment.alphaBlendOp = - kBlendOpMap[uint32_t(color_rt.alpha_blend_op)]; - } - color_blend_attachment.colorWriteMask = - VkColorComponentFlags(color_rt.color_write_mask); - if (!device_features.independentBlend) { - // For non-independent blend, the 
pAttachments element for the first - // actually used color will be replicated into all. - break; - } + if (!edram_fragment_shader_interlock) { + if (description.depth_write_enable || + description.depth_compare_op != xenos::CompareFunction::kAlways) { + depth_stencil_state.depthTestEnable = VK_TRUE; + depth_stencil_state.depthWriteEnable = + description.depth_write_enable ? VK_TRUE : VK_FALSE; + depth_stencil_state.depthCompareOp = + VkCompareOp(uint32_t(VK_COMPARE_OP_NEVER) + + uint32_t(description.depth_compare_op)); + } + if (description.stencil_test_enable) { + depth_stencil_state.stencilTestEnable = VK_TRUE; + depth_stencil_state.front.failOp = + VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + + uint32_t(description.stencil_front_fail_op)); + depth_stencil_state.front.passOp = + VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + + uint32_t(description.stencil_front_pass_op)); + depth_stencil_state.front.depthFailOp = + VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + + uint32_t(description.stencil_front_depth_fail_op)); + depth_stencil_state.front.compareOp = + VkCompareOp(uint32_t(VK_COMPARE_OP_NEVER) + + uint32_t(description.stencil_front_compare_op)); + depth_stencil_state.back.failOp = + VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + + uint32_t(description.stencil_back_fail_op)); + depth_stencil_state.back.passOp = + VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + + uint32_t(description.stencil_back_pass_op)); + depth_stencil_state.back.depthFailOp = + VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + + uint32_t(description.stencil_back_depth_fail_op)); + depth_stencil_state.back.compareOp = + VkCompareOp(uint32_t(VK_COMPARE_OP_NEVER) + + uint32_t(description.stencil_back_compare_op)); } } + VkPipelineColorBlendStateCreateInfo color_blend_state = {}; color_blend_state.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - color_blend_state.attachmentCount = 32 - xe::lzcnt(color_rts_used); - color_blend_state.pAttachments = color_blend_attachments; - if (color_rts_used && 
!device_features.independentBlend) { - // "If the independent blending feature is not enabled, all elements of - // pAttachments must be identical." - uint32_t first_color_rt_index; - xe::bit_scan_forward(color_rts_used, &first_color_rt_index); - for (uint32_t i = 0; i < color_blend_state.attachmentCount; ++i) { - if (i == first_color_rt_index) { - continue; + VkPipelineColorBlendAttachmentState + color_blend_attachments[xenos::kMaxColorRenderTargets] = {}; + if (!edram_fragment_shader_interlock) { + uint32_t color_rts_used = + description.render_pass_key.depth_and_color_used >> 1; + { + static const VkBlendFactor kBlendFactorMap[] = { + VK_BLEND_FACTOR_ZERO, + VK_BLEND_FACTOR_ONE, + VK_BLEND_FACTOR_SRC_COLOR, + VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR, + VK_BLEND_FACTOR_DST_COLOR, + VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR, + VK_BLEND_FACTOR_SRC_ALPHA, + VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + VK_BLEND_FACTOR_DST_ALPHA, + VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA, + VK_BLEND_FACTOR_CONSTANT_COLOR, + VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR, + VK_BLEND_FACTOR_CONSTANT_ALPHA, + VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA, + VK_BLEND_FACTOR_SRC_ALPHA_SATURATE, + }; + // 8 entries for safety since 3 bits from the guest are passed directly. 
+ static const VkBlendOp kBlendOpMap[] = {VK_BLEND_OP_ADD, + VK_BLEND_OP_SUBTRACT, + VK_BLEND_OP_MIN, + VK_BLEND_OP_MAX, + VK_BLEND_OP_REVERSE_SUBTRACT, + VK_BLEND_OP_ADD, + VK_BLEND_OP_ADD, + VK_BLEND_OP_ADD}; + uint32_t color_rts_remaining = color_rts_used; + uint32_t color_rt_index; + while (xe::bit_scan_forward(color_rts_remaining, &color_rt_index)) { + color_rts_remaining &= ~(uint32_t(1) << color_rt_index); + VkPipelineColorBlendAttachmentState& color_blend_attachment = + color_blend_attachments[color_rt_index]; + const PipelineRenderTarget& color_rt = + description.render_targets[color_rt_index]; + if (color_rt.src_color_blend_factor != PipelineBlendFactor::kOne || + color_rt.dst_color_blend_factor != PipelineBlendFactor::kZero || + color_rt.color_blend_op != xenos::BlendOp::kAdd || + color_rt.src_alpha_blend_factor != PipelineBlendFactor::kOne || + color_rt.dst_alpha_blend_factor != PipelineBlendFactor::kZero || + color_rt.alpha_blend_op != xenos::BlendOp::kAdd) { + color_blend_attachment.blendEnable = VK_TRUE; + color_blend_attachment.srcColorBlendFactor = + kBlendFactorMap[uint32_t(color_rt.src_color_blend_factor)]; + color_blend_attachment.dstColorBlendFactor = + kBlendFactorMap[uint32_t(color_rt.dst_color_blend_factor)]; + color_blend_attachment.colorBlendOp = + kBlendOpMap[uint32_t(color_rt.color_blend_op)]; + color_blend_attachment.srcAlphaBlendFactor = + kBlendFactorMap[uint32_t(color_rt.src_alpha_blend_factor)]; + color_blend_attachment.dstAlphaBlendFactor = + kBlendFactorMap[uint32_t(color_rt.dst_alpha_blend_factor)]; + color_blend_attachment.alphaBlendOp = + kBlendOpMap[uint32_t(color_rt.alpha_blend_op)]; + } + color_blend_attachment.colorWriteMask = + VkColorComponentFlags(color_rt.color_write_mask); + if (!device_features.independentBlend) { + // For non-independent blend, the pAttachments element for the first + // actually used color will be replicated into all. 
+ break; + } + } + } + color_blend_state.attachmentCount = 32 - xe::lzcnt(color_rts_used); + color_blend_state.pAttachments = color_blend_attachments; + if (color_rts_used && !device_features.independentBlend) { + // "If the independent blending feature is not enabled, all elements of + // pAttachments must be identical." + uint32_t first_color_rt_index; + xe::bit_scan_forward(color_rts_used, &first_color_rt_index); + for (uint32_t i = 0; i < color_blend_state.attachmentCount; ++i) { + if (i == first_color_rt_index) { + continue; + } + color_blend_attachments[i] = + color_blend_attachments[first_color_rt_index]; } - color_blend_attachments[i] = - color_blend_attachments[first_color_rt_index]; } } @@ -2255,16 +2311,18 @@ bool VulkanPipelineCache::EnsurePipelineCreated( // invalidated (again, even if it has no effect). dynamic_states[dynamic_state.dynamicStateCount++] = VK_DYNAMIC_STATE_VIEWPORT; dynamic_states[dynamic_state.dynamicStateCount++] = VK_DYNAMIC_STATE_SCISSOR; - dynamic_states[dynamic_state.dynamicStateCount++] = - VK_DYNAMIC_STATE_DEPTH_BIAS; - dynamic_states[dynamic_state.dynamicStateCount++] = - VK_DYNAMIC_STATE_BLEND_CONSTANTS; - dynamic_states[dynamic_state.dynamicStateCount++] = - VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK; - dynamic_states[dynamic_state.dynamicStateCount++] = - VK_DYNAMIC_STATE_STENCIL_WRITE_MASK; - dynamic_states[dynamic_state.dynamicStateCount++] = - VK_DYNAMIC_STATE_STENCIL_REFERENCE; + if (!edram_fragment_shader_interlock) { + dynamic_states[dynamic_state.dynamicStateCount++] = + VK_DYNAMIC_STATE_DEPTH_BIAS; + dynamic_states[dynamic_state.dynamicStateCount++] = + VK_DYNAMIC_STATE_BLEND_CONSTANTS; + dynamic_states[dynamic_state.dynamicStateCount++] = + VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK; + dynamic_states[dynamic_state.dynamicStateCount++] = + VK_DYNAMIC_STATE_STENCIL_WRITE_MASK; + dynamic_states[dynamic_state.dynamicStateCount++] = + VK_DYNAMIC_STATE_STENCIL_REFERENCE; + } VkGraphicsPipelineCreateInfo pipeline_create_info; 
pipeline_create_info.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h index 56346d1bc..09a26caa4 100644 --- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h +++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h @@ -314,6 +314,10 @@ class VulkanPipelineCache { GeometryShaderKey::Hasher> geometry_shaders_; + // Empty depth-only pixel shader for writing to depth buffer using fragment + // shader interlock when no Xenos pixel shader provided. + VkShaderModule depth_only_fragment_shader_ = VK_NULL_HANDLE; + std::unordered_map pipelines_; diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc index 92efc4189..8113827e5 100644 --- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc @@ -22,6 +22,7 @@ #include "third_party/glslang/SPIRV/GLSL.std.450.h" #include "third_party/glslang/SPIRV/SpvBuilder.h" #include "xenia/base/assert.h" +#include "xenia/base/cvar.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/gpu/draw_util.h" @@ -33,6 +34,27 @@ #include "xenia/gpu/xenos.h" #include "xenia/ui/vulkan/vulkan_util.h" +DEFINE_string( + render_target_path_vulkan, "", + "Render target emulation path to use on Vulkan.\n" + "Use: [any, fbo, fsi]\n" + " fbo:\n" + " Host framebuffers and fixed-function blending and depth / stencil " + "testing, copying between render targets when needed.\n" + " Lower accuracy (limited pixel format support).\n" + " Performance limited primarily by render target layout changes requiring " + "copying, but generally higher.\n" + " fsi:\n" + " Manual pixel packing, blending and depth / stencil testing, with free " + "render target layout changes.\n" + " Requires a GPU supporting fragment shader interlock.\n" + " Highest accuracy (all pixel formats handled in software).\n" + " Performance limited primarily 
by overdraw.\n" + " Any other value:\n" + " Choose what is considered the most optimal for the system (currently " + "always FB because the FSI path is much slower now).", + "GPU"); + namespace xe { namespace gpu { namespace vulkan { @@ -43,6 +65,10 @@ namespace shaders { #include "xenia/gpu/shaders/bytecode/vulkan_spirv/host_depth_store_2xmsaa_cs.h" #include "xenia/gpu/shaders/bytecode/vulkan_spirv/host_depth_store_4xmsaa_cs.h" #include "xenia/gpu/shaders/bytecode/vulkan_spirv/passthrough_position_xy_vs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/resolve_clear_32bpp_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/resolve_clear_32bpp_scaled_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/resolve_clear_64bpp_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/resolve_clear_64bpp_scaled_cs.h" #include "xenia/gpu/shaders/bytecode/vulkan_spirv/resolve_fast_32bpp_1x2xmsaa_cs.h" #include "xenia/gpu/shaders/bytecode/vulkan_spirv/resolve_fast_32bpp_1x2xmsaa_scaled_cs.h" #include "xenia/gpu/shaders/bytecode/vulkan_spirv/resolve_fast_32bpp_4xmsaa_cs.h" @@ -180,13 +206,61 @@ VulkanRenderTargetCache::VulkanRenderTargetCache( VulkanRenderTargetCache::~VulkanRenderTargetCache() { Shutdown(true); } -bool VulkanRenderTargetCache::Initialize() { +bool VulkanRenderTargetCache::Initialize(uint32_t shared_memory_binding_count) { const ui::vulkan::VulkanProvider& provider = command_processor_.GetVulkanProvider(); const ui::vulkan::VulkanProvider::InstanceFunctions& ifn = provider.ifn(); VkPhysicalDevice physical_device = provider.physical_device(); const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); VkDevice device = provider.device(); + const VkPhysicalDeviceLimits& device_limits = + provider.device_properties().limits; + + if (cvars::render_target_path_vulkan == "fsi") { + path_ = Path::kPixelShaderInterlock; + } else { + path_ = Path::kHostRenderTargets; + } + // Fragment shader interlock is a feature implemented by pretty 
advanced GPUs, + // closer to Direct3D 11 / OpenGL ES 3.2 level mainly, not Direct3D 10 / + // OpenGL ES 3.1. Thus, it's fine to demand a wide range of other optional + // features for the fragment shader interlock backend to work. + if (path_ == Path::kPixelShaderInterlock) { + const VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT& + device_fragment_shader_interlock_features = + provider.device_fragment_shader_interlock_features(); + const VkPhysicalDeviceFeatures& device_features = + provider.device_features(); + // Interlocking between fragments with common sample coverage is enough, but + // interlocking more is acceptable too (fragmentShaderShadingRateInterlock + // would be okay too, but it's unlikely that an implementation would + // advertise only it and not any other ones, as it's a very specific feature + // interacting with another optional feature that is variable shading rate, + // so there's no need to overcomplicate the checks and the shader execution + // mode setting). + // Sample-rate shading is required by certain SPIR-V revisions to access the + // sample mask fragment shader input. + // Standard sample locations are needed for calculating the depth at the + // samples. + // It's unlikely that a device exposing fragment shader interlock won't have + // a large enough storage buffer range and a sufficient SSBO slot count for + // all the shared memory buffers and the EDRAM buffer - and in a conflict + // between, for instance, the ability to vfetch and memexport in fragment + // shaders, and the usage of fragment shader interlock, prefer the former + // for simplicity. 
+ if (!provider.device_extensions().ext_fragment_shader_interlock || + !(device_fragment_shader_interlock_features + .fragmentShaderSampleInterlock || + device_fragment_shader_interlock_features + .fragmentShaderPixelInterlock) || + !device_features.fragmentStoresAndAtomics || + !device_features.sampleRateShading || + !device_limits.standardSampleLocations || + shared_memory_binding_count >= + device_limits.maxDescriptorSetStorageBuffers) { + path_ = Path::kHostRenderTargets; + } + } // Format support. constexpr VkFormatFeatureFlags kUsedDepthFormatFeatures = @@ -199,6 +273,30 @@ bool VulkanRenderTargetCache::Initialize() { (depth_unorm24_properties.optimalTilingFeatures & kUsedDepthFormatFeatures) == kUsedDepthFormatFeatures; + // 2x MSAA support. + // TODO(Triang3l): Handle sampledImageIntegerSampleCounts 4 not supported in + // transfers. + if (cvars::native_2x_msaa) { + // Multisampled integer sampled images are optional in Vulkan and in Xenia. + msaa_2x_attachments_supported_ = + (device_limits.framebufferColorSampleCounts & + device_limits.framebufferDepthSampleCounts & + device_limits.framebufferStencilSampleCounts & + device_limits.sampledImageColorSampleCounts & + device_limits.sampledImageDepthSampleCounts & + device_limits.sampledImageStencilSampleCounts & + VK_SAMPLE_COUNT_2_BIT) && + (device_limits.sampledImageIntegerSampleCounts & + (VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT)) != + VK_SAMPLE_COUNT_4_BIT; + msaa_2x_no_attachments_supported_ = + (device_limits.framebufferNoAttachmentsSampleCounts & + VK_SAMPLE_COUNT_2_BIT) != 0; + } else { + msaa_2x_attachments_supported_ = false; + msaa_2x_no_attachments_supported_ = false; + } + // Descriptor set layouts. VkDescriptorSetLayoutBinding descriptor_set_layout_bindings[2]; descriptor_set_layout_bindings[0].binding = 0; @@ -429,227 +527,355 @@ bool VulkanRenderTargetCache::Initialize() { // TODO(Triang3l): All paths (FSI). 
- depth_float24_round_ = cvars::depth_float24_round; + if (path_ == Path::kHostRenderTargets) { + // Host render targets. - // TODO(Triang3l): Handle sampledImageIntegerSampleCounts 4 not supported in - // transfers. - if (cvars::native_2x_msaa) { - const VkPhysicalDeviceLimits& device_limits = - provider.device_properties().limits; - // Multisampled integer sampled images are optional in Vulkan and in Xenia. - msaa_2x_attachments_supported_ = - (device_limits.framebufferColorSampleCounts & - device_limits.framebufferDepthSampleCounts & - device_limits.framebufferStencilSampleCounts & - device_limits.sampledImageColorSampleCounts & - device_limits.sampledImageDepthSampleCounts & - device_limits.sampledImageStencilSampleCounts & - VK_SAMPLE_COUNT_2_BIT) && - (device_limits.sampledImageIntegerSampleCounts & - (VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT)) != - VK_SAMPLE_COUNT_4_BIT; - msaa_2x_no_attachments_supported_ = - (device_limits.framebufferNoAttachmentsSampleCounts & - VK_SAMPLE_COUNT_2_BIT) != 0; - } else { - msaa_2x_attachments_supported_ = false; - msaa_2x_no_attachments_supported_ = false; - } + depth_float24_round_ = cvars::depth_float24_round; - // Host depth storing pipeline layout. - VkDescriptorSetLayout host_depth_store_descriptor_set_layouts[] = { - // Destination EDRAM storage buffer. - descriptor_set_layout_storage_buffer_, - // Source depth / stencil texture (only depth is used). 
- descriptor_set_layout_sampled_image_x2_, - }; - VkPushConstantRange host_depth_store_push_constant_range; - host_depth_store_push_constant_range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - host_depth_store_push_constant_range.offset = 0; - host_depth_store_push_constant_range.size = sizeof(HostDepthStoreConstants); - VkPipelineLayoutCreateInfo host_depth_store_pipeline_layout_create_info; - host_depth_store_pipeline_layout_create_info.sType = - VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - host_depth_store_pipeline_layout_create_info.pNext = nullptr; - host_depth_store_pipeline_layout_create_info.flags = 0; - host_depth_store_pipeline_layout_create_info.setLayoutCount = - uint32_t(xe::countof(host_depth_store_descriptor_set_layouts)); - host_depth_store_pipeline_layout_create_info.pSetLayouts = - host_depth_store_descriptor_set_layouts; - host_depth_store_pipeline_layout_create_info.pushConstantRangeCount = 1; - host_depth_store_pipeline_layout_create_info.pPushConstantRanges = - &host_depth_store_push_constant_range; - if (dfn.vkCreatePipelineLayout( - device, &host_depth_store_pipeline_layout_create_info, nullptr, - &host_depth_store_pipeline_layout_) != VK_SUCCESS) { - XELOGE( - "VulkanRenderTargetCache: Failed to create the host depth storing " - "pipeline layout"); - Shutdown(); - return false; - } - const std::pair host_depth_store_shaders[] = { - {shaders::host_depth_store_1xmsaa_cs, - sizeof(shaders::host_depth_store_1xmsaa_cs)}, - {shaders::host_depth_store_2xmsaa_cs, - sizeof(shaders::host_depth_store_2xmsaa_cs)}, - {shaders::host_depth_store_4xmsaa_cs, - sizeof(shaders::host_depth_store_4xmsaa_cs)}, - }; - for (size_t i = 0; i < xe::countof(host_depth_store_shaders); ++i) { - const std::pair host_depth_store_shader = - host_depth_store_shaders[i]; - VkPipeline host_depth_store_pipeline = - ui::vulkan::util::CreateComputePipeline( - provider, host_depth_store_pipeline_layout_, - host_depth_store_shader.first, host_depth_store_shader.second); - if 
(host_depth_store_pipeline == VK_NULL_HANDLE) { + // Host depth storing pipeline layout. + VkDescriptorSetLayout host_depth_store_descriptor_set_layouts[] = { + // Destination EDRAM storage buffer. + descriptor_set_layout_storage_buffer_, + // Source depth / stencil texture (only depth is used). + descriptor_set_layout_sampled_image_x2_, + }; + VkPushConstantRange host_depth_store_push_constant_range; + host_depth_store_push_constant_range.stageFlags = + VK_SHADER_STAGE_COMPUTE_BIT; + host_depth_store_push_constant_range.offset = 0; + host_depth_store_push_constant_range.size = sizeof(HostDepthStoreConstants); + VkPipelineLayoutCreateInfo host_depth_store_pipeline_layout_create_info; + host_depth_store_pipeline_layout_create_info.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + host_depth_store_pipeline_layout_create_info.pNext = nullptr; + host_depth_store_pipeline_layout_create_info.flags = 0; + host_depth_store_pipeline_layout_create_info.setLayoutCount = + uint32_t(xe::countof(host_depth_store_descriptor_set_layouts)); + host_depth_store_pipeline_layout_create_info.pSetLayouts = + host_depth_store_descriptor_set_layouts; + host_depth_store_pipeline_layout_create_info.pushConstantRangeCount = 1; + host_depth_store_pipeline_layout_create_info.pPushConstantRanges = + &host_depth_store_push_constant_range; + if (dfn.vkCreatePipelineLayout( + device, &host_depth_store_pipeline_layout_create_info, nullptr, + &host_depth_store_pipeline_layout_) != VK_SUCCESS) { XELOGE( - "VulkanRenderTargetCache: Failed to create the {}-sample host depth " - "storing pipeline", - uint32_t(1) << i); + "VulkanRenderTargetCache: Failed to create the host depth storing " + "pipeline layout"); Shutdown(); return false; } - host_depth_store_pipelines_[i] = host_depth_store_pipeline; - } - - // Transfer and clear vertex buffer, for quads of up to tile granularity. 
- transfer_vertex_buffer_pool_ = - std::make_unique( - provider, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - std::max(ui::vulkan::VulkanUploadBufferPool::kDefaultPageSize, - sizeof(float) * 2 * 6 * - Transfer::kMaxCutoutBorderRectangles * - xenos::kEdramTileCount)); - - // Transfer vertex shader. - transfer_passthrough_vertex_shader_ = ui::vulkan::util::CreateShaderModule( - provider, shaders::passthrough_position_xy_vs, - sizeof(shaders::passthrough_position_xy_vs)); - if (transfer_passthrough_vertex_shader_ == VK_NULL_HANDLE) { - XELOGE( - "VulkanRenderTargetCache: Failed to create the render target ownership " - "transfer vertex shader"); - Shutdown(); - return false; - } - - // Transfer pipeline layouts. - VkDescriptorSetLayout transfer_pipeline_layout_descriptor_set_layouts - [kTransferUsedDescriptorSetCount]; - VkPushConstantRange transfer_pipeline_layout_push_constant_range; - transfer_pipeline_layout_push_constant_range.stageFlags = - VK_SHADER_STAGE_FRAGMENT_BIT; - transfer_pipeline_layout_push_constant_range.offset = 0; - VkPipelineLayoutCreateInfo transfer_pipeline_layout_create_info; - transfer_pipeline_layout_create_info.sType = - VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - transfer_pipeline_layout_create_info.pNext = nullptr; - transfer_pipeline_layout_create_info.flags = 0; - transfer_pipeline_layout_create_info.pSetLayouts = - transfer_pipeline_layout_descriptor_set_layouts; - transfer_pipeline_layout_create_info.pPushConstantRanges = - &transfer_pipeline_layout_push_constant_range; - for (size_t i = 0; i < size_t(TransferPipelineLayoutIndex::kCount); ++i) { - const TransferPipelineLayoutInfo& transfer_pipeline_layout_info = - kTransferPipelineLayoutInfos[i]; - transfer_pipeline_layout_create_info.setLayoutCount = 0; - uint32_t transfer_pipeline_layout_descriptor_sets_remaining = - transfer_pipeline_layout_info.used_descriptor_sets; - uint32_t transfer_pipeline_layout_descriptor_set_index; - while ( - 
xe::bit_scan_forward(transfer_pipeline_layout_descriptor_sets_remaining, - &transfer_pipeline_layout_descriptor_set_index)) { - transfer_pipeline_layout_descriptor_sets_remaining &= - ~(uint32_t(1) << transfer_pipeline_layout_descriptor_set_index); - VkDescriptorSetLayout transfer_pipeline_layout_descriptor_set_layout = - VK_NULL_HANDLE; - switch (TransferUsedDescriptorSet( - transfer_pipeline_layout_descriptor_set_index)) { - case kTransferUsedDescriptorSetHostDepthBuffer: - transfer_pipeline_layout_descriptor_set_layout = - descriptor_set_layout_storage_buffer_; - break; - case kTransferUsedDescriptorSetHostDepthStencilTextures: - case kTransferUsedDescriptorSetDepthStencilTextures: - transfer_pipeline_layout_descriptor_set_layout = - descriptor_set_layout_sampled_image_x2_; - break; - case kTransferUsedDescriptorSetColorTexture: - transfer_pipeline_layout_descriptor_set_layout = - descriptor_set_layout_sampled_image_; - break; - default: - assert_unhandled_case(TransferUsedDescriptorSet( - transfer_pipeline_layout_descriptor_set_index)); + const std::pair host_depth_store_shaders[] = { + {shaders::host_depth_store_1xmsaa_cs, + sizeof(shaders::host_depth_store_1xmsaa_cs)}, + {shaders::host_depth_store_2xmsaa_cs, + sizeof(shaders::host_depth_store_2xmsaa_cs)}, + {shaders::host_depth_store_4xmsaa_cs, + sizeof(shaders::host_depth_store_4xmsaa_cs)}, + }; + for (size_t i = 0; i < xe::countof(host_depth_store_shaders); ++i) { + const std::pair host_depth_store_shader = + host_depth_store_shaders[i]; + VkPipeline host_depth_store_pipeline = + ui::vulkan::util::CreateComputePipeline( + provider, host_depth_store_pipeline_layout_, + host_depth_store_shader.first, host_depth_store_shader.second); + if (host_depth_store_pipeline == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the {}-sample host " + "depth storing pipeline", + uint32_t(1) << i); + Shutdown(); + return false; } - transfer_pipeline_layout_descriptor_set_layouts - 
[transfer_pipeline_layout_create_info.setLayoutCount++] = - transfer_pipeline_layout_descriptor_set_layout; + host_depth_store_pipelines_[i] = host_depth_store_pipeline; } - transfer_pipeline_layout_push_constant_range.size = uint32_t( - sizeof(uint32_t) * - xe::bit_count(transfer_pipeline_layout_info.used_push_constant_dwords)); - transfer_pipeline_layout_create_info.pushConstantRangeCount = - transfer_pipeline_layout_info.used_push_constant_dwords ? 1 : 0; - if (dfn.vkCreatePipelineLayout( - device, &transfer_pipeline_layout_create_info, nullptr, - &transfer_pipeline_layouts_[i]) != VK_SUCCESS) { + + // Transfer and clear vertex buffer, for quads of up to tile granularity. + transfer_vertex_buffer_pool_ = + std::make_unique( + provider, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + std::max(ui::vulkan::VulkanUploadBufferPool::kDefaultPageSize, + sizeof(float) * 2 * 6 * + Transfer::kMaxCutoutBorderRectangles * + xenos::kEdramTileCount)); + + // Transfer vertex shader. + transfer_passthrough_vertex_shader_ = ui::vulkan::util::CreateShaderModule( + provider, shaders::passthrough_position_xy_vs, + sizeof(shaders::passthrough_position_xy_vs)); + if (transfer_passthrough_vertex_shader_ == VK_NULL_HANDLE) { XELOGE( "VulkanRenderTargetCache: Failed to create the render target " - "ownership transfer pipeline layout {}", - i); + "ownership transfer vertex shader"); Shutdown(); return false; } + + // Transfer pipeline layouts. 
+ VkDescriptorSetLayout transfer_pipeline_layout_descriptor_set_layouts + [kTransferUsedDescriptorSetCount]; + VkPushConstantRange transfer_pipeline_layout_push_constant_range; + transfer_pipeline_layout_push_constant_range.stageFlags = + VK_SHADER_STAGE_FRAGMENT_BIT; + transfer_pipeline_layout_push_constant_range.offset = 0; + VkPipelineLayoutCreateInfo transfer_pipeline_layout_create_info; + transfer_pipeline_layout_create_info.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + transfer_pipeline_layout_create_info.pNext = nullptr; + transfer_pipeline_layout_create_info.flags = 0; + transfer_pipeline_layout_create_info.pSetLayouts = + transfer_pipeline_layout_descriptor_set_layouts; + transfer_pipeline_layout_create_info.pPushConstantRanges = + &transfer_pipeline_layout_push_constant_range; + for (size_t i = 0; i < size_t(TransferPipelineLayoutIndex::kCount); ++i) { + const TransferPipelineLayoutInfo& transfer_pipeline_layout_info = + kTransferPipelineLayoutInfos[i]; + transfer_pipeline_layout_create_info.setLayoutCount = 0; + uint32_t transfer_pipeline_layout_descriptor_sets_remaining = + transfer_pipeline_layout_info.used_descriptor_sets; + uint32_t transfer_pipeline_layout_descriptor_set_index; + while (xe::bit_scan_forward( + transfer_pipeline_layout_descriptor_sets_remaining, + &transfer_pipeline_layout_descriptor_set_index)) { + transfer_pipeline_layout_descriptor_sets_remaining &= + ~(uint32_t(1) << transfer_pipeline_layout_descriptor_set_index); + VkDescriptorSetLayout transfer_pipeline_layout_descriptor_set_layout = + VK_NULL_HANDLE; + switch (TransferUsedDescriptorSet( + transfer_pipeline_layout_descriptor_set_index)) { + case kTransferUsedDescriptorSetHostDepthBuffer: + transfer_pipeline_layout_descriptor_set_layout = + descriptor_set_layout_storage_buffer_; + break; + case kTransferUsedDescriptorSetHostDepthStencilTextures: + case kTransferUsedDescriptorSetDepthStencilTextures: + transfer_pipeline_layout_descriptor_set_layout = + 
descriptor_set_layout_sampled_image_x2_; + break; + case kTransferUsedDescriptorSetColorTexture: + transfer_pipeline_layout_descriptor_set_layout = + descriptor_set_layout_sampled_image_; + break; + default: + assert_unhandled_case(TransferUsedDescriptorSet( + transfer_pipeline_layout_descriptor_set_index)); + } + transfer_pipeline_layout_descriptor_set_layouts + [transfer_pipeline_layout_create_info.setLayoutCount++] = + transfer_pipeline_layout_descriptor_set_layout; + } + transfer_pipeline_layout_push_constant_range.size = uint32_t( + sizeof(uint32_t) * + xe::bit_count( + transfer_pipeline_layout_info.used_push_constant_dwords)); + transfer_pipeline_layout_create_info.pushConstantRangeCount = + transfer_pipeline_layout_info.used_push_constant_dwords ? 1 : 0; + if (dfn.vkCreatePipelineLayout( + device, &transfer_pipeline_layout_create_info, nullptr, + &transfer_pipeline_layouts_[i]) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target " + "ownership transfer pipeline layout {}", + i); + Shutdown(); + return false; + } + } + + // Dump pipeline layouts. 
+ VkDescriptorSetLayout + dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetCount]; + dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetEdram] = + descriptor_set_layout_storage_buffer_; + dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetSource] = + descriptor_set_layout_sampled_image_; + VkPushConstantRange dump_pipeline_layout_push_constant_range; + dump_pipeline_layout_push_constant_range.stageFlags = + VK_SHADER_STAGE_COMPUTE_BIT; + dump_pipeline_layout_push_constant_range.offset = 0; + dump_pipeline_layout_push_constant_range.size = + sizeof(uint32_t) * kDumpPushConstantCount; + VkPipelineLayoutCreateInfo dump_pipeline_layout_create_info; + dump_pipeline_layout_create_info.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + dump_pipeline_layout_create_info.pNext = nullptr; + dump_pipeline_layout_create_info.flags = 0; + dump_pipeline_layout_create_info.setLayoutCount = + uint32_t(xe::countof(dump_pipeline_layout_descriptor_set_layouts)); + dump_pipeline_layout_create_info.pSetLayouts = + dump_pipeline_layout_descriptor_set_layouts; + dump_pipeline_layout_create_info.pushConstantRangeCount = 1; + dump_pipeline_layout_create_info.pPushConstantRanges = + &dump_pipeline_layout_push_constant_range; + if (dfn.vkCreatePipelineLayout(device, &dump_pipeline_layout_create_info, + nullptr, &dump_pipeline_layout_color_) != + VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the color render target " + "dumping pipeline layout"); + Shutdown(); + return false; + } + dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetSource] = + descriptor_set_layout_sampled_image_x2_; + if (dfn.vkCreatePipelineLayout(device, &dump_pipeline_layout_create_info, + nullptr, &dump_pipeline_layout_depth_) != + VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the depth render target " + "dumping pipeline layout"); + Shutdown(); + return false; + } + } else if (path_ == Path::kPixelShaderInterlock) { + // 
Pixel (fragment) shader interlock. + + // Blending is done in linear space directly in shaders. + gamma_render_target_as_srgb_ = false; + + // Always true float24 depth rounded to the nearest even. + depth_float24_round_ = true; + + // The pipeline layout and the pipelines for clearing the EDRAM buffer in + // resolves. + VkPushConstantRange resolve_fsi_clear_push_constant_range; + resolve_fsi_clear_push_constant_range.stageFlags = + VK_SHADER_STAGE_COMPUTE_BIT; + resolve_fsi_clear_push_constant_range.offset = 0; + resolve_fsi_clear_push_constant_range.size = + sizeof(draw_util::ResolveClearShaderConstants); + VkPipelineLayoutCreateInfo resolve_fsi_clear_pipeline_layout_create_info; + resolve_fsi_clear_pipeline_layout_create_info.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + resolve_fsi_clear_pipeline_layout_create_info.pNext = nullptr; + resolve_fsi_clear_pipeline_layout_create_info.flags = 0; + resolve_fsi_clear_pipeline_layout_create_info.setLayoutCount = 1; + resolve_fsi_clear_pipeline_layout_create_info.pSetLayouts = + &descriptor_set_layout_storage_buffer_; + resolve_fsi_clear_pipeline_layout_create_info.pushConstantRangeCount = 1; + resolve_fsi_clear_pipeline_layout_create_info.pPushConstantRanges = + &resolve_fsi_clear_push_constant_range; + if (dfn.vkCreatePipelineLayout( + device, &resolve_fsi_clear_pipeline_layout_create_info, nullptr, + &resolve_fsi_clear_pipeline_layout_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the resolve EDRAM buffer " + "clear pipeline layout"); + Shutdown(); + return false; + } + resolve_fsi_clear_32bpp_pipeline_ = ui::vulkan::util::CreateComputePipeline( + provider, resolve_fsi_clear_pipeline_layout_, + draw_resolution_scaled ? shaders::resolve_clear_32bpp_scaled_cs + : shaders::resolve_clear_32bpp_cs, + draw_resolution_scaled ? 
sizeof(shaders::resolve_clear_32bpp_scaled_cs) + : sizeof(shaders::resolve_clear_32bpp_cs)); + if (resolve_fsi_clear_32bpp_pipeline_ == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the 32bpp resolve EDRAM " + "buffer clear pipeline"); + Shutdown(); + return false; + } + resolve_fsi_clear_64bpp_pipeline_ = ui::vulkan::util::CreateComputePipeline( + provider, resolve_fsi_clear_pipeline_layout_, + draw_resolution_scaled ? shaders::resolve_clear_64bpp_scaled_cs + : shaders::resolve_clear_64bpp_cs, + draw_resolution_scaled ? sizeof(shaders::resolve_clear_64bpp_scaled_cs) + : sizeof(shaders::resolve_clear_64bpp_cs)); + if (resolve_fsi_clear_64bpp_pipeline_ == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the 64bpp resolve EDRAM " + "buffer clear pipeline"); + Shutdown(); + return false; + } + + // Common render pass. + VkSubpassDescription fsi_subpass = {}; + fsi_subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + // Fragment shader interlock provides synchronization and ordering within a + // subpass, create an external by-region dependency to maintain interlocking + // between passes. Framebuffer-global dependencies will be made with + // explicit barriers when the addressing of the EDRAM buffer relatively to + // the fragment coordinates is changed. 
+ VkSubpassDependency fsi_subpass_dependencies[2]; + fsi_subpass_dependencies[0].srcSubpass = VK_SUBPASS_EXTERNAL; + fsi_subpass_dependencies[0].dstSubpass = 0; + fsi_subpass_dependencies[0].srcStageMask = + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + fsi_subpass_dependencies[0].dstStageMask = + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + fsi_subpass_dependencies[0].srcAccessMask = + VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + fsi_subpass_dependencies[0].dstAccessMask = + VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + fsi_subpass_dependencies[0].dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT; + fsi_subpass_dependencies[1] = fsi_subpass_dependencies[0]; + std::swap(fsi_subpass_dependencies[1].srcSubpass, + fsi_subpass_dependencies[1].dstSubpass); + VkRenderPassCreateInfo fsi_render_pass_create_info; + fsi_render_pass_create_info.sType = + VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; + fsi_render_pass_create_info.pNext = nullptr; + fsi_render_pass_create_info.flags = 0; + fsi_render_pass_create_info.attachmentCount = 0; + fsi_render_pass_create_info.pAttachments = nullptr; + fsi_render_pass_create_info.subpassCount = 1; + fsi_render_pass_create_info.pSubpasses = &fsi_subpass; + fsi_render_pass_create_info.dependencyCount = + uint32_t(xe::countof(fsi_subpass_dependencies)); + fsi_render_pass_create_info.pDependencies = fsi_subpass_dependencies; + if (dfn.vkCreateRenderPass(device, &fsi_render_pass_create_info, nullptr, + &fsi_render_pass_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the fragment shader " + "interlock render backend render pass"); + Shutdown(); + return false; + } + + // Common framebuffer. 
+ VkFramebufferCreateInfo fsi_framebuffer_create_info; + fsi_framebuffer_create_info.sType = + VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + fsi_framebuffer_create_info.pNext = nullptr; + fsi_framebuffer_create_info.flags = 0; + fsi_framebuffer_create_info.renderPass = fsi_render_pass_; + fsi_framebuffer_create_info.attachmentCount = 0; + fsi_framebuffer_create_info.pAttachments = nullptr; + fsi_framebuffer_create_info.width = std::min( + xenos::kTexture2DCubeMaxWidthHeight * draw_resolution_scale_x(), + device_limits.maxFramebufferWidth); + fsi_framebuffer_create_info.height = std::min( + xenos::kTexture2DCubeMaxWidthHeight * draw_resolution_scale_y(), + device_limits.maxFramebufferHeight); + fsi_framebuffer_create_info.layers = 1; + if (dfn.vkCreateFramebuffer(device, &fsi_framebuffer_create_info, nullptr, + &fsi_framebuffer_.framebuffer) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the fragment shader " + "interlock render backend framebuffer"); + Shutdown(); + return false; + } + fsi_framebuffer_.host_extent.width = fsi_framebuffer_create_info.width; + fsi_framebuffer_.host_extent.height = fsi_framebuffer_create_info.height; + } else { + assert_unhandled_case(path_); + Shutdown(); + return false; } - // Dump pipeline layouts. 
- VkDescriptorSetLayout - dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetCount]; - dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetEdram] = - descriptor_set_layout_storage_buffer_; - dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetSource] = - descriptor_set_layout_sampled_image_; - VkPushConstantRange dump_pipeline_layout_push_constant_range; - dump_pipeline_layout_push_constant_range.stageFlags = - VK_SHADER_STAGE_COMPUTE_BIT; - dump_pipeline_layout_push_constant_range.offset = 0; - dump_pipeline_layout_push_constant_range.size = - sizeof(uint32_t) * kDumpPushConstantCount; - VkPipelineLayoutCreateInfo dump_pipeline_layout_create_info; - dump_pipeline_layout_create_info.sType = - VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - dump_pipeline_layout_create_info.pNext = nullptr; - dump_pipeline_layout_create_info.flags = 0; - dump_pipeline_layout_create_info.setLayoutCount = - uint32_t(xe::countof(dump_pipeline_layout_descriptor_set_layouts)); - dump_pipeline_layout_create_info.pSetLayouts = - dump_pipeline_layout_descriptor_set_layouts; - dump_pipeline_layout_create_info.pushConstantRangeCount = 1; - dump_pipeline_layout_create_info.pPushConstantRanges = - &dump_pipeline_layout_push_constant_range; - if (dfn.vkCreatePipelineLayout(device, &dump_pipeline_layout_create_info, - nullptr, - &dump_pipeline_layout_color_) != VK_SUCCESS) { - XELOGE( - "VulkanRenderTargetCache: Failed to create the color render target " - "dumping pipeline layout"); - Shutdown(); - return false; - } - dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetSource] = - descriptor_set_layout_sampled_image_x2_; - if (dfn.vkCreatePipelineLayout(device, &dump_pipeline_layout_create_info, - nullptr, - &dump_pipeline_layout_depth_) != VK_SUCCESS) { - XELOGE( - "VulkanRenderTargetCache: Failed to create the depth render target " - "dumping pipeline layout"); - Shutdown(); - return false; - } + // Reset the last update structures, to keep the defaults 
consistent between + // paths regardless of whether the update for the path actually modifies them. + last_update_render_pass_key_ = RenderPassKey(); + last_update_render_pass_ = VK_NULL_HANDLE; + last_update_framebuffer_pitch_tiles_at_32bpp_ = 0; + std::memset(last_update_framebuffer_attachments_, 0, + sizeof(last_update_framebuffer_attachments_)); + last_update_framebuffer_ = VK_NULL_HANDLE; InitializeCommon(); return true; @@ -667,6 +893,18 @@ void VulkanRenderTargetCache::Shutdown(bool from_destructor) { // already too late. DestroyAllRenderTargets(true); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipeline, device, + resolve_fsi_clear_64bpp_pipeline_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipeline, device, + resolve_fsi_clear_32bpp_pipeline_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipelineLayout, device, + resolve_fsi_clear_pipeline_layout_); + + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyFramebuffer, device, + fsi_framebuffer_.framebuffer); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyRenderPass, device, + fsi_render_pass_); + for (const auto& dump_pipeline_pair : dump_pipelines_) { // May be null to prevent recreation attempts. if (dump_pipeline_pair.second != VK_NULL_HANDLE) { @@ -951,25 +1189,81 @@ bool VulkanRenderTargetCache::Resolve(const Memory& memory, bool clear_depth = resolve_info.IsClearingDepth(); bool clear_color = resolve_info.IsClearingColor(); if (clear_depth || clear_color) { - // TODO(Triang3l): Fragment shader interlock path EDRAM buffer clearing. - if (GetPath() == Path::kHostRenderTargets) { - Transfer::Rectangle clear_rectangle; - RenderTarget* clear_render_targets[2]; - // If PrepareHostRenderTargetsResolveClear returns false, may be just an - // empty region (success) or an error - don't care. 
- if (PrepareHostRenderTargetsResolveClear( - resolve_info, clear_rectangle, clear_render_targets[0], - clear_transfers_[0], clear_render_targets[1], - clear_transfers_[1])) { - uint64_t clear_values[2]; - clear_values[0] = resolve_info.rb_depth_clear; - clear_values[1] = resolve_info.rb_color_clear | - (uint64_t(resolve_info.rb_color_clear_lo) << 32); - PerformTransfersAndResolveClears(2, clear_render_targets, - clear_transfers_, clear_values, - &clear_rectangle); - } - cleared = true; + switch (GetPath()) { + case Path::kHostRenderTargets: { + Transfer::Rectangle clear_rectangle; + RenderTarget* clear_render_targets[2]; + // If PrepareHostRenderTargetsResolveClear returns false, may be just an + // empty region (success) or an error - don't care. + if (PrepareHostRenderTargetsResolveClear( + resolve_info, clear_rectangle, clear_render_targets[0], + clear_transfers_[0], clear_render_targets[1], + clear_transfers_[1])) { + uint64_t clear_values[2]; + clear_values[0] = resolve_info.rb_depth_clear; + clear_values[1] = resolve_info.rb_color_clear | + (uint64_t(resolve_info.rb_color_clear_lo) << 32); + PerformTransfersAndResolveClears(2, clear_render_targets, + clear_transfers_, clear_values, + &clear_rectangle); + } + cleared = true; + } break; + case Path::kPixelShaderInterlock: { + UseEdramBuffer(EdramBufferUsage::kComputeWrite); + // Should be safe to only commit once (if was accessed as unordered or + // with fragment shader interlock previously - if there was nothing to + // copy, only to clear, for some reason, for instance), overlap of the + // depth and the color ranges is highly unlikely. 
+ CommitEdramBufferShaderWrites(); + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_COMPUTE, resolve_fsi_clear_pipeline_layout_, + 0, 1, &edram_storage_buffer_descriptor_set_, 0, nullptr); + std::pair clear_group_count = + resolve_info.GetClearShaderGroupCount(draw_resolution_scale_x(), + draw_resolution_scale_y()); + assert_true(clear_group_count.first && clear_group_count.second); + if (clear_depth) { + command_processor_.BindExternalComputePipeline( + resolve_fsi_clear_32bpp_pipeline_); + draw_util::ResolveClearShaderConstants depth_clear_constants; + resolve_info.GetDepthClearShaderConstants(depth_clear_constants); + command_buffer.CmdVkPushConstants( + resolve_fsi_clear_pipeline_layout_, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(depth_clear_constants), &depth_clear_constants); + command_processor_.SubmitBarriers(true); + command_buffer.CmdVkDispatch(clear_group_count.first, + clear_group_count.second, 1); + } + if (clear_color) { + command_processor_.BindExternalComputePipeline( + resolve_info.color_edram_info.format_is_64bpp + ? resolve_fsi_clear_64bpp_pipeline_ + : resolve_fsi_clear_32bpp_pipeline_); + draw_util::ResolveClearShaderConstants color_clear_constants; + resolve_info.GetColorClearShaderConstants(color_clear_constants); + if (clear_depth) { + // Non-RT-specific constants have already been set. 
+ command_buffer.CmdVkPushConstants( + resolve_fsi_clear_pipeline_layout_, VK_SHADER_STAGE_COMPUTE_BIT, + uint32_t(offsetof(draw_util::ResolveClearShaderConstants, + rt_specific)), + sizeof(color_clear_constants.rt_specific), + &color_clear_constants.rt_specific); + } else { + command_buffer.CmdVkPushConstants( + resolve_fsi_clear_pipeline_layout_, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(color_clear_constants), &color_clear_constants); + } + command_processor_.SubmitBarriers(true); + command_buffer.CmdVkDispatch(clear_group_count.first, + clear_group_count.second, 1); + } + MarkEdramBufferModified(); + cleared = true; + } break; + default: + assert_unhandled_case(GetPath()); } } else { cleared = true; @@ -987,128 +1281,161 @@ bool VulkanRenderTargetCache::Update( return false; } - // TODO(Triang3l): All paths (FSI). - - RenderTarget* const* depth_and_color_render_targets = - last_update_accumulated_render_targets(); - - PerformTransfersAndResolveClears(1 + xenos::kMaxColorRenderTargets, - depth_and_color_render_targets, - last_update_transfers()); - auto rb_surface_info = register_file().Get(); - uint32_t render_targets_are_srgb = - gamma_render_target_as_srgb_ - ? last_update_accumulated_color_targets_are_gamma() - : 0; RenderPassKey render_pass_key; + // Needed even with the fragment shader interlock render backend for passing + // the sample count to the pipeline cache. render_pass_key.msaa_samples = rb_surface_info.msaa_samples; - if (depth_and_color_render_targets[0]) { - render_pass_key.depth_and_color_used |= 1 << 0; - render_pass_key.depth_format = - depth_and_color_render_targets[0]->key().GetDepthFormat(); - } - if (depth_and_color_render_targets[1]) { - render_pass_key.depth_and_color_used |= 1 << 1; - render_pass_key.color_0_view_format = - (render_targets_are_srgb & (1 << 0)) - ? 
xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA - : depth_and_color_render_targets[1]->key().GetColorFormat(); - } - if (depth_and_color_render_targets[2]) { - render_pass_key.depth_and_color_used |= 1 << 2; - render_pass_key.color_1_view_format = - (render_targets_are_srgb & (1 << 1)) - ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA - : depth_and_color_render_targets[2]->key().GetColorFormat(); - } - if (depth_and_color_render_targets[3]) { - render_pass_key.depth_and_color_used |= 1 << 3; - render_pass_key.color_2_view_format = - (render_targets_are_srgb & (1 << 2)) - ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA - : depth_and_color_render_targets[3]->key().GetColorFormat(); - } - if (depth_and_color_render_targets[4]) { - render_pass_key.depth_and_color_used |= 1 << 4; - render_pass_key.color_3_view_format = - (render_targets_are_srgb & (1 << 3)) - ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA - : depth_and_color_render_targets[4]->key().GetColorFormat(); - } - const Framebuffer* framebuffer = last_update_framebuffer_; - VkRenderPass render_pass = last_update_render_pass_key_ == render_pass_key - ? last_update_render_pass_ - : VK_NULL_HANDLE; - if (render_pass == VK_NULL_HANDLE) { - render_pass = GetRenderPass(render_pass_key); - if (render_pass == VK_NULL_HANDLE) { + switch (GetPath()) { + case Path::kHostRenderTargets: { + RenderTarget* const* depth_and_color_render_targets = + last_update_accumulated_render_targets(); + + PerformTransfersAndResolveClears(1 + xenos::kMaxColorRenderTargets, + depth_and_color_render_targets, + last_update_transfers()); + + uint32_t render_targets_are_srgb = + gamma_render_target_as_srgb_ + ? 
last_update_accumulated_color_targets_are_gamma() + : 0; + + if (depth_and_color_render_targets[0]) { + render_pass_key.depth_and_color_used |= 1 << 0; + render_pass_key.depth_format = + depth_and_color_render_targets[0]->key().GetDepthFormat(); + } + if (depth_and_color_render_targets[1]) { + render_pass_key.depth_and_color_used |= 1 << 1; + render_pass_key.color_0_view_format = + (render_targets_are_srgb & (1 << 0)) + ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA + : depth_and_color_render_targets[1]->key().GetColorFormat(); + } + if (depth_and_color_render_targets[2]) { + render_pass_key.depth_and_color_used |= 1 << 2; + render_pass_key.color_1_view_format = + (render_targets_are_srgb & (1 << 1)) + ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA + : depth_and_color_render_targets[2]->key().GetColorFormat(); + } + if (depth_and_color_render_targets[3]) { + render_pass_key.depth_and_color_used |= 1 << 3; + render_pass_key.color_2_view_format = + (render_targets_are_srgb & (1 << 2)) + ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA + : depth_and_color_render_targets[3]->key().GetColorFormat(); + } + if (depth_and_color_render_targets[4]) { + render_pass_key.depth_and_color_used |= 1 << 4; + render_pass_key.color_3_view_format = + (render_targets_are_srgb & (1 << 3)) + ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA + : depth_and_color_render_targets[4]->key().GetColorFormat(); + } + + const Framebuffer* framebuffer = last_update_framebuffer_; + VkRenderPass render_pass = last_update_render_pass_key_ == render_pass_key + ? last_update_render_pass_ + : VK_NULL_HANDLE; + if (render_pass == VK_NULL_HANDLE) { + render_pass = GetHostRenderTargetsRenderPass(render_pass_key); + if (render_pass == VK_NULL_HANDLE) { + return false; + } + // Framebuffer for a different render pass needed now. 
+ framebuffer = nullptr; + } + + uint32_t pitch_tiles_at_32bpp = + ((rb_surface_info.surface_pitch << uint32_t( + rb_surface_info.msaa_samples >= xenos::MsaaSamples::k4X)) + + (xenos::kEdramTileWidthSamples - 1)) / + xenos::kEdramTileWidthSamples; + if (framebuffer) { + if (last_update_framebuffer_pitch_tiles_at_32bpp_ != + pitch_tiles_at_32bpp || + std::memcmp(last_update_framebuffer_attachments_, + depth_and_color_render_targets, + sizeof(last_update_framebuffer_attachments_))) { + framebuffer = nullptr; + } + } + if (!framebuffer) { + framebuffer = GetHostRenderTargetsFramebuffer( + render_pass_key, pitch_tiles_at_32bpp, + depth_and_color_render_targets); + if (!framebuffer) { + return false; + } + } + + // Successful update - write the new configuration. + last_update_render_pass_key_ = render_pass_key; + last_update_render_pass_ = render_pass; + last_update_framebuffer_pitch_tiles_at_32bpp_ = pitch_tiles_at_32bpp; + std::memcpy(last_update_framebuffer_attachments_, + depth_and_color_render_targets, + sizeof(last_update_framebuffer_attachments_)); + last_update_framebuffer_ = framebuffer; + + // Transition the used render targets. + for (uint32_t i = 0; i < 1 + xenos::kMaxColorRenderTargets; ++i) { + RenderTarget* rt = depth_and_color_render_targets[i]; + if (!rt) { + continue; + } + auto& vulkan_rt = *static_cast(rt); + VkPipelineStageFlags rt_dst_stage_mask; + VkAccessFlags rt_dst_access_mask; + VkImageLayout rt_new_layout; + VulkanRenderTarget::GetDrawUsage(i == 0, &rt_dst_stage_mask, + &rt_dst_access_mask, &rt_new_layout); + command_processor_.PushImageMemoryBarrier( + vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + i ? 
VK_IMAGE_ASPECT_COLOR_BIT + : (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)), + vulkan_rt.current_stage_mask(), rt_dst_stage_mask, + vulkan_rt.current_access_mask(), rt_dst_access_mask, + vulkan_rt.current_layout(), rt_new_layout); + vulkan_rt.SetUsage(rt_dst_stage_mask, rt_dst_access_mask, + rt_new_layout); + } + } break; + + case Path::kPixelShaderInterlock: { + // For FSI, only the barrier is needed - already scheduled if required. + // But the buffer will be used for FSI drawing now. + UseEdramBuffer(EdramBufferUsage::kFragmentReadWrite); + // Commit preceding unordered (but not FSI) writes like clears as they + // aren't synchronized with FSI accesses. + CommitEdramBufferShaderWrites( + EdramBufferModificationStatus::kViaUnordered); + // TODO(Triang3l): Check if this draw call modifies color or depth / + // stencil, at least coarsely, to prevent useless barriers. + MarkEdramBufferModified( + EdramBufferModificationStatus::kViaFragmentShaderInterlock); + last_update_render_pass_key_ = render_pass_key; + last_update_render_pass_ = fsi_render_pass_; + last_update_framebuffer_ = &fsi_framebuffer_; + } break; + + default: + assert_unhandled_case(GetPath()); return false; - } - // Framebuffer for a different render pass needed now. 
- framebuffer = nullptr; - } - - uint32_t pitch_tiles_at_32bpp = - ((rb_surface_info.surface_pitch - << uint32_t(rb_surface_info.msaa_samples >= xenos::MsaaSamples::k4X)) + - (xenos::kEdramTileWidthSamples - 1)) / - xenos::kEdramTileWidthSamples; - if (framebuffer) { - if (last_update_framebuffer_pitch_tiles_at_32bpp_ != pitch_tiles_at_32bpp || - std::memcmp(last_update_framebuffer_attachments_, - depth_and_color_render_targets, - sizeof(last_update_framebuffer_attachments_))) { - framebuffer = nullptr; - } - } - if (!framebuffer) { - framebuffer = GetFramebuffer(render_pass_key, pitch_tiles_at_32bpp, - depth_and_color_render_targets); - if (!framebuffer) { - return false; - } - } - - // Successful update - write the new configuration. - last_update_render_pass_key_ = render_pass_key; - last_update_render_pass_ = render_pass; - last_update_framebuffer_pitch_tiles_at_32bpp_ = pitch_tiles_at_32bpp; - std::memcpy(last_update_framebuffer_attachments_, - depth_and_color_render_targets, - sizeof(last_update_framebuffer_attachments_)); - last_update_framebuffer_ = framebuffer; - - // Transition the used render targets. - for (uint32_t i = 0; i < 1 + xenos::kMaxColorRenderTargets; ++i) { - RenderTarget* rt = depth_and_color_render_targets[i]; - if (!rt) { - continue; - } - auto& vulkan_rt = *static_cast(rt); - VkPipelineStageFlags rt_dst_stage_mask; - VkAccessFlags rt_dst_access_mask; - VkImageLayout rt_new_layout; - VulkanRenderTarget::GetDrawUsage(i == 0, &rt_dst_stage_mask, - &rt_dst_access_mask, &rt_new_layout); - command_processor_.PushImageMemoryBarrier( - vulkan_rt.image(), - ui::vulkan::util::InitializeSubresourceRange( - i ? 
VK_IMAGE_ASPECT_COLOR_BIT - : (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)), - vulkan_rt.current_stage_mask(), rt_dst_stage_mask, - vulkan_rt.current_access_mask(), rt_dst_access_mask, - vulkan_rt.current_layout(), rt_new_layout); - vulkan_rt.SetUsage(rt_dst_stage_mask, rt_dst_access_mask, rt_new_layout); } return true; } -VkRenderPass VulkanRenderTargetCache::GetRenderPass(RenderPassKey key) { - auto it = render_passes_.find(key.key); +VkRenderPass VulkanRenderTargetCache::GetHostRenderTargetsRenderPass( + RenderPassKey key) { + assert_true(GetPath() == Path::kHostRenderTargets); + + auto it = render_passes_.find(key); if (it != render_passes_.end()) { return it->second; } @@ -1244,10 +1571,10 @@ VkRenderPass VulkanRenderTargetCache::GetRenderPass(RenderPassKey key) { if (dfn.vkCreateRenderPass(device, &render_pass_create_info, nullptr, &render_pass) != VK_SUCCESS) { XELOGE("VulkanRenderTargetCache: Failed to create a render pass"); - render_passes_.emplace(key.key, VK_NULL_HANDLE); + render_passes_.emplace(key, VK_NULL_HANDLE); return VK_NULL_HANDLE; } - render_passes_.emplace(key.key, render_pass); + render_passes_.emplace(key, render_pass); return render_pass; } @@ -1353,15 +1680,17 @@ VulkanRenderTargetCache::VulkanRenderTarget::~VulkanRenderTarget() { } uint32_t VulkanRenderTargetCache::GetMaxRenderTargetWidth() const { - const ui::vulkan::VulkanProvider& provider = - command_processor_.GetVulkanProvider(); - return provider.device_properties().limits.maxFramebufferWidth; + const VkPhysicalDeviceLimits& device_limits = + command_processor_.GetVulkanProvider().device_properties().limits; + return std::min(device_limits.maxFramebufferWidth, + device_limits.maxImageDimension2D); } uint32_t VulkanRenderTargetCache::GetMaxRenderTargetHeight() const { - const ui::vulkan::VulkanProvider& provider = - command_processor_.GetVulkanProvider(); - return provider.device_properties().limits.maxFramebufferHeight; + const VkPhysicalDeviceLimits& device_limits = 
+ command_processor_.GetVulkanProvider().device_properties().limits; + return std::min(device_limits.maxFramebufferHeight, + device_limits.maxImageDimension2D); } RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( @@ -1615,6 +1944,12 @@ bool VulkanRenderTargetCache::IsHostDepthEncodingDifferent( return false; } +void VulkanRenderTargetCache::RequestPixelShaderInterlockBarrier() { + if (edram_buffer_usage_ == EdramBufferUsage::kFragmentReadWrite) { + CommitEdramBufferShaderWrites(); + } +} + void VulkanRenderTargetCache::GetEdramBufferUsageMasks( EdramBufferUsage usage, VkPipelineStageFlags& stage_mask_out, VkAccessFlags& access_mask_out) { @@ -1715,7 +2050,7 @@ void VulkanRenderTargetCache::CommitEdramBufferShaderWrites( } const VulkanRenderTargetCache::Framebuffer* -VulkanRenderTargetCache::GetFramebuffer( +VulkanRenderTargetCache::GetHostRenderTargetsFramebuffer( RenderPassKey render_pass_key, uint32_t pitch_tiles_at_32bpp, const RenderTarget* const* depth_and_color_render_targets) { FramebufferKey key; @@ -1749,8 +2084,10 @@ VulkanRenderTargetCache::GetFramebuffer( command_processor_.GetVulkanProvider(); const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); VkDevice device = provider.device(); + const VkPhysicalDeviceLimits& device_limits = + provider.device_properties().limits; - VkRenderPass render_pass = GetRenderPass(render_pass_key); + VkRenderPass render_pass = GetHostRenderTargetsRenderPass(render_pass_key); if (render_pass == VK_NULL_HANDLE) { return nullptr; } @@ -1789,12 +2126,19 @@ VulkanRenderTargetCache::GetFramebuffer( render_pass_key.msaa_samples); } else { assert_zero(render_pass_key.depth_and_color_used); - host_extent.width = 0; - host_extent.height = 0; + // Still needed for occlusion queries. + host_extent.width = xenos::kTexture2DCubeMaxWidthHeight; + host_extent.height = xenos::kTexture2DCubeMaxWidthHeight; } - // Vulkan requires width and height greater than 0. 
- framebuffer_create_info.width = std::max(host_extent.width, uint32_t(1)); - framebuffer_create_info.height = std::max(host_extent.height, uint32_t(1)); + // Limiting to the device limit for the case of no attachments, for which + // there's no limit imposed by the sizes of the attachments that have been + // created successfully. + host_extent.width = std::min(host_extent.width * draw_resolution_scale_x(), + device_limits.maxFramebufferWidth); + host_extent.height = std::min(host_extent.height * draw_resolution_scale_y(), + device_limits.maxFramebufferHeight); + framebuffer_create_info.width = host_extent.width; + framebuffer_create_info.height = host_extent.height; framebuffer_create_info.layers = 1; VkFramebuffer framebuffer; if (dfn.vkCreateFramebuffer(device, &framebuffer_create_info, nullptr, @@ -4070,7 +4414,8 @@ VkPipeline const* VulkanRenderTargetCache::GetTransferPipelines( : nullptr; } - VkRenderPass render_pass = GetRenderPass(key.render_pass_key); + VkRenderPass render_pass = + GetHostRenderTargetsRenderPass(key.render_pass_key); VkShaderModule fragment_shader_module = GetTransferShader(key.shader_key); if (render_pass == VK_NULL_HANDLE || fragment_shader_module == VK_NULL_HANDLE) { @@ -4643,7 +4988,8 @@ void VulkanRenderTargetCache::PerformTransfersAndResolveClears( dest_rt_key.GetColorFormat(); transfer_render_pass_key.color_rts_use_transfer_formats = 1; } - VkRenderPass transfer_render_pass = GetRenderPass(transfer_render_pass_key); + VkRenderPass transfer_render_pass = + GetHostRenderTargetsRenderPass(transfer_render_pass_key); if (transfer_render_pass == VK_NULL_HANDLE) { continue; } @@ -4651,7 +4997,7 @@ void VulkanRenderTargetCache::PerformTransfersAndResolveClears( transfer_framebuffer_render_targets[1 + xenos::kMaxColorRenderTargets] = {}; transfer_framebuffer_render_targets[dest_rt_key.is_depth ? 
0 : 1] = dest_rt; - const Framebuffer* transfer_framebuffer = GetFramebuffer( + const Framebuffer* transfer_framebuffer = GetHostRenderTargetsFramebuffer( + transfer_render_pass_key, dest_rt_key.pitch_tiles_at_32bpp, + transfer_framebuffer_render_targets); if (!transfer_framebuffer) { continue; } diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.h b/src/xenia/gpu/vulkan/vulkan_render_target_cache.h index 6fa9c6ab0..c5032f82d 100644 --- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.h +++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.h @@ -43,6 +43,10 @@ class VulkanRenderTargetCache final : public RenderTargetCache { // true 4x MSAA passes (framebuffers because render target cache render // targets are different for 2x and 4x guest MSAA, pipelines because the // sample mask will have 2 samples excluded for 2x-as-4x). + // This has effect only on the attachments, but even in cases when there + // are no attachments, it can be used to pass the sample count between + // subsystems, for instance, to specify the desired number of samples to + // use when there are no attachments in pipelines. xenos::MsaaSamples msaa_samples : xenos::kMsaaSamplesBits; // 2 // << 0 is depth, << 1...4 is color. uint32_t depth_and_color_used : 1 + xenos::kMaxColorRenderTargets; // 7 @@ -81,8 +85,9 @@ class VulkanRenderTargetCache final : public RenderTargetCache { static_assert_size(RenderPassKey, sizeof(uint32_t)); struct Framebuffer { - VkFramebuffer framebuffer; - VkExtent2D host_extent; + VkFramebuffer framebuffer = VK_NULL_HANDLE; + VkExtent2D host_extent{}; + Framebuffer() = default; Framebuffer(VkFramebuffer framebuffer, const VkExtent2D& host_extent) : framebuffer(framebuffer), host_extent(host_extent) {} }; @@ -96,15 +101,16 @@ class VulkanRenderTargetCache final : public RenderTargetCache { // Transient descriptor set layouts must be initialized in the command // processor.
- bool Initialize(); + bool Initialize(uint32_t shared_memory_binding_count); void Shutdown(bool from_destructor = false); void ClearCache() override; void CompletedSubmissionUpdated(); void EndSubmission(); - // TODO(Triang3l): Fragment shader interlock. - Path GetPath() const override { return Path::kHostRenderTargets; } + Path GetPath() const override { return path_; } + + VkBuffer edram_buffer() const { return edram_buffer_; } // Performs the resolve to a shared memory area according to the current // register values, and also clears the render targets if needed. Must be in a @@ -161,7 +167,11 @@ class VulkanRenderTargetCache final : public RenderTargetCache { // Returns the render pass object, or VK_NULL_HANDLE if failed to create. // A render pass managed by the render target cache may be ended and resumed // at any time (to allow for things like copying and texture loading). - VkRenderPass GetRenderPass(RenderPassKey key); + VkRenderPass GetHostRenderTargetsRenderPass(RenderPassKey key); + VkRenderPass GetFragmentShaderInterlockRenderPass() const { + assert_true(GetPath() == Path::kPixelShaderInterlock); + return fsi_render_pass_; + } VkFormat GetDepthVulkanFormat(xenos::DepthRenderTargetFormat format) const; VkFormat GetColorVulkanFormat(xenos::ColorRenderTargetFormat format) const; @@ -178,6 +188,8 @@ class VulkanRenderTargetCache final : public RenderTargetCache { bool IsHostDepthEncodingDifferent( xenos::DepthRenderTargetFormat format) const override; + void RequestPixelShaderInterlockBarrier() override; + private: enum class EdramBufferUsage { // There's no need for combined fragment and compute usages. @@ -251,6 +263,8 @@ class VulkanRenderTargetCache final : public RenderTargetCache { VulkanCommandProcessor& command_processor_; TraceWriter& trace_writer_; + Path path_ = Path::kHostRenderTargets; + // Accessible in fragment and compute shaders. 
VkDescriptorSetLayout descriptor_set_layout_storage_buffer_ = VK_NULL_HANDLE; VkDescriptorSetLayout descriptor_set_layout_sampled_image_ = VK_NULL_HANDLE; @@ -276,9 +290,18 @@ class VulkanRenderTargetCache final : public RenderTargetCache { std::array resolve_copy_pipelines_{}; - // RenderPassKey::key -> VkRenderPass. - // VK_NULL_HANDLE if failed to create. - std::unordered_map render_passes_; + // On the fragment shader interlock path, the render pass key is used purely + // for passing parameters to pipeline setup - there's always only one render + // pass. + RenderPassKey last_update_render_pass_key_; + VkRenderPass last_update_render_pass_ = VK_NULL_HANDLE; + // The pitch is not used on the fragment shader interlock path. + uint32_t last_update_framebuffer_pitch_tiles_at_32bpp_ = 0; + // The attachments are not used on the fragment shader interlock path. + const RenderTarget* const* + last_update_framebuffer_attachments_[1 + xenos::kMaxColorRenderTargets] = + {}; + const Framebuffer* last_update_framebuffer_ = VK_NULL_HANDLE; // For host render targets. @@ -809,7 +832,7 @@ class VulkanRenderTargetCache final : public RenderTargetCache { }; // Returns the framebuffer object, or VK_NULL_HANDLE if failed to create. - const Framebuffer* GetFramebuffer( + const Framebuffer* GetHostRenderTargetsFramebuffer( RenderPassKey render_pass_key, uint32_t pitch_tiles_at_32bpp, const RenderTarget* const* depth_and_color_render_targets); @@ -845,17 +868,13 @@ class VulkanRenderTargetCache final : public RenderTargetCache { bool msaa_2x_attachments_supported_ = false; bool msaa_2x_no_attachments_supported_ = false; + // VK_NULL_HANDLE if failed to create. 
+ std::unordered_map + render_passes_; + std::unordered_map framebuffers_; - RenderPassKey last_update_render_pass_key_; - VkRenderPass last_update_render_pass_ = VK_NULL_HANDLE; - uint32_t last_update_framebuffer_pitch_tiles_at_32bpp_ = 0; - const RenderTarget* const* - last_update_framebuffer_attachments_[1 + xenos::kMaxColorRenderTargets] = - {}; - const Framebuffer* last_update_framebuffer_ = VK_NULL_HANDLE; - // Set 0 - EDRAM storage buffer, set 1 - source depth sampled image (and // unused stencil from the transfer descriptor set), HostDepthStoreConstants // passed via push constants. @@ -895,6 +914,15 @@ class VulkanRenderTargetCache final : public RenderTargetCache { // Temporary storage for DumpRenderTargets. std::vector dump_rectangles_; std::vector dump_invocations_; + + // For pixel (fragment) shader interlock. + + VkRenderPass fsi_render_pass_ = VK_NULL_HANDLE; + Framebuffer fsi_framebuffer_; + + VkPipelineLayout resolve_fsi_clear_pipeline_layout_ = VK_NULL_HANDLE; + VkPipeline resolve_fsi_clear_32bpp_pipeline_ = VK_NULL_HANDLE; + VkPipeline resolve_fsi_clear_64bpp_pipeline_ = VK_NULL_HANDLE; }; } // namespace vulkan diff --git a/src/xenia/ui/vulkan/functions/instance_khr_get_physical_device_properties2.inc b/src/xenia/ui/vulkan/functions/instance_khr_get_physical_device_properties2.inc index 45153db06..bdc483c43 100644 --- a/src/xenia/ui/vulkan/functions/instance_khr_get_physical_device_properties2.inc +++ b/src/xenia/ui/vulkan/functions/instance_khr_get_physical_device_properties2.inc @@ -1,5 +1,7 @@ // VK_KHR_get_physical_device_properties2 functions used in Xenia. // Promoted to Vulkan 1.1 core. 
+XE_UI_VULKAN_FUNCTION_PROMOTED(vkGetPhysicalDeviceFeatures2KHR, + vkGetPhysicalDeviceFeatures2) XE_UI_VULKAN_FUNCTION_PROMOTED(vkGetPhysicalDeviceMemoryProperties2KHR, vkGetPhysicalDeviceMemoryProperties2) XE_UI_VULKAN_FUNCTION_PROMOTED(vkGetPhysicalDeviceProperties2KHR, diff --git a/src/xenia/ui/vulkan/vulkan_provider.cc b/src/xenia/ui/vulkan/vulkan_provider.cc index 3a30220fb..a1ffd3e61 100644 --- a/src/xenia/ui/vulkan/vulkan_provider.cc +++ b/src/xenia/ui/vulkan/vulkan_provider.cc @@ -696,6 +696,7 @@ bool VulkanProvider::Initialize() { device_extensions_.khr_shader_float_controls = true; device_extensions_.khr_spirv_1_4 = true; if (device_properties_.apiVersion >= VK_MAKE_API_VERSION(0, 1, 3, 0)) { + device_extensions_.ext_shader_demote_to_helper_invocation = true; device_extensions_.khr_maintenance4 = true; } } @@ -709,6 +710,8 @@ bool VulkanProvider::Initialize() { {"VK_EXT_fragment_shader_interlock", offsetof(DeviceExtensions, ext_fragment_shader_interlock)}, {"VK_EXT_memory_budget", offsetof(DeviceExtensions, ext_memory_budget)}, + {"VK_EXT_shader_demote_to_helper_invocation", + offsetof(DeviceExtensions, ext_shader_demote_to_helper_invocation)}, {"VK_EXT_shader_stencil_export", offsetof(DeviceExtensions, ext_shader_stencil_export)}, {"VK_KHR_bind_memory2", offsetof(DeviceExtensions, khr_bind_memory2)}, @@ -816,6 +819,16 @@ bool VulkanProvider::Initialize() { // Get additional device properties. 
std::memset(&device_float_controls_properties_, 0, sizeof(device_float_controls_properties_)); + device_float_controls_properties_.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR; + std::memset(&device_fragment_shader_interlock_features_, 0, + sizeof(device_fragment_shader_interlock_features_)); + device_fragment_shader_interlock_features_.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADER_INTERLOCK_FEATURES_EXT; + std::memset(&device_shader_demote_to_helper_invocation_features_, 0, + sizeof(device_shader_demote_to_helper_invocation_features_)); + device_shader_demote_to_helper_invocation_features_.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DEMOTE_TO_HELPER_INVOCATION_FEATURES_EXT; if (instance_extensions_.khr_get_physical_device_properties2) { VkPhysicalDeviceProperties2KHR device_properties_2; device_properties_2.sType = @@ -824,8 +837,6 @@ bool VulkanProvider::Initialize() { VkPhysicalDeviceProperties2KHR* device_properties_2_last = &device_properties_2; if (device_extensions_.khr_shader_float_controls) { - device_float_controls_properties_.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR; device_float_controls_properties_.pNext = nullptr; device_properties_2_last->pNext = &device_float_controls_properties_; device_properties_2_last = @@ -836,6 +847,28 @@ bool VulkanProvider::Initialize() { ifn_.vkGetPhysicalDeviceProperties2KHR(physical_device_, &device_properties_2); } + VkPhysicalDeviceFeatures2KHR device_features_2; + device_features_2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR; + device_features_2.pNext = nullptr; + VkPhysicalDeviceFeatures2KHR* device_features_2_last = &device_features_2; + if (device_extensions_.ext_fragment_shader_interlock) { + device_fragment_shader_interlock_features_.pNext = nullptr; + device_features_2_last->pNext = + &device_fragment_shader_interlock_features_; + device_features_2_last = reinterpret_cast( + 
&device_fragment_shader_interlock_features_); + } + if (device_extensions_.ext_shader_demote_to_helper_invocation) { + device_shader_demote_to_helper_invocation_features_.pNext = nullptr; + device_features_2_last->pNext = + &device_shader_demote_to_helper_invocation_features_; + device_features_2_last = reinterpret_cast( + &device_shader_demote_to_helper_invocation_features_); + } + if (device_features_2_last != &device_features_2) { + ifn_.vkGetPhysicalDeviceFeatures2KHR(physical_device_, + &device_features_2); + } } // Create the device. @@ -888,6 +921,21 @@ bool VulkanProvider::Initialize() { device_create_info_last = reinterpret_cast( &device_portability_subset_features_); } + if (device_extensions_.ext_fragment_shader_interlock) { + // TODO(Triang3l): Enable only needed fragment shader interlock features. + device_fragment_shader_interlock_features_.pNext = nullptr; + device_create_info_last->pNext = + &device_fragment_shader_interlock_features_; + device_create_info_last = reinterpret_cast( + &device_fragment_shader_interlock_features_); + } + if (device_extensions_.ext_shader_demote_to_helper_invocation) { + device_shader_demote_to_helper_invocation_features_.pNext = nullptr; + device_create_info_last->pNext = + &device_shader_demote_to_helper_invocation_features_; + device_create_info_last = reinterpret_cast( + &device_shader_demote_to_helper_invocation_features_); + } if (ifn_.vkCreateDevice(physical_device_, &device_create_info, nullptr, &device_) != VK_SUCCESS) { XELOGE("Failed to create a Vulkan device"); @@ -995,8 +1043,30 @@ bool VulkanProvider::Initialize() { XELOGVK("Vulkan device extensions:"); XELOGVK("* VK_EXT_fragment_shader_interlock: {}", device_extensions_.ext_fragment_shader_interlock ? "yes" : "no"); + if (device_extensions_.ext_fragment_shader_interlock) { + XELOGVK( + " * Sample interlock: {}", + device_fragment_shader_interlock_features_.fragmentShaderSampleInterlock + ? 
"yes" + : "no"); + XELOGVK( + " * Pixel interlock: {}", + device_fragment_shader_interlock_features_.fragmentShaderPixelInterlock + ? "yes" + : "no"); + } XELOGVK("* VK_EXT_memory_budget: {}", device_extensions_.ext_memory_budget ? "yes" : "no"); + XELOGVK( + "* VK_EXT_shader_demote_to_helper_invocation: {}", + device_extensions_.ext_shader_demote_to_helper_invocation ? "yes" : "no"); + if (device_extensions_.ext_shader_demote_to_helper_invocation) { + XELOGVK(" * Demote to helper invocation: {}", + device_shader_demote_to_helper_invocation_features_ + .shaderDemoteToHelperInvocation + ? "yes" + : "no"); + } XELOGVK("* VK_EXT_shader_stencil_export: {}", device_extensions_.ext_shader_stencil_export ? "yes" : "no"); XELOGVK("* VK_KHR_bind_memory2: {}", diff --git a/src/xenia/ui/vulkan/vulkan_provider.h b/src/xenia/ui/vulkan/vulkan_provider.h index 8dc83283c..2d499a614 100644 --- a/src/xenia/ui/vulkan/vulkan_provider.h +++ b/src/xenia/ui/vulkan/vulkan_provider.h @@ -133,6 +133,8 @@ class VulkanProvider : public GraphicsProvider { struct DeviceExtensions { bool ext_fragment_shader_interlock; bool ext_memory_budget; + // Core since 1.3.0. + bool ext_shader_demote_to_helper_invocation; bool ext_shader_stencil_export; // Core since 1.1.0. 
bool khr_bind_memory2; @@ -198,6 +200,14 @@ class VulkanProvider : public GraphicsProvider { device_float_controls_properties() const { return device_float_controls_properties_; } + const VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT& + device_fragment_shader_interlock_features() const { + return device_fragment_shader_interlock_features_; + } + const VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT& + device_shader_demote_to_helper_invocation_features() const { + return device_shader_demote_to_helper_invocation_features_; + } struct Queue { VkQueue queue = VK_NULL_HANDLE; @@ -320,6 +330,10 @@ class VulkanProvider : public GraphicsProvider { uint32_t queue_family_graphics_compute_; uint32_t queue_family_sparse_binding_; VkPhysicalDeviceFloatControlsPropertiesKHR device_float_controls_properties_; + VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT + device_fragment_shader_interlock_features_; + VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT + device_shader_demote_to_helper_invocation_features_; VkDevice device_ = VK_NULL_HANDLE; DeviceFunctions dfn_ = {}; diff --git a/tools/shader-playground/Editor.Designer.cs b/tools/shader-playground/Editor.Designer.cs index f57e550e4..dfb971e91 100644 --- a/tools/shader-playground/Editor.Designer.cs +++ b/tools/shader-playground/Editor.Designer.cs @@ -191,9 +191,10 @@ this.translationComboBox.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList; this.translationComboBox.FormattingEnabled = true; this.translationComboBox.Items.AddRange(new object[] { - "DXBC (RTV/DSV RB)", - "DXBC (ROV RB)", - "SPIR-V"}); + "DXBC (render target RB)", + "DXBC (rasterizer-ordered view RB)", + "SPIR-V (framebuffer RB)", + "SPIR-V (fragment shader interlock RB)"}); this.translationComboBox.Location = new System.Drawing.Point(1224, 0); this.translationComboBox.Margin = new System.Windows.Forms.Padding(3, 0, 3, 0); this.translationComboBox.Name = "translationComboBox"; diff --git a/tools/shader-playground/Editor.cs 
b/tools/shader-playground/Editor.cs index 52d1f6a6e..cb0aa7145 100644 --- a/tools/shader-playground/Editor.cs +++ b/tools/shader-playground/Editor.cs @@ -235,6 +235,7 @@ namespace shader_playground { outputType = "dxbctext"; break; case 2: + case 3: outputType = "spirvtext"; break; } @@ -269,8 +270,9 @@ namespace shader_playground { "--vertex_shader_output_type=" + vertexShaderType, "--dxbc_source_map=true", }; - if (translationComboBox.SelectedIndex == 1) { - startArguments.Add("--shader_output_dxbc_rov=true"); + if (translationComboBox.SelectedIndex == 1 || + translationComboBox.SelectedIndex == 3) { + startArguments.Add("--shader_output_pixel_shader_interlock=true"); } startInfo = new ProcessStartInfo(compilerPath_);