diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 129f89fd0..e39418cc8 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -3189,15 +3189,14 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( // flow. reg::RB_COLOR_INFO color_infos[4]; float rt_clamp[4][4]; + // Two UINT32_MAX if no components actually existing in the RT are written. uint32_t rt_keep_masks[4][2]; for (uint32_t i = 0; i < 4; ++i) { auto color_info = regs.Get( reg::RB_COLOR_INFO::rt_register_indices[i]); color_infos[i] = color_info; if (edram_rov_used) { - // Get the mask for keeping previous color's components unmodified, - // or two UINT32_MAX if no colors actually existing in the RT are written. - DxbcShaderTranslator::ROV_GetColorFormatSystemConstants( + RenderTargetCache::GetPSIColorFormatInfo( color_info.color_format, (normalized_color_mask >> (i * 4)) & 0b1111, rt_clamp[i][0], rt_clamp[i][1], rt_clamp[i][2], rt_clamp[i][3], rt_keep_masks[i][0], rt_keep_masks[i][1]); @@ -3506,8 +3505,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( rt_base_dwords_scaled; system_constants_.edram_rt_base_dwords_scaled[i] = rt_base_dwords_scaled; - uint32_t format_flags = DxbcShaderTranslator::ROV_AddColorFormatFlags( - color_info.color_format); + uint32_t format_flags = + RenderTargetCache::AddPSIColorFormatFlags(color_info.color_format); dirty |= system_constants_.edram_rt_format_flags[i] != format_flags; system_constants_.edram_rt_format_flags[i] = format_flags; // Can't do float comparisons here because NaNs would result in always diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 9679e43a2..a75597011 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -267,19 +267,6 @@ class DxbcShaderTranslator : public ShaderTranslator { }; static_assert(kSysFlag_Count <= 32, "Too 
many flags in the system constants"); - // Appended to the format in the format constant. - enum : uint32_t { - // Starting from bit 4 because the format itself needs 4 bits. - kRTFormatFlag_64bpp_Shift = 4, - // Requires clamping of blending sources and factors. - kRTFormatFlag_FixedPointColor_Shift, - kRTFormatFlag_FixedPointAlpha_Shift, - - kRTFormatFlag_64bpp = 1u << kRTFormatFlag_64bpp_Shift, - kRTFormatFlag_FixedPointColor = 1u << kRTFormatFlag_FixedPointColor_Shift, - kRTFormatFlag_FixedPointAlpha = 1u << kRTFormatFlag_FixedPointAlpha_Shift, - }; - // IF SYSTEM CONSTANTS ARE CHANGED OR ADDED, THE FOLLOWING MUST BE UPDATED: // - SystemConstants::Index enum. // - system_constant_rdef_. @@ -383,7 +370,8 @@ class DxbcShaderTranslator : public ShaderTranslator { uint32_t edram_rt_base_dwords_scaled[4]; - // RT format combined with kRTFormatFlags. + // RT format combined with RenderTargetCache::kPSIColorFormatFlag values + // (pass via RenderTargetCache::AddPSIColorFormatFlags). uint32_t edram_rt_format_flags[4]; // Format info - values to clamp the color to before blending or storing. @@ -524,40 +512,6 @@ class DxbcShaderTranslator : public ShaderTranslator { kEdram, }; - // Returns the format with internal flags for passing via the - // edram_rt_format_flags system constant. 
- static constexpr uint32_t ROV_AddColorFormatFlags( - xenos::ColorRenderTargetFormat format) { - uint32_t format_flags = uint32_t(format); - if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16 || - format == xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT || - format == xenos::ColorRenderTargetFormat::k_32_32_FLOAT) { - format_flags |= kRTFormatFlag_64bpp; - } - if (format == xenos::ColorRenderTargetFormat::k_8_8_8_8 || - format == xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA || - format == xenos::ColorRenderTargetFormat::k_2_10_10_10 || - format == xenos::ColorRenderTargetFormat::k_16_16 || - format == xenos::ColorRenderTargetFormat::k_16_16_16_16 || - format == xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10) { - format_flags |= - kRTFormatFlag_FixedPointColor | kRTFormatFlag_FixedPointAlpha; - } else if (format == xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT || - format == xenos::ColorRenderTargetFormat:: - k_2_10_10_10_FLOAT_AS_16_16_16_16) { - format_flags |= kRTFormatFlag_FixedPointAlpha; - } - return format_flags; - } - // Returns the bits that need to be added to the RT flags constant - needs to - // be done externally, not in SetColorFormatConstants, because the flags - // contain other state. - static void ROV_GetColorFormatSystemConstants( - xenos::ColorRenderTargetFormat format, uint32_t write_mask, - float& clamp_rgb_low, float& clamp_alpha_low, float& clamp_rgb_high, - float& clamp_alpha_high, uint32_t& keep_mask_low, - uint32_t& keep_mask_high); - uint64_t GetDefaultVertexShaderModification( uint32_t dynamic_addressable_register_count, Shader::HostVertexShaderType host_vertex_shader_type = @@ -772,6 +726,7 @@ class DxbcShaderTranslator : public ShaderTranslator { // Whether it's possible and worth skipping running the translated shader for // 2x2 quads. 
bool ROV_IsDepthStencilEarly() const { + assert_true(edram_rov_used_); return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() && !current_shader().is_valid_memexport_used(); } diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc index a4d1b3c83..412e003ec 100644 --- a/src/xenia/gpu/dxbc_shader_translator_om.cc +++ b/src/xenia/gpu/dxbc_shader_translator_om.cc @@ -14,139 +14,13 @@ #include "xenia/base/assert.h" #include "xenia/base/math.h" #include "xenia/gpu/draw_util.h" +#include "xenia/gpu/render_target_cache.h" #include "xenia/gpu/texture_cache.h" namespace xe { namespace gpu { using namespace ucode; -void DxbcShaderTranslator::ROV_GetColorFormatSystemConstants( - xenos::ColorRenderTargetFormat format, uint32_t write_mask, - float& clamp_rgb_low, float& clamp_alpha_low, float& clamp_rgb_high, - float& clamp_alpha_high, uint32_t& keep_mask_low, - uint32_t& keep_mask_high) { - keep_mask_low = keep_mask_high = 0; - switch (format) { - case xenos::ColorRenderTargetFormat::k_8_8_8_8: - case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { - clamp_rgb_low = clamp_alpha_low = 0.0f; - clamp_rgb_high = clamp_alpha_high = 1.0f; - for (uint32_t i = 0; i < 4; ++i) { - if (!(write_mask & (1 << i))) { - keep_mask_low |= uint32_t(0xFF) << (i * 8); - } - } - } break; - case xenos::ColorRenderTargetFormat::k_2_10_10_10: - case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { - clamp_rgb_low = clamp_alpha_low = 0.0f; - clamp_rgb_high = clamp_alpha_high = 1.0f; - for (uint32_t i = 0; i < 3; ++i) { - if (!(write_mask & (1 << i))) { - keep_mask_low |= uint32_t(0x3FF) << (i * 10); - } - } - if (!(write_mask & 0b1000)) { - keep_mask_low |= uint32_t(3) << 30; - } - } break; - case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: - case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: { - clamp_rgb_low = clamp_alpha_low = 0.0f; - clamp_rgb_high = 31.875f; - clamp_alpha_high = 1.0f; - for 
(uint32_t i = 0; i < 3; ++i) { - if (!(write_mask & (1 << i))) { - keep_mask_low |= uint32_t(0x3FF) << (i * 10); - } - } - if (!(write_mask & 0b1000)) { - keep_mask_low |= uint32_t(3) << 30; - } - } break; - case xenos::ColorRenderTargetFormat::k_16_16: - case xenos::ColorRenderTargetFormat::k_16_16_16_16: - // Alpha clamping affects blending source, so it's non-zero for alpha for - // k_16_16 (the render target is fixed-point). There's one deviation from - // how Direct3D 11.3 functional specification defines SNorm conversion - // (NaN should be 0, not the lowest negative number), but NaN handling in - // output shouldn't be very important. - clamp_rgb_low = clamp_alpha_low = -32.0f; - clamp_rgb_high = clamp_alpha_high = 32.0f; - if (!(write_mask & 0b0001)) { - keep_mask_low |= 0xFFFFu; - } - if (!(write_mask & 0b0010)) { - keep_mask_low |= 0xFFFF0000u; - } - if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16) { - if (!(write_mask & 0b0100)) { - keep_mask_high |= 0xFFFFu; - } - if (!(write_mask & 0b1000)) { - keep_mask_high |= 0xFFFF0000u; - } - } else { - write_mask &= 0b0011; - } - break; - case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: - case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT: - // No NaNs on the Xbox 360 GPU, though can't use the extended range with - // f32tof16. - clamp_rgb_low = clamp_alpha_low = -65504.0f; - clamp_rgb_high = clamp_alpha_high = 65504.0f; - if (!(write_mask & 0b0001)) { - keep_mask_low |= 0xFFFFu; - } - if (!(write_mask & 0b0010)) { - keep_mask_low |= 0xFFFF0000u; - } - if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT) { - if (!(write_mask & 0b0100)) { - keep_mask_high |= 0xFFFFu; - } - if (!(write_mask & 0b1000)) { - keep_mask_high |= 0xFFFF0000u; - } - } else { - write_mask &= 0b0011; - } - break; - case xenos::ColorRenderTargetFormat::k_32_FLOAT: - // No clamping - let min/max always pick the original value. 
- clamp_rgb_low = clamp_alpha_low = clamp_rgb_high = clamp_alpha_high = - std::nanf(""); - write_mask &= 0b0001; - if (!(write_mask & 0b0001)) { - keep_mask_low = ~uint32_t(0); - } - break; - case xenos::ColorRenderTargetFormat::k_32_32_FLOAT: - // No clamping - let min/max always pick the original value. - clamp_rgb_low = clamp_alpha_low = clamp_rgb_high = clamp_alpha_high = - std::nanf(""); - write_mask &= 0b0011; - if (!(write_mask & 0b0001)) { - keep_mask_low = ~uint32_t(0); - } - if (!(write_mask & 0b0010)) { - keep_mask_high = ~uint32_t(0); - } - break; - default: - assert_unhandled_case(format); - // Disable invalid render targets. - write_mask = 0; - break; - } - // Special case handled in the shaders for empty write mask to completely skip - // a disabled render target: all keep bits are set. - if (!write_mask) { - keep_mask_low = keep_mask_high = ~uint32_t(0); - } -} - void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { bool any_color_targets_written = current_shader().writes_color_targets() != 0; @@ -484,8 +358,8 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { { // Copy the 4x AA coverage to system_temp_rov_params_.x, making top-right // the sample [2] and bottom-left the sample [1] (the opposite of Direct3D - // 12), because on the Xbox 360, 2x MSAA doubles the storage width, 4x MSAA - // doubles the storage height. + // 12), because on the Xbox 360, 2x MSAA doubles the storage height, 4x MSAA + // doubles the storage width. // Flip samples in bits 0:1 to bits 29:30. a_.OpBFRev(dxbc::Dest::R(system_temp_rov_params_, 0b0001), dxbc::Src::VCoverage()); @@ -1304,7 +1178,7 @@ void DxbcShaderTranslator::ROV_UnpackColor( // k_8_8_8_8_GAMMA // *************************************************************************** for (uint32_t i = 0; i < 2; ++i) { - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( i ? 
xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA : xenos::ColorRenderTargetFormat::k_8_8_8_8))); // Unpack the components. @@ -1328,9 +1202,9 @@ void DxbcShaderTranslator::ROV_UnpackColor( // k_2_10_10_10 // k_2_10_10_10_AS_10_10_10_10 // *************************************************************************** - a_.OpCase(dxbc::Src::LU( - ROV_AddColorFormatFlags(xenos::ColorRenderTargetFormat::k_2_10_10_10))); - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10))); + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10))); { // Unpack the components. @@ -1350,9 +1224,9 @@ void DxbcShaderTranslator::ROV_UnpackColor( // k_2_10_10_10_FLOAT_AS_16_16_16_16 // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp // *************************************************************************** - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT))); - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16))); { // Unpack the alpha. @@ -1381,7 +1255,7 @@ void DxbcShaderTranslator::ROV_UnpackColor( // k_16_16_16_16 (64bpp) // *************************************************************************** for (uint32_t i = 0; i < 2; ++i) { - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( i ? 
xenos::ColorRenderTargetFormat::k_16_16_16_16 : xenos::ColorRenderTargetFormat::k_16_16))); dxbc::Dest color_components_dest( @@ -1404,7 +1278,7 @@ void DxbcShaderTranslator::ROV_UnpackColor( // k_16_16_16_16_FLOAT (64bpp) // *************************************************************************** for (uint32_t i = 0; i < 2; ++i) { - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( i ? xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT : xenos::ColorRenderTargetFormat::k_16_16_FLOAT))); dxbc::Dest color_components_dest( @@ -1465,7 +1339,7 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor( // k_8_8_8_8_GAMMA // *************************************************************************** for (uint32_t i = 0; i < 2; ++i) { - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( i ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA : xenos::ColorRenderTargetFormat::k_8_8_8_8))); for (uint32_t j = 0; j < 4; ++j) { @@ -1496,9 +1370,9 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor( // k_2_10_10_10 // k_2_10_10_10_AS_10_10_10_10 // *************************************************************************** - a_.OpCase(dxbc::Src::LU( - ROV_AddColorFormatFlags(xenos::ColorRenderTargetFormat::k_2_10_10_10))); - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10))); + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10))); for (uint32_t i = 0; i < 4; ++i) { // Denormalize and convert to fixed-point. 
@@ -1518,9 +1392,9 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor( // k_2_10_10_10_FLOAT_AS_16_16_16_16 // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp // *************************************************************************** - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT))); - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16))); { // Convert red directly to the destination, which may be the same as the @@ -1550,7 +1424,7 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor( // k_16_16_16_16 (64bpp) // *************************************************************************** for (uint32_t i = 0; i < 2; ++i) { - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( i ? xenos::ColorRenderTargetFormat::k_16_16_16_16 : xenos::ColorRenderTargetFormat::k_16_16))); for (uint32_t j = 0; j < (uint32_t(2) << i); ++j) { @@ -1582,7 +1456,7 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor( // k_16_16_16_16_FLOAT (64bpp) // *************************************************************************** for (uint32_t i = 0; i < 2; ++i) { - a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags( + a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags( i ? xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT : xenos::ColorRenderTargetFormat::k_16_16_FLOAT))); for (uint32_t j = 0; j < (uint32_t(2) << i); ++j) { @@ -2230,7 +2104,8 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Load whether the render target is 64bpp to system_temp_rov_params_.y to // get the needed relative sample address. 
a_.OpAnd(dxbc::Dest::R(system_temp_rov_params_, 0b0010), - rt_format_flags_src, dxbc::Src::LU(kRTFormatFlag_64bpp)); + rt_format_flags_src, + dxbc::Src::LU(RenderTargetCache::kPSIColorFormatFlag_64bpp)); // Choose the relative sample address for the render target to // system_temp_rov_params_.y. a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0010), @@ -2287,7 +2162,8 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Get if the blending source color is fixed-point for clamping if it is. // temp.x = whether color is fixed-point. a_.OpAnd(temp_x_dest, rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_FixedPointColor)); + dxbc::Src::LU( + RenderTargetCache::kPSIColorFormatFlag_FixedPointColor)); // Check if the blending source color is fixed-point and needs clamping. // temp.x = free. a_.OpIf(true, temp_x_src); @@ -2306,7 +2182,8 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Get if the blending source alpha is fixed-point for clamping if it is. // temp.x = whether alpha is fixed-point. a_.OpAnd(temp_x_dest, rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_FixedPointAlpha)); + dxbc::Src::LU( + RenderTargetCache::kPSIColorFormatFlag_FixedPointAlpha)); // Check if the blending source alpha is fixed-point and needs clamping. // temp.x = free. a_.OpIf(true, temp_x_src); @@ -2387,7 +2264,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Get if the format is 64bpp to temp.w. // temp.w = whether the render target is 64bpp. a_.OpAnd(temp_w_dest, rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_64bpp)); + dxbc::Src::LU(RenderTargetCache::kPSIColorFormatFlag_64bpp)); // Check if the format is 64bpp. // temp.w = free. a_.OpIf(true, temp_w_src); @@ -2478,8 +2355,10 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Get if the render target color is fixed-point and the source // color factor needs clamping to temp.x. // temp.x = whether color is fixed-point. 
- a_.OpAnd(temp_x_dest, rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_FixedPointColor)); + a_.OpAnd( + temp_x_dest, rt_format_flags_src, + dxbc::Src::LU( + RenderTargetCache::kPSIColorFormatFlag_FixedPointColor)); // Check if the source color factor needs clamping. a_.OpIf(true, temp_x_src); { @@ -2558,8 +2437,10 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Get if the render target color is fixed-point and the // destination color factor needs clamping to temp.x. // temp.x = whether color is fixed-point. - a_.OpAnd(temp_x_dest, rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_FixedPointColor)); + a_.OpAnd( + temp_x_dest, rt_format_flags_src, + dxbc::Src::LU( + RenderTargetCache::kPSIColorFormatFlag_FixedPointColor)); // Check if the destination color factor needs clamping. a_.OpIf(true, temp_x_src); { @@ -2701,8 +2582,10 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Get if the render target alpha is fixed-point and the source // alpha factor needs clamping to temp.y. // temp.y = whether alpha is fixed-point. - a_.OpAnd(temp_y_dest, rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_FixedPointAlpha)); + a_.OpAnd( + temp_y_dest, rt_format_flags_src, + dxbc::Src::LU( + RenderTargetCache::kPSIColorFormatFlag_FixedPointAlpha)); // Check if the source alpha factor needs clamping. a_.OpIf(true, temp_y_src); { @@ -2769,9 +2652,11 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // destination alpha factor needs clamping. // alpha_is_fixed_temp.x = whether alpha is fixed-point. uint32_t alpha_is_fixed_temp = PushSystemTemp(); - a_.OpAnd(dxbc::Dest::R(alpha_is_fixed_temp, 0b0001), - rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_FixedPointAlpha)); + a_.OpAnd( + dxbc::Dest::R(alpha_is_fixed_temp, 0b0001), + rt_format_flags_src, + dxbc::Src::LU( + RenderTargetCache::kPSIColorFormatFlag_FixedPointAlpha)); // Check if the destination alpha factor needs clamping. 
a_.OpIf(true, dxbc::Src::R(alpha_is_fixed_temp, dxbc::Src::kXXXX)); @@ -2925,7 +2810,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Get if the format is 64bpp to temp.z. // temp.z = whether the render target is 64bpp. a_.OpAnd(temp_z_dest, rt_format_flags_src, - dxbc::Src::LU(kRTFormatFlag_64bpp)); + dxbc::Src::LU(RenderTargetCache::kPSIColorFormatFlag_64bpp)); // Check if the format is 64bpp. // temp.z = free. a_.OpIf(true, temp_z_src); @@ -2954,16 +2839,29 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Close the sample covered check. a_.OpEndIf(); - // Go to the next sample (samples are at +0, +(80*scale_x), +1, - // +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1), - // +(80*scale_x) and -(80*scale_x+1) after each sample). + // Go to the next sample (samples are at +0, +(80*scale_x), +dwpp, + // +(80*scale_x+dwpp), so need to do +(80*scale_x), -(80*scale_x-dwpp), + // +(80*scale_x) and -(80*scale_x+dwpp) after each sample). // Though no need to do this for the last sample as for the next render // target, the address will be recalculated. if (j < 3) { - a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010), - dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), - dxbc::Src::LI((j & 1) ? -int32_t(tile_width) + 2 - j - : int32_t(tile_width))); + if (j & 1) { + // temp.z = whether the render target is 64bpp. + a_.OpAnd(temp_z_dest, rt_format_flags_src, + dxbc::Src::LU(RenderTargetCache::kPSIColorFormatFlag_64bpp)); + // temp.z = offset from the current sample to the next. + a_.OpMovC(temp_z_dest, temp_z_src, + dxbc::Src::LI(-int32_t(tile_width) + 2 * (2 - int32_t(j))), + dxbc::Src::LI(-int32_t(tile_width) + (2 - int32_t(j)))); + // temp.z = free. 
+ a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010), + dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), + temp_z_src); + } else { + a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010), + dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), + dxbc::Src::LU(tile_width)); + } } } @@ -2987,6 +2885,17 @@ void DxbcShaderTranslator::CompletePixelShader() { if (current_shader().writes_color_target(0) && !IsForceEarlyDepthStencilGlobalFlagEnabled()) { + if (edram_rov_used_) { + // Check if the render target 0 was written to on the execution path. + uint32_t rt_0_written_temp = PushSystemTemp(); + a_.OpAnd(dxbc::Dest::R(rt_0_written_temp, 0b0001), + dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX), + dxbc::Src::LU(1 << 8)); + a_.OpIf(true, dxbc::Src::R(rt_0_written_temp, dxbc::Src::kXXXX)); + // Release rt_0_written_temp. + PopSystemTemp(); + } + // Alpha test. // X - mask, then masked result (SGPR for loading, VGPR for masking). // Y - operation result (SGPR for mask operations, VGPR for alpha @@ -3057,10 +2966,15 @@ void DxbcShaderTranslator::CompletePixelShader() { a_.OpEndIf(); // Release alpha_test_temp. PopSystemTemp(); - } - // Discard samples with alpha to coverage. - CompletePixelShader_AlphaToMask(); + // Discard samples with alpha to coverage. + CompletePixelShader_AlphaToMask(); + + if (edram_rov_used_) { + // Close the render target 0 written check. + a_.OpEndIf(); + } + } // Write the values to the render targets. 
Not applying the exponent bias yet // because the original 0 to 1 alpha value is needed for alpha to coverage, diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc index 2695b22d9..4bc882bbc 100644 --- a/src/xenia/gpu/render_target_cache.cc +++ b/src/xenia/gpu/render_target_cache.cc @@ -207,6 +207,134 @@ DEFINE_bool( namespace xe { namespace gpu { +void RenderTargetCache::GetPSIColorFormatInfo( + xenos::ColorRenderTargetFormat format, uint32_t write_mask, + float& clamp_rgb_low, float& clamp_alpha_low, float& clamp_rgb_high, + float& clamp_alpha_high, uint32_t& keep_mask_low, + uint32_t& keep_mask_high) { + keep_mask_low = keep_mask_high = 0; + switch (format) { + case xenos::ColorRenderTargetFormat::k_8_8_8_8: + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { + clamp_rgb_low = clamp_alpha_low = 0.0f; + clamp_rgb_high = clamp_alpha_high = 1.0f; + for (uint32_t i = 0; i < 4; ++i) { + if (!(write_mask & (1 << i))) { + keep_mask_low |= uint32_t(0xFF) << (i * 8); + } + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { + clamp_rgb_low = clamp_alpha_low = 0.0f; + clamp_rgb_high = clamp_alpha_high = 1.0f; + for (uint32_t i = 0; i < 3; ++i) { + if (!(write_mask & (1 << i))) { + keep_mask_low |= uint32_t(0x3FF) << (i * 10); + } + } + if (!(write_mask & 0b1000)) { + keep_mask_low |= uint32_t(3) << 30; + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: { + clamp_rgb_low = clamp_alpha_low = 0.0f; + clamp_rgb_high = 31.875f; + clamp_alpha_high = 1.0f; + for (uint32_t i = 0; i < 3; ++i) { + if (!(write_mask & (1 << i))) { + keep_mask_low |= uint32_t(0x3FF) << (i * 10); + } + } + if (!(write_mask & 0b1000)) { + keep_mask_low |= uint32_t(3) << 30; + } + } break; + case xenos::ColorRenderTargetFormat::k_16_16: + case 
xenos::ColorRenderTargetFormat::k_16_16_16_16: + // Alpha clamping affects blending source, so it's non-zero for alpha for + // k_16_16 (the render target is fixed-point). There's one deviation from + // how Direct3D 11.3 functional specification defines SNorm conversion + // (NaN should be 0, not the lowest negative number), and that needs to be + // handled separately. + clamp_rgb_low = clamp_alpha_low = -32.0f; + clamp_rgb_high = clamp_alpha_high = 32.0f; + if (!(write_mask & 0b0001)) { + keep_mask_low |= 0xFFFFu; + } + if (!(write_mask & 0b0010)) { + keep_mask_low |= 0xFFFF0000u; + } + if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16) { + if (!(write_mask & 0b0100)) { + keep_mask_high |= 0xFFFFu; + } + if (!(write_mask & 0b1000)) { + keep_mask_high |= 0xFFFF0000u; + } + } else { + write_mask &= 0b0011; + } + break; + case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: + case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT: + // No NaNs on the Xbox 360 GPU, though can't use the extended range with + // Direct3D and Vulkan conversions. + // TODO(Triang3l): Use the extended-range encoding in all implementations. + clamp_rgb_low = clamp_alpha_low = -65504.0f; + clamp_rgb_high = clamp_alpha_high = 65504.0f; + if (!(write_mask & 0b0001)) { + keep_mask_low |= 0xFFFFu; + } + if (!(write_mask & 0b0010)) { + keep_mask_low |= 0xFFFF0000u; + } + if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT) { + if (!(write_mask & 0b0100)) { + keep_mask_high |= 0xFFFFu; + } + if (!(write_mask & 0b1000)) { + keep_mask_high |= 0xFFFF0000u; + } + } else { + write_mask &= 0b0011; + } + break; + case xenos::ColorRenderTargetFormat::k_32_FLOAT: + // No clamping - let min/max always pick the original value. 
+ clamp_rgb_low = clamp_alpha_low = clamp_rgb_high = clamp_alpha_high = + std::nanf(""); + write_mask &= 0b0001; + if (!(write_mask & 0b0001)) { + keep_mask_low = ~uint32_t(0); + } + break; + case xenos::ColorRenderTargetFormat::k_32_32_FLOAT: + // No clamping - let min/max always pick the original value. + clamp_rgb_low = clamp_alpha_low = clamp_rgb_high = clamp_alpha_high = + std::nanf(""); + write_mask &= 0b0011; + if (!(write_mask & 0b0001)) { + keep_mask_low = ~uint32_t(0); + } + if (!(write_mask & 0b0010)) { + keep_mask_high = ~uint32_t(0); + } + break; + default: + assert_unhandled_case(format); + // Disable invalid render targets. + write_mask = 0; + break; + } + // Special case handled in the shaders for empty write mask to completely skip + // a disabled render target: all keep bits are set. + if (!write_mask) { + keep_mask_low = keep_mask_high = ~uint32_t(0); + } +} + uint32_t RenderTargetCache::Transfer::GetRangeRectangles( uint32_t start_tiles, uint32_t end_tiles, uint32_t base_tiles, uint32_t pitch_tiles, xenos::MsaaSamples msaa_samples, bool is_64bpp, diff --git a/src/xenia/gpu/render_target_cache.h b/src/xenia/gpu/render_target_cache.h index 84cce18fd..5353176ed 100644 --- a/src/xenia/gpu/render_target_cache.h +++ b/src/xenia/gpu/render_target_cache.h @@ -113,6 +113,54 @@ class RenderTargetCache { kSrgbToLinearExponent); } + // Pixel shader interlock implementation helpers. + + // Appended to the format in the format constant via bitwise OR. + enum : uint32_t { + kPSIColorFormatFlag_64bpp_Shift = xenos::kColorRenderTargetFormatBits, + // Requires clamping of blending sources and factors. 
+ kPSIColorFormatFlag_FixedPointColor_Shift, + kPSIColorFormatFlag_FixedPointAlpha_Shift, + + kPSIColorFormatFlag_64bpp = uint32_t(1) << kPSIColorFormatFlag_64bpp_Shift, + kPSIColorFormatFlag_FixedPointColor = + uint32_t(1) << kPSIColorFormatFlag_FixedPointColor_Shift, + kPSIColorFormatFlag_FixedPointAlpha = + uint32_t(1) << kPSIColorFormatFlag_FixedPointAlpha_Shift, + }; + + static constexpr uint32_t AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat format) { + uint32_t format_flags = uint32_t(format); + if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16 || + format == xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT || + format == xenos::ColorRenderTargetFormat::k_32_32_FLOAT) { + format_flags |= kPSIColorFormatFlag_64bpp; + } + if (format == xenos::ColorRenderTargetFormat::k_8_8_8_8 || + format == xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA || + format == xenos::ColorRenderTargetFormat::k_2_10_10_10 || + format == xenos::ColorRenderTargetFormat::k_16_16 || + format == xenos::ColorRenderTargetFormat::k_16_16_16_16 || + format == xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10) { + format_flags |= kPSIColorFormatFlag_FixedPointColor | + kPSIColorFormatFlag_FixedPointAlpha; + } else if (format == xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT || + format == xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16) { + format_flags |= kPSIColorFormatFlag_FixedPointAlpha; + } + return format_flags; + } + + static void GetPSIColorFormatInfo(xenos::ColorRenderTargetFormat format, + uint32_t write_mask, float& clamp_rgb_low, + float& clamp_alpha_low, + float& clamp_rgb_high, + float& clamp_alpha_high, + uint32_t& keep_mask_low, + uint32_t& keep_mask_high); + virtual ~RenderTargetCache(); virtual Path GetPath() const = 0; diff --git a/src/xenia/gpu/shader_compiler_main.cc b/src/xenia/gpu/shader_compiler_main.cc index ec2e20184..4fdcec736 100644 --- a/src/xenia/gpu/shader_compiler_main.cc +++ 
b/src/xenia/gpu/shader_compiler_main.cc @@ -54,9 +54,11 @@ DEFINE_string( "GPU"); DEFINE_bool(shader_output_bindless_resources, false, "Output host shader with bindless resources used.", "GPU"); -DEFINE_bool(shader_output_dxbc_rov, false, - "Output ROV-based output-merger code in DXBC pixel shaders.", - "GPU"); +DEFINE_bool( + shader_output_pixel_shader_interlock, false, + "Output host shader with a render backend implementation based on pixel " + "shader interlock.", + "GPU"); namespace xe { namespace gpu { @@ -124,12 +126,15 @@ int shader_compiler_main(const std::vector& args) { SpirvShaderTranslator::Features spirv_features(true); if (cvars::shader_output_type == "spirv" || cvars::shader_output_type == "spirvtext") { - translator = std::make_unique(spirv_features); + translator = std::make_unique( + spirv_features, true, true, + cvars::shader_output_pixel_shader_interlock); } else if (cvars::shader_output_type == "dxbc" || cvars::shader_output_type == "dxbctext") { translator = std::make_unique( ui::GraphicsProvider::GpuVendorID(0), - cvars::shader_output_bindless_resources, cvars::shader_output_dxbc_rov); + cvars::shader_output_bindless_resources, + cvars::shader_output_pixel_shader_interlock); } else { // Just output microcode disassembly generated during microcode information // gathering. diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index bb89e0d41..eb31e13b9 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -21,6 +21,7 @@ #include "third_party/glslang/SPIRV/GLSL.std.450.h" #include "xenia/base/assert.h" #include "xenia/base/math.h" +#include "xenia/base/string_buffer.h" #include "xenia/gpu/spirv_shader.h" namespace xe { @@ -31,6 +32,8 @@ SpirvShaderTranslator::Features::Features(bool all) max_storage_buffer_range(all ? 
UINT32_MAX : (128 * 1024 * 1024)), clip_distance(all), cull_distance(all), + demote_to_helper_invocation(all), + fragment_shader_sample_interlock(all), full_draw_index_uint32(all), image_view_format_swizzle(all), signed_zero_inf_nan_preserve_float32(all), @@ -42,6 +45,14 @@ SpirvShaderTranslator::Features::Features( provider.device_properties().limits.maxStorageBufferRange), clip_distance(provider.device_features().shaderClipDistance), cull_distance(provider.device_features().shaderCullDistance), + demote_to_helper_invocation( + provider.device_extensions().ext_shader_demote_to_helper_invocation && + provider.device_shader_demote_to_helper_invocation_features() + .shaderDemoteToHelperInvocation), + fragment_shader_sample_interlock( + provider.device_extensions().ext_fragment_shader_interlock && + provider.device_fragment_shader_interlock_features() + .fragmentShaderSampleInterlock), full_draw_index_uint32(provider.device_features().fullDrawIndexUint32) { uint32_t device_version = provider.device_properties().apiVersion; const ui::vulkan::VulkanProvider::DeviceExtensions& device_extensions = @@ -78,9 +89,6 @@ SpirvShaderTranslator::Features::Features( } } -SpirvShaderTranslator::SpirvShaderTranslator(const Features& features) - : features_(features) {} - uint64_t SpirvShaderTranslator::GetDefaultVertexShaderModification( uint32_t dynamic_addressable_register_count, Shader::HostVertexShaderType host_vertex_shader_type) const { @@ -99,6 +107,19 @@ uint64_t SpirvShaderTranslator::GetDefaultPixelShaderModification( return shader_modification.value; } +std::vector SpirvShaderTranslator::CreateDepthOnlyFragmentShader() { + is_depth_only_fragment_shader_ = true; + // TODO(Triang3l): Handle in a nicer way (is_depth_only_fragment_shader_ is a + // leftover from when a Shader object wasn't used during translation). 
+ Shader shader(xenos::ShaderType::kPixel, 0, nullptr, 0); + StringBuffer instruction_disassembly_buffer; + shader.AnalyzeUcode(instruction_disassembly_buffer); + Shader::Translation& translation = *shader.GetOrCreateTranslation(0); + TranslateAnalyzedShader(translation); + is_depth_only_fragment_shader_ = false; + return translation.translated_binary(); +} + void SpirvShaderTranslator::Reset() { ShaderTranslator::Reset(); @@ -109,6 +130,7 @@ void SpirvShaderTranslator::Reset() { input_point_coordinates_ = spv::NoResult; input_fragment_coordinates_ = spv::NoResult; input_front_facing_ = spv::NoResult; + input_sample_mask_ = spv::NoResult; std::fill(input_output_interpolators_.begin(), input_output_interpolators_.end(), spv::NoResult); output_point_coordinates_ = spv::NoResult; @@ -120,6 +142,8 @@ void SpirvShaderTranslator::Reset() { main_interface_.clear(); var_main_registers_ = spv::NoResult; var_main_point_size_edge_flag_kill_vertex_ = spv::NoResult; + var_main_kill_pixel_ = spv::NoResult; + var_main_fsi_color_written_ = spv::NoResult; main_switch_op_.reset(); main_switch_next_pc_phi_operands_.clear(); @@ -217,6 +241,10 @@ void SpirvShaderTranslator::StartTranslation() { size_t offset; spv::Id type; }; + spv::Id type_float4_array_4 = builder_->makeArrayType( + type_float4_, builder_->makeUintConstant(4), sizeof(float) * 4); + builder_->addDecoration(type_float4_array_4, spv::DecorationArrayStride, + sizeof(float) * 4); spv::Id type_uint4_array_2 = builder_->makeArrayType( type_uint4_, builder_->makeUintConstant(2), sizeof(uint32_t) * 4); builder_->addDecoration(type_uint4_array_2, spv::DecorationArrayStride, @@ -250,8 +278,37 @@ void SpirvShaderTranslator::StartTranslation() { type_uint4_array_4}, {"alpha_test_reference", offsetof(SystemConstants, alpha_test_reference), type_float_}, + {"edram_32bpp_tile_pitch_dwords_scaled", + offsetof(SystemConstants, edram_32bpp_tile_pitch_dwords_scaled), + type_uint_}, + {"edram_depth_base_dwords_scaled", + 
offsetof(SystemConstants, edram_depth_base_dwords_scaled), type_uint_}, {"color_exp_bias", offsetof(SystemConstants, color_exp_bias), type_float4_}, + {"edram_poly_offset_front_scale", + offsetof(SystemConstants, edram_poly_offset_front_scale), type_float_}, + {"edram_poly_offset_back_scale", + offsetof(SystemConstants, edram_poly_offset_back_scale), type_float_}, + {"edram_poly_offset_front_offset", + offsetof(SystemConstants, edram_poly_offset_front_offset), type_float_}, + {"edram_poly_offset_back_offset", + offsetof(SystemConstants, edram_poly_offset_back_offset), type_float_}, + {"edram_stencil_front", offsetof(SystemConstants, edram_stencil_front), + type_uint2_}, + {"edram_stencil_back", offsetof(SystemConstants, edram_stencil_back), + type_uint2_}, + {"edram_rt_base_dwords_scaled", + offsetof(SystemConstants, edram_rt_base_dwords_scaled), type_uint4_}, + {"edram_rt_format_flags", + offsetof(SystemConstants, edram_rt_format_flags), type_uint4_}, + {"edram_rt_blend_factors_ops", + offsetof(SystemConstants, edram_rt_blend_factors_ops), type_uint4_}, + {"edram_rt_keep_mask", offsetof(SystemConstants, edram_rt_keep_mask), + type_uint4_array_2}, + {"edram_rt_clamp", offsetof(SystemConstants, edram_rt_clamp), + type_float4_array_4}, + {"edram_blend_constant", offsetof(SystemConstants, edram_blend_constant), + type_float4_}, }; id_vector_temp_.clear(); id_vector_temp_.reserve(xe::countof(system_constants)); @@ -281,139 +338,145 @@ void SpirvShaderTranslator::StartTranslation() { main_interface_.push_back(uniform_system_constants_); } - // Common uniform buffer - float constants. - uint32_t float_constant_count = - current_shader().constant_register_map().float_count; - if (float_constant_count) { + if (!is_depth_only_fragment_shader_) { + // Common uniform buffer - float constants. 
+ uint32_t float_constant_count = + current_shader().constant_register_map().float_count; + if (float_constant_count) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeArrayType( + type_float4_, builder_->makeUintConstant(float_constant_count), + sizeof(float) * 4)); + // Currently (as of October 24, 2020) makeArrayType only uses the stride + // to check if deduplication can be done - the array stride decoration + // needs to be applied explicitly. + builder_->addDecoration(id_vector_temp_.back(), + spv::DecorationArrayStride, sizeof(float) * 4); + spv::Id type_float_constants = + builder_->makeStructType(id_vector_temp_, "XeFloatConstants"); + builder_->addMemberName(type_float_constants, 0, "float_constants"); + builder_->addMemberDecoration(type_float_constants, 0, + spv::DecorationOffset, 0); + builder_->addDecoration(type_float_constants, spv::DecorationBlock); + uniform_float_constants_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassUniform, type_float_constants, + "xe_uniform_float_constants"); + builder_->addDecoration(uniform_float_constants_, + spv::DecorationDescriptorSet, + int(kDescriptorSetConstants)); + builder_->addDecoration( + uniform_float_constants_, spv::DecorationBinding, + int(is_pixel_shader() ? kConstantBufferFloatPixel + : kConstantBufferFloatVertex)); + if (features_.spirv_version >= spv::Spv_1_4) { + main_interface_.push_back(uniform_float_constants_); + } + } + + // Common uniform buffer - bool and loop constants. + // Uniform buffers must have std140 packing, so using arrays of 4-component + // vectors instead of scalar arrays because the latter would have padding to + // 16 bytes in each element. id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + // 256 bool constants. 
id_vector_temp_.push_back(builder_->makeArrayType( - type_float4_, builder_->makeUintConstant(float_constant_count), - sizeof(float) * 4)); - // Currently (as of October 24, 2020) makeArrayType only uses the stride to - // check if deduplication can be done - the array stride decoration needs to - // be applied explicitly. + type_uint4_, builder_->makeUintConstant(2), sizeof(uint32_t) * 4)); builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, - sizeof(float) * 4); - spv::Id type_float_constants = - builder_->makeStructType(id_vector_temp_, "XeFloatConstants"); - builder_->addMemberName(type_float_constants, 0, "float_constants"); - builder_->addMemberDecoration(type_float_constants, 0, + sizeof(uint32_t) * 4); + // 32 loop constants. + id_vector_temp_.push_back(builder_->makeArrayType( + type_uint4_, builder_->makeUintConstant(8), sizeof(uint32_t) * 4)); + builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, + sizeof(uint32_t) * 4); + spv::Id type_bool_loop_constants = + builder_->makeStructType(id_vector_temp_, "XeBoolLoopConstants"); + builder_->addMemberName(type_bool_loop_constants, 0, "bool_constants"); + builder_->addMemberDecoration(type_bool_loop_constants, 0, spv::DecorationOffset, 0); - builder_->addDecoration(type_float_constants, spv::DecorationBlock); - uniform_float_constants_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassUniform, type_float_constants, - "xe_uniform_float_constants"); - builder_->addDecoration(uniform_float_constants_, + builder_->addMemberName(type_bool_loop_constants, 1, "loop_constants"); + builder_->addMemberDecoration(type_bool_loop_constants, 1, + spv::DecorationOffset, sizeof(uint32_t) * 8); + builder_->addDecoration(type_bool_loop_constants, spv::DecorationBlock); + uniform_bool_loop_constants_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassUniform, type_bool_loop_constants, + "xe_uniform_bool_loop_constants"); + 
builder_->addDecoration(uniform_bool_loop_constants_, spv::DecorationDescriptorSet, int(kDescriptorSetConstants)); - builder_->addDecoration( - uniform_float_constants_, spv::DecorationBinding, - int(is_pixel_shader() ? kConstantBufferFloatPixel - : kConstantBufferFloatVertex)); + builder_->addDecoration(uniform_bool_loop_constants_, + spv::DecorationBinding, + int(kConstantBufferBoolLoop)); if (features_.spirv_version >= spv::Spv_1_4) { - main_interface_.push_back(uniform_float_constants_); + main_interface_.push_back(uniform_bool_loop_constants_); } - } - // Common uniform buffer - bool and loop constants. - // Uniform buffers must have std140 packing, so using arrays of 4-component - // vectors instead of scalar arrays because the latter would have padding to - // 16 bytes in each element. - id_vector_temp_.clear(); - id_vector_temp_.reserve(2); - // 256 bool constants. - id_vector_temp_.push_back(builder_->makeArrayType( - type_uint4_, builder_->makeUintConstant(2), sizeof(uint32_t) * 4)); - builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, - sizeof(uint32_t) * 4); - // 32 loop constants. 
- id_vector_temp_.push_back(builder_->makeArrayType( - type_uint4_, builder_->makeUintConstant(8), sizeof(uint32_t) * 4)); - builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, - sizeof(uint32_t) * 4); - spv::Id type_bool_loop_constants = - builder_->makeStructType(id_vector_temp_, "XeBoolLoopConstants"); - builder_->addMemberName(type_bool_loop_constants, 0, "bool_constants"); - builder_->addMemberDecoration(type_bool_loop_constants, 0, - spv::DecorationOffset, 0); - builder_->addMemberName(type_bool_loop_constants, 1, "loop_constants"); - builder_->addMemberDecoration(type_bool_loop_constants, 1, - spv::DecorationOffset, sizeof(uint32_t) * 8); - builder_->addDecoration(type_bool_loop_constants, spv::DecorationBlock); - uniform_bool_loop_constants_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassUniform, type_bool_loop_constants, - "xe_uniform_bool_loop_constants"); - builder_->addDecoration(uniform_bool_loop_constants_, - spv::DecorationDescriptorSet, - int(kDescriptorSetConstants)); - builder_->addDecoration(uniform_bool_loop_constants_, spv::DecorationBinding, - int(kConstantBufferBoolLoop)); - if (features_.spirv_version >= spv::Spv_1_4) { - main_interface_.push_back(uniform_bool_loop_constants_); - } + // Common uniform buffer - fetch constants (32 x 6 uints packed in std140 as + // 4-component vectors). 
+ id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeArrayType( + type_uint4_, builder_->makeUintConstant(32 * 6 / 4), + sizeof(uint32_t) * 4)); + builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, + sizeof(uint32_t) * 4); + spv::Id type_fetch_constants = + builder_->makeStructType(id_vector_temp_, "XeFetchConstants"); + builder_->addMemberName(type_fetch_constants, 0, "fetch_constants"); + builder_->addMemberDecoration(type_fetch_constants, 0, + spv::DecorationOffset, 0); + builder_->addDecoration(type_fetch_constants, spv::DecorationBlock); + uniform_fetch_constants_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassUniform, type_fetch_constants, + "xe_uniform_fetch_constants"); + builder_->addDecoration(uniform_fetch_constants_, + spv::DecorationDescriptorSet, + int(kDescriptorSetConstants)); + builder_->addDecoration(uniform_fetch_constants_, spv::DecorationBinding, + int(kConstantBufferFetch)); + if (features_.spirv_version >= spv::Spv_1_4) { + main_interface_.push_back(uniform_fetch_constants_); + } - // Common uniform buffer - fetch constants (32 x 6 uints packed in std140 as - // 4-component vectors). 
- id_vector_temp_.clear(); - id_vector_temp_.push_back(builder_->makeArrayType( - type_uint4_, builder_->makeUintConstant(32 * 6 / 4), - sizeof(uint32_t) * 4)); - builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, - sizeof(uint32_t) * 4); - spv::Id type_fetch_constants = - builder_->makeStructType(id_vector_temp_, "XeFetchConstants"); - builder_->addMemberName(type_fetch_constants, 0, "fetch_constants"); - builder_->addMemberDecoration(type_fetch_constants, 0, spv::DecorationOffset, - 0); - builder_->addDecoration(type_fetch_constants, spv::DecorationBlock); - uniform_fetch_constants_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassUniform, type_fetch_constants, - "xe_uniform_fetch_constants"); - builder_->addDecoration(uniform_fetch_constants_, - spv::DecorationDescriptorSet, - int(kDescriptorSetConstants)); - builder_->addDecoration(uniform_fetch_constants_, spv::DecorationBinding, - int(kConstantBufferFetch)); - if (features_.spirv_version >= spv::Spv_1_4) { - main_interface_.push_back(uniform_fetch_constants_); - } - - // Common storage buffers - shared memory uint[], each 128 MB or larger, - // depending on what's possible on the device. - id_vector_temp_.clear(); - id_vector_temp_.push_back(builder_->makeRuntimeArray(type_uint_)); - // Storage buffers have std430 packing, no padding to 4-component vectors. - builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, - sizeof(uint32_t)); - spv::Id type_shared_memory = - builder_->makeStructType(id_vector_temp_, "XeSharedMemory"); - builder_->addMemberName(type_shared_memory, 0, "shared_memory"); - // TODO(Triang3l): Make writable when memexport is implemented. - builder_->addMemberDecoration(type_shared_memory, 0, - spv::DecorationNonWritable); - builder_->addMemberDecoration(type_shared_memory, 0, spv::DecorationOffset, - 0); - builder_->addDecoration(type_shared_memory, - features_.spirv_version >= spv::Spv_1_3 - ? 
spv::DecorationBlock - : spv::DecorationBufferBlock); - unsigned int shared_memory_binding_count = - 1 << GetSharedMemoryStorageBufferCountLog2(); - if (shared_memory_binding_count > 1) { - type_shared_memory = builder_->makeArrayType( - type_shared_memory, - builder_->makeUintConstant(shared_memory_binding_count), 0); - } - buffers_shared_memory_ = builder_->createVariable( - spv::NoPrecision, - features_.spirv_version >= spv::Spv_1_3 ? spv::StorageClassStorageBuffer - : spv::StorageClassUniform, - type_shared_memory, "xe_shared_memory"); - builder_->addDecoration(buffers_shared_memory_, spv::DecorationDescriptorSet, - int(kDescriptorSetSharedMemoryAndEdram)); - builder_->addDecoration(buffers_shared_memory_, spv::DecorationBinding, 0); - if (features_.spirv_version >= spv::Spv_1_4) { - main_interface_.push_back(buffers_shared_memory_); + // Common storage buffers - shared memory uint[], each 128 MB or larger, + // depending on what's possible on the device. + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeRuntimeArray(type_uint_)); + // Storage buffers have std430 packing, no padding to 4-component vectors. + builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, + sizeof(uint32_t)); + spv::Id type_shared_memory = + builder_->makeStructType(id_vector_temp_, "XeSharedMemory"); + builder_->addMemberName(type_shared_memory, 0, "shared_memory"); + builder_->addMemberDecoration(type_shared_memory, 0, + spv::DecorationRestrict); + // TODO(Triang3l): Make writable when memexport is implemented. + builder_->addMemberDecoration(type_shared_memory, 0, + spv::DecorationNonWritable); + builder_->addMemberDecoration(type_shared_memory, 0, spv::DecorationOffset, + 0); + builder_->addDecoration(type_shared_memory, + features_.spirv_version >= spv::Spv_1_3 + ? 
spv::DecorationBlock + : spv::DecorationBufferBlock); + unsigned int shared_memory_binding_count = + 1 << GetSharedMemoryStorageBufferCountLog2(); + if (shared_memory_binding_count > 1) { + type_shared_memory = builder_->makeArrayType( + type_shared_memory, + builder_->makeUintConstant(shared_memory_binding_count), 0); + } + buffers_shared_memory_ = builder_->createVariable( + spv::NoPrecision, + features_.spirv_version >= spv::Spv_1_3 ? spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + type_shared_memory, "xe_shared_memory"); + builder_->addDecoration(buffers_shared_memory_, + spv::DecorationDescriptorSet, + int(kDescriptorSetSharedMemoryAndEdram)); + builder_->addDecoration(buffers_shared_memory_, spv::DecorationBinding, 0); + if (features_.spirv_version >= spv::Spv_1_4) { + main_interface_.push_back(buffers_shared_memory_); + } } if (is_vertex_shader()) { @@ -438,41 +501,43 @@ void SpirvShaderTranslator::StartTranslation() { uniform_system_constants_, id_vector_temp_), spv::NoPrecision); - // Begin ucode translation. Initialize everything, even without defined - // defaults, for safety. 
- var_main_predicate_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_bool_, - "xe_var_predicate", builder_->makeBoolConstant(false)); - var_main_loop_count_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_uint4_, - "xe_var_loop_count", const_uint4_0_); - var_main_address_register_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_int_, - "xe_var_address_register", const_int_0_); - var_main_loop_address_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_int4_, - "xe_var_loop_address", const_int4_0_); - var_main_previous_scalar_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_float_, - "xe_var_previous_scalar", const_float_0_); - var_main_vfetch_address_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_int_, - "xe_var_vfetch_address", const_int_0_); - var_main_tfetch_lod_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_float_, - "xe_var_tfetch_lod", const_float_0_); - var_main_tfetch_gradients_h_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_float3_, - "xe_var_tfetch_gradients_h", const_float3_0_); - var_main_tfetch_gradients_v_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_float3_, - "xe_var_tfetch_gradients_v", const_float3_0_); - if (register_count()) { - spv::Id type_register_array = builder_->makeArrayType( - type_float4_, builder_->makeUintConstant(register_count()), 0); - var_main_registers_ = - builder_->createVariable(spv::NoPrecision, spv::StorageClassFunction, - type_register_array, "xe_var_registers"); + if (!is_depth_only_fragment_shader_) { + // Begin ucode translation. Initialize everything, even without defined + // defaults, for safety. 
+ var_main_predicate_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_bool_, + "xe_var_predicate", builder_->makeBoolConstant(false)); + var_main_loop_count_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_uint4_, + "xe_var_loop_count", const_uint4_0_); + var_main_address_register_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_int_, + "xe_var_address_register", const_int_0_); + var_main_loop_address_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_int4_, + "xe_var_loop_address", const_int4_0_); + var_main_previous_scalar_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float_, + "xe_var_previous_scalar", const_float_0_); + var_main_vfetch_address_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_int_, + "xe_var_vfetch_address", const_int_0_); + var_main_tfetch_lod_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float_, + "xe_var_tfetch_lod", const_float_0_); + var_main_tfetch_gradients_h_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float3_, + "xe_var_tfetch_gradients_h", const_float3_0_); + var_main_tfetch_gradients_v_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float3_, + "xe_var_tfetch_gradients_v", const_float3_0_); + if (register_count()) { + spv::Id type_register_array = builder_->makeArrayType( + type_float4_, builder_->makeUintConstant(register_count()), 0); + var_main_registers_ = + builder_->createVariable(spv::NoPrecision, spv::StorageClassFunction, + type_register_array, "xe_var_registers"); + } } // Write the execution model-specific prologue with access to variables in the @@ -483,6 +548,10 @@ void SpirvShaderTranslator::StartTranslation() { StartFragmentShaderInMain(); } + if (is_depth_only_fragment_shader_) { + return; + } + // Open the main loop. 
spv::Block& main_loop_pre_header = *builder_->getBuildPoint(); main_loop_header_ = &builder_->makeNewBlock(); @@ -551,57 +620,62 @@ void SpirvShaderTranslator::StartTranslation() { } std::vector SpirvShaderTranslator::CompleteTranslation() { - // Close flow control within the last switch case. - CloseExecConditionals(); - bool has_main_switch = !current_shader().label_addresses().empty(); - // After the final exec (if it happened to be not exece, which would already - // have a break branch), break from the switch if it exists, or from the - // loop it doesn't. - if (!builder_->getBuildPoint()->isTerminated()) { - builder_->createBranch(has_main_switch ? main_switch_merge_ - : main_loop_merge_); - } - if (has_main_switch) { - // Insert the switch instruction with all cases added as operands. - builder_->setBuildPoint(main_switch_header_); - builder_->getBuildPoint()->addInstruction(std::move(main_switch_op_)); - // Build the main switch merge, breaking out of the loop after falling - // through the end or breaking from exece (only continuing if a jump - from - // a guest loop or from jmp/call - was made). - function_main_->addBlock(main_switch_merge_); - builder_->setBuildPoint(main_switch_merge_); - builder_->createBranch(main_loop_merge_); - } - - // Main loop continuation - choose the program counter based on the path - // taken (-1 if not from a jump as a safe fallback, which would result in not - // hitting any switch case and reaching the final break in the body). - function_main_->addBlock(main_loop_continue_); - builder_->setBuildPoint(main_loop_continue_); - if (has_main_switch) { - // OpPhi, if added, must be the first in the block. - // If labels were added, but not jumps (for example, due to the call - // instruction not being implemented as of October 18, 2020), send an - // impossible program counter value (-1) to the OpPhi at the next iteration. 
- if (main_switch_next_pc_phi_operands_.empty()) { - main_switch_next_pc_phi_operands_.push_back( - builder_->makeIntConstant(-1)); + if (!is_depth_only_fragment_shader_) { + // Close flow control within the last switch case. + CloseExecConditionals(); + bool has_main_switch = !current_shader().label_addresses().empty(); + // After the final exec (if it happened to be not exece, which would already + // have a break branch), break from the switch if it exists, or from the + // loop it doesn't. + if (!builder_->getBuildPoint()->isTerminated()) { + builder_->createBranch(has_main_switch ? main_switch_merge_ + : main_loop_merge_); } - std::unique_ptr main_loop_pc_next_op = - std::make_unique( - main_loop_pc_next_, type_int_, - main_switch_next_pc_phi_operands_.size() >= 2 ? spv::OpPhi - : spv::OpCopyObject); - for (spv::Id operand : main_switch_next_pc_phi_operands_) { - main_loop_pc_next_op->addIdOperand(operand); + if (has_main_switch) { + // Insert the switch instruction with all cases added as operands. + builder_->setBuildPoint(main_switch_header_); + builder_->getBuildPoint()->addInstruction(std::move(main_switch_op_)); + // Build the main switch merge, breaking out of the loop after falling + // through the end or breaking from exece (only continuing if a jump - + // from a guest loop or from jmp/call - was made). + function_main_->addBlock(main_switch_merge_); + builder_->setBuildPoint(main_switch_merge_); + builder_->createBranch(main_loop_merge_); } - builder_->getBuildPoint()->addInstruction(std::move(main_loop_pc_next_op)); - } - builder_->createBranch(main_loop_header_); - // Add the main loop merge block and go back to the function. - function_main_->addBlock(main_loop_merge_); - builder_->setBuildPoint(main_loop_merge_); + // Main loop continuation - choose the program counter based on the path + // taken (-1 if not from a jump as a safe fallback, which would result in + // not hitting any switch case and reaching the final break in the body). 
+ function_main_->addBlock(main_loop_continue_); + builder_->setBuildPoint(main_loop_continue_); + if (has_main_switch) { + // OpPhi, if added, must be the first in the block. + // If labels were added, but not jumps (for example, due to the call + // instruction not being implemented as of October 18, 2020), send an + // impossible program counter value (-1) to the OpPhi at the next + // iteration. + if (main_switch_next_pc_phi_operands_.empty()) { + main_switch_next_pc_phi_operands_.push_back( + builder_->makeIntConstant(-1)); + } + std::unique_ptr main_loop_pc_next_op = + std::make_unique( + main_loop_pc_next_, type_int_, + main_switch_next_pc_phi_operands_.size() >= 2 + ? spv::OpPhi + : spv::OpCopyObject); + for (spv::Id operand : main_switch_next_pc_phi_operands_) { + main_loop_pc_next_op->addIdOperand(operand); + } + builder_->getBuildPoint()->addInstruction( + std::move(main_loop_pc_next_op)); + } + builder_->createBranch(main_loop_header_); + + // Add the main loop merge block and go back to the function. + function_main_->addBlock(main_loop_merge_); + builder_->setBuildPoint(main_loop_merge_); + } if (is_vertex_shader()) { CompleteVertexOrTessEvalShaderInMain(); @@ -622,6 +696,20 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { builder_->addExecutionMode(function_main_, spv::ExecutionModeEarlyFragmentTests); } + if (edram_fragment_shader_interlock_) { + // Accessing per-sample values, so interlocking just when there's common + // coverage is enough if the device exposes that. 
+ if (features_.fragment_shader_sample_interlock) { + builder_->addCapability( + spv::CapabilityFragmentShaderSampleInterlockEXT); + builder_->addExecutionMode(function_main_, + spv::ExecutionModeSampleInterlockOrderedEXT); + } else { + builder_->addCapability(spv::CapabilityFragmentShaderPixelInterlockEXT); + builder_->addExecutionMode(function_main_, + spv::ExecutionModePixelInterlockOrderedEXT); + } + } } else { assert_true(is_vertex_shader()); execution_model = IsSpirvTessEvalShader() @@ -649,14 +737,17 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { entry_point->addIdOperand(interface_id); } - // Specify the binding indices for samplers when the number of textures is - // known, as samplers are located after images in the texture descriptor set. - size_t texture_binding_count = texture_bindings_.size(); - size_t sampler_binding_count = sampler_bindings_.size(); - for (size_t i = 0; i < sampler_binding_count; ++i) { - builder_->addDecoration(sampler_bindings_[i].variable, - spv::DecorationBinding, - int(texture_binding_count + i)); + if (!is_depth_only_fragment_shader_) { + // Specify the binding indices for samplers when the number of textures is + // known, as samplers are located after images in the texture descriptor + // set. + size_t texture_binding_count = texture_bindings_.size(); + size_t sampler_binding_count = sampler_bindings_.size(); + for (size_t i = 0; i < sampler_binding_count; ++i) { + builder_->addDecoration(sampler_bindings_[i].variable, + spv::DecorationBinding, + int(texture_binding_count + i)); + } } // TODO(Triang3l): Avoid copy? @@ -1682,49 +1773,83 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() { void SpirvShaderTranslator::StartFragmentShaderBeforeMain() { Modification shader_modification = GetSpirvShaderModification(); - uint32_t input_location = 0; + if (edram_fragment_shader_interlock_) { + builder_->addExtension("SPV_EXT_fragment_shader_interlock"); - // Interpolator inputs. 
- { - uint32_t interpolators_remaining = GetModificationInterpolatorMask(); - uint32_t interpolator_index; - while (xe::bit_scan_forward(interpolators_remaining, &interpolator_index)) { - interpolators_remaining &= ~(UINT32_C(1) << interpolator_index); - spv::Id interpolator = builder_->createVariable( - spv::NoPrecision, spv::StorageClassInput, type_float4_, - fmt::format("xe_in_interpolator_{}", interpolator_index).c_str()); - input_output_interpolators_[interpolator_index] = interpolator; - builder_->addDecoration(interpolator, spv::DecorationLocation, - int(input_location)); - if (shader_modification.pixel.interpolators_centroid & - (UINT32_C(1) << interpolator_index)) { - builder_->addDecoration(interpolator, spv::DecorationCentroid); + // EDRAM buffer uint[]. + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeRuntimeArray(type_uint_)); + // Storage buffers have std430 packing, no padding to 4-component vectors. + builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride, + sizeof(uint32_t)); + spv::Id type_edram = builder_->makeStructType(id_vector_temp_, "XeEdram"); + builder_->addMemberName(type_edram, 0, "edram"); + builder_->addMemberDecoration(type_edram, 0, spv::DecorationCoherent); + builder_->addMemberDecoration(type_edram, 0, spv::DecorationRestrict); + builder_->addMemberDecoration(type_edram, 0, spv::DecorationOffset, 0); + builder_->addDecoration(type_edram, features_.spirv_version >= spv::Spv_1_3 + ? spv::DecorationBlock + : spv::DecorationBufferBlock); + buffer_edram_ = builder_->createVariable( + spv::NoPrecision, + features_.spirv_version >= spv::Spv_1_3 ? 
spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + type_edram, "xe_edram"); + builder_->addDecoration(buffer_edram_, spv::DecorationDescriptorSet, + int(kDescriptorSetSharedMemoryAndEdram)); + builder_->addDecoration(buffer_edram_, spv::DecorationBinding, 1); + if (features_.spirv_version >= spv::Spv_1_4) { + main_interface_.push_back(buffer_edram_); + } + } + + bool param_gen_needed = !is_depth_only_fragment_shader_ && + GetPsParamGenInterpolator() != UINT32_MAX; + + if (!is_depth_only_fragment_shader_) { + uint32_t input_location = 0; + + // Interpolator inputs. + { + uint32_t interpolators_remaining = GetModificationInterpolatorMask(); + uint32_t interpolator_index; + while ( + xe::bit_scan_forward(interpolators_remaining, &interpolator_index)) { + interpolators_remaining &= ~(UINT32_C(1) << interpolator_index); + spv::Id interpolator = builder_->createVariable( + spv::NoPrecision, spv::StorageClassInput, type_float4_, + fmt::format("xe_in_interpolator_{}", interpolator_index).c_str()); + input_output_interpolators_[interpolator_index] = interpolator; + builder_->addDecoration(interpolator, spv::DecorationLocation, + int(input_location)); + if (shader_modification.pixel.interpolators_centroid & + (UINT32_C(1) << interpolator_index)) { + builder_->addDecoration(interpolator, spv::DecorationCentroid); + } + main_interface_.push_back(interpolator); + ++input_location; + } + } + + // Point coordinate input. + if (shader_modification.pixel.param_gen_point) { + if (param_gen_needed) { + input_point_coordinates_ = + builder_->createVariable(spv::NoPrecision, spv::StorageClassInput, + type_float2_, "xe_in_point_coordinates"); + builder_->addDecoration(input_point_coordinates_, + spv::DecorationLocation, int(input_location)); + main_interface_.push_back(input_point_coordinates_); } - main_interface_.push_back(interpolator); ++input_location; } } - bool param_gen_needed = GetPsParamGenInterpolator() != UINT32_MAX; - - // Point coordinate input. 
- if (shader_modification.pixel.param_gen_point) { - if (param_gen_needed) { - input_point_coordinates_ = - builder_->createVariable(spv::NoPrecision, spv::StorageClassInput, - type_float2_, "xe_in_point_coordinates"); - builder_->addDecoration(input_point_coordinates_, spv::DecorationLocation, - int(input_location)); - main_interface_.push_back(input_point_coordinates_); - } - ++input_location; - } - // Fragment coordinates. - // TODO(Triang3l): More conditions - fragment shader interlock render backend, - // alpha to coverage (if RT 0 is written, and there's no early depth / - // stencil), depth writing in the fragment shader (per-sample if supported). - if (param_gen_needed) { + // TODO(Triang3l): More conditions - alpha to coverage (if RT 0 is written, + // and there's no early depth / stencil), depth writing in the fragment shader + // (per-sample if supported). + if (edram_fragment_shader_interlock_ || param_gen_needed) { input_fragment_coordinates_ = builder_->createVariable( spv::NoPrecision, spv::StorageClassInput, type_float4_, "gl_FragCoord"); builder_->addDecoration(input_fragment_coordinates_, spv::DecorationBuiltIn, @@ -1733,9 +1858,9 @@ void SpirvShaderTranslator::StartFragmentShaderBeforeMain() { } // Is front facing. - // TODO(Triang3l): Needed for stencil in the fragment shader interlock render - // backend. - if (param_gen_needed && !GetSpirvShaderModification().pixel.param_gen_point) { + if (edram_fragment_shader_interlock_ || + (param_gen_needed && + !GetSpirvShaderModification().pixel.param_gen_point)) { input_front_facing_ = builder_->createVariable( spv::NoPrecision, spv::StorageClassInput, type_bool_, "gl_FrontFacing"); builder_->addDecoration(input_front_facing_, spv::DecorationBuiltIn, @@ -1743,33 +1868,165 @@ void SpirvShaderTranslator::StartFragmentShaderBeforeMain() { main_interface_.push_back(input_front_facing_); } - // Framebuffer attachment outputs. 
- std::fill(output_fragment_data_.begin(), output_fragment_data_.end(), - spv::NoResult); - static const char* const kFragmentDataNames[] = { - "xe_out_fragment_data_0", - "xe_out_fragment_data_1", - "xe_out_fragment_data_2", - "xe_out_fragment_data_3", - }; - uint32_t color_targets_remaining = current_shader().writes_color_targets(); - uint32_t color_target_index; - while (xe::bit_scan_forward(color_targets_remaining, &color_target_index)) { - color_targets_remaining &= ~(UINT32_C(1) << color_target_index); - spv::Id output_fragment_data_rt = builder_->createVariable( - spv::NoPrecision, spv::StorageClassOutput, type_float4_, - kFragmentDataNames[color_target_index]); - output_fragment_data_[color_target_index] = output_fragment_data_rt; - builder_->addDecoration(output_fragment_data_rt, spv::DecorationLocation, - int(color_target_index)); - // Make invariant as pixel shaders may be used for various precise - // computations. - builder_->addDecoration(output_fragment_data_rt, spv::DecorationInvariant); - main_interface_.push_back(output_fragment_data_rt); + // Sample mask input. + if (edram_fragment_shader_interlock_) { + // SampleMask depends on SampleRateShading in some SPIR-V revisions. + builder_->addCapability(spv::CapabilitySampleRateShading); + input_sample_mask_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassInput, + builder_->makeArrayType(type_int_, builder_->makeUintConstant(1), 0), + "gl_SampleMaskIn"); + builder_->addDecoration(input_sample_mask_, spv::DecorationFlat); + builder_->addDecoration(input_sample_mask_, spv::DecorationBuiltIn, + spv::BuiltInSampleMask); + main_interface_.push_back(input_sample_mask_); + } + + if (!is_depth_only_fragment_shader_) { + // Framebuffer color attachment outputs. 
+ if (!edram_fragment_shader_interlock_) { + std::fill(output_or_var_fragment_data_.begin(), + output_or_var_fragment_data_.end(), spv::NoResult); + static const char* const kFragmentDataOutputNames[] = { + "xe_out_fragment_data_0", + "xe_out_fragment_data_1", + "xe_out_fragment_data_2", + "xe_out_fragment_data_3", + }; + uint32_t color_targets_remaining = + current_shader().writes_color_targets(); + uint32_t color_target_index; + while ( + xe::bit_scan_forward(color_targets_remaining, &color_target_index)) { + color_targets_remaining &= ~(UINT32_C(1) << color_target_index); + spv::Id output_fragment_data_rt = builder_->createVariable( + spv::NoPrecision, spv::StorageClassOutput, type_float4_, + kFragmentDataOutputNames[color_target_index]); + output_or_var_fragment_data_[color_target_index] = + output_fragment_data_rt; + builder_->addDecoration(output_fragment_data_rt, + spv::DecorationLocation, + int(color_target_index)); + // Make invariant as pixel shaders may be used for various precise + // computations. + builder_->addDecoration(output_fragment_data_rt, + spv::DecorationInvariant); + main_interface_.push_back(output_fragment_data_rt); + } + } } } void SpirvShaderTranslator::StartFragmentShaderInMain() { + // Set up pixel killing from within the translated shader without affecting + // the control flow (unlike with OpKill), similarly to how pixel killing works + // on the Xenos, and also keeping a single critical section exit and return + // for safety across different Vulkan implementations with fragment shader + // interlock. + if (current_shader().kills_pixels()) { + if (features_.demote_to_helper_invocation) { + // TODO(Triang3l): Promoted to SPIR-V 1.6 - don't add the extension there. 
+ builder_->addExtension("SPV_EXT_demote_to_helper_invocation"); + builder_->addCapability(spv::CapabilityDemoteToHelperInvocationEXT); + } else { + var_main_kill_pixel_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_bool_, + "xe_var_kill_pixel", builder_->makeBoolConstant(false)); + } + // For killing with fragment shader interlock when demotion is supported, + // using OpIsHelperInvocationEXT to avoid allocating a variable in addition + // to the execution mask GPUs naturally have. + } + + if (edram_fragment_shader_interlock_) { + // Initialize color output variables with fragment shader interlock. + std::fill(output_or_var_fragment_data_.begin(), + output_or_var_fragment_data_.end(), spv::NoResult); + var_main_fsi_color_written_ = spv::NoResult; + uint32_t color_targets_written = current_shader().writes_color_targets(); + if (color_targets_written) { + static const char* const kFragmentDataVariableNames[] = { + "xe_var_fragment_data_0", + "xe_var_fragment_data_1", + "xe_var_fragment_data_2", + "xe_var_fragment_data_3", + }; + uint32_t color_targets_remaining = color_targets_written; + uint32_t color_target_index; + while ( + xe::bit_scan_forward(color_targets_remaining, &color_target_index)) { + color_targets_remaining &= ~(UINT32_C(1) << color_target_index); + output_or_var_fragment_data_[color_target_index] = + builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float4_, + kFragmentDataVariableNames[color_target_index], + const_float4_0_); + } + var_main_fsi_color_written_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_uint_, + "xe_var_fsi_color_written", const_uint_0_); + } + } + + if (edram_fragment_shader_interlock_ && FSI_IsDepthStencilEarly()) { + spv::Id msaa_samples = LoadMsaaSamplesFromFlags(); + FSI_LoadSampleMask(msaa_samples); + FSI_LoadEdramOffsets(msaa_samples); + builder_->createNoResultOp(spv::OpBeginInvocationInterlockEXT); + 
FSI_DepthStencilTest(msaa_samples, false); + if (!is_depth_only_fragment_shader_) { + // Skip the rest of the shader if the whole quad (due to derivatives) has + // failed the depth / stencil test, and there are no depth and stencil + // values to conditionally write after running the shader to check if + // samples don't additionally need to be discarded. + spv::Id quad_needs_execution = builder_->createBinOp( + spv::OpINotEqual, type_bool_, main_fsi_sample_mask_, const_uint_0_); + // TODO(Triang3l): Use GroupNonUniformQuad operations where supported. + // If none of the pixels in the quad passed the depth / stencil test, the + // value of (any samples covered ? 1.0f : 0.0f) for the current pixel will + // be 0.0f, and since it will be 0.0f in other pixels too, the derivatives + // will be zero as well. + builder_->addCapability(spv::CapabilityDerivativeControl); + // Query the horizontally adjacent pixel. + quad_needs_execution = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, quad_needs_execution, + builder_->createBinOp( + spv::OpFOrdNotEqual, type_bool_, + builder_->createUnaryOp( + spv::OpDPdxFine, type_float_, + builder_->createTriOp(spv::OpSelect, type_float_, + quad_needs_execution, const_float_1_, + const_float_0_)), + const_float_0_)); + // Query the vertically adjacent pair of pixels. 
+ quad_needs_execution = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, quad_needs_execution, + builder_->createBinOp( + spv::OpFOrdNotEqual, type_bool_, + builder_->createUnaryOp( + spv::OpDPdyCoarse, type_float_, + builder_->createTriOp(spv::OpSelect, type_float_, + quad_needs_execution, const_float_1_, + const_float_0_)), + const_float_0_)); + spv::Block& main_fsi_early_depth_stencil_execute_quad = + builder_->makeNewBlock(); + main_fsi_early_depth_stencil_execute_quad_merge_ = + &builder_->makeNewBlock(); + SpirvCreateSelectionMerge( + main_fsi_early_depth_stencil_execute_quad_merge_->getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch( + quad_needs_execution, &main_fsi_early_depth_stencil_execute_quad, + main_fsi_early_depth_stencil_execute_quad_merge_); + builder_->setBuildPoint(&main_fsi_early_depth_stencil_execute_quad); + } + } + + if (is_depth_only_fragment_shader_) { + return; + } + uint32_t param_gen_interpolator = GetPsParamGenInterpolator(); // Zero general-purpose registers to prevent crashes when the game @@ -1928,11 +2185,13 @@ void SpirvShaderTranslator::StartFragmentShaderInMain() { var_main_registers_, id_vector_temp_)); } - // Initialize the colors for safety. - for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) { - spv::Id output_fragment_data_rt = output_fragment_data_[i]; - if (output_fragment_data_rt != spv::NoResult) { - builder_->createStore(const_float4_0_, output_fragment_data_rt); + if (!edram_fragment_shader_interlock_) { + // Initialize the colors for safety. 
+ for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) { + spv::Id output_fragment_data_rt = output_or_var_fragment_data_[i]; + if (output_fragment_data_rt != spv::NoResult) { + builder_->createStore(const_float4_0_, output_fragment_data_rt); + } } } } @@ -2299,11 +2558,18 @@ void SpirvShaderTranslator::StoreResult(const InstructionResult& result, assert_true(is_pixel_shader()); assert_not_zero(used_write_mask); assert_true(current_shader().writes_color_target(result.storage_index)); - target_pointer = output_fragment_data_[result.storage_index]; - // May be spv::NoResult if the color output is explicitly removed due to - // an empty write mask without independent blending. - // TODO(Triang3l): Store the alpha of the first output in this case for - // alpha test and alpha to coverage. + target_pointer = output_or_var_fragment_data_[result.storage_index]; + if (edram_fragment_shader_interlock_) { + assert_true(var_main_fsi_color_written_ != spv::NoResult); + builder_->createStore( + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createLoad(var_main_fsi_color_written_, + spv::NoPrecision), + builder_->makeUintConstant(uint32_t(1) + << result.storage_index)), + var_main_fsi_color_written_); + } } break; default: // TODO(Triang3l): All storage targets. 
diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 3bcd342a3..d453aa329 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -96,6 +96,9 @@ class SpirvShaderTranslator : public ShaderTranslator { kSysFlag_WNotReciprocal_Shift, kSysFlag_PrimitivePolygonal_Shift, kSysFlag_PrimitiveLine_Shift, + kSysFlag_MsaaSamples_Shift, + kSysFlag_DepthFloat24_Shift = + kSysFlag_MsaaSamples_Shift + xenos::kMsaaSamplesBits, kSysFlag_AlphaPassIfLess_Shift, kSysFlag_AlphaPassIfEqual_Shift, kSysFlag_AlphaPassIfGreater_Shift, @@ -104,6 +107,26 @@ class SpirvShaderTranslator : public ShaderTranslator { kSysFlag_ConvertColor2ToGamma_Shift, kSysFlag_ConvertColor3ToGamma_Shift, + kSysFlag_FSIDepthStencil_Shift, + kSysFlag_FSIDepthPassIfLess_Shift, + kSysFlag_FSIDepthPassIfEqual_Shift, + kSysFlag_FSIDepthPassIfGreater_Shift, + // 1 to write new depth to the depth buffer, 0 to keep the old one if the + // depth test passes. + kSysFlag_FSIDepthWrite_Shift, + kSysFlag_FSIStencilTest_Shift, + // If the depth / stencil test has failed, but resulted in a stencil value + // that is different than the one currently in the depth buffer, write it + // anyway and don't run the rest of the shader (to check if the sample may + // be discarded some way) - use when alpha test and alpha to coverage are + // disabled. Ignored by the shader if not applicable to it (like if it has + // kill instructions or writes the depth output). + // TODO(Triang3l): Investigate replacement with an alpha-to-mask flag, + // checking `(flags & (alpha test | alpha to mask)) == (always | disabled)`, + // taking into account the potential relation with occlusion queries (but + // should be safe at least temporarily). 
+ kSysFlag_FSIDepthStencilEarlyWrite_Shift, + kSysFlag_Count, // For HostVertexShaderType kVertex, if fullDrawIndexUint32 is not @@ -127,6 +150,7 @@ class SpirvShaderTranslator : public ShaderTranslator { kSysFlag_WNotReciprocal = 1u << kSysFlag_WNotReciprocal_Shift, kSysFlag_PrimitivePolygonal = 1u << kSysFlag_PrimitivePolygonal_Shift, kSysFlag_PrimitiveLine = 1u << kSysFlag_PrimitiveLine_Shift, + kSysFlag_DepthFloat24 = 1u << kSysFlag_DepthFloat24_Shift, kSysFlag_AlphaPassIfLess = 1u << kSysFlag_AlphaPassIfLess_Shift, kSysFlag_AlphaPassIfEqual = 1u << kSysFlag_AlphaPassIfEqual_Shift, kSysFlag_AlphaPassIfGreater = 1u << kSysFlag_AlphaPassIfGreater_Shift, @@ -134,6 +158,14 @@ class SpirvShaderTranslator : public ShaderTranslator { kSysFlag_ConvertColor1ToGamma = 1u << kSysFlag_ConvertColor1ToGamma_Shift, kSysFlag_ConvertColor2ToGamma = 1u << kSysFlag_ConvertColor2ToGamma_Shift, kSysFlag_ConvertColor3ToGamma = 1u << kSysFlag_ConvertColor3ToGamma_Shift, + kSysFlag_FSIDepthStencil = 1u << kSysFlag_FSIDepthStencil_Shift, + kSysFlag_FSIDepthPassIfLess = 1u << kSysFlag_FSIDepthPassIfLess_Shift, + kSysFlag_FSIDepthPassIfEqual = 1u << kSysFlag_FSIDepthPassIfEqual_Shift, + kSysFlag_FSIDepthPassIfGreater = 1u << kSysFlag_FSIDepthPassIfGreater_Shift, + kSysFlag_FSIDepthWrite = 1u << kSysFlag_FSIDepthWrite_Shift, + kSysFlag_FSIStencilTest = 1u << kSysFlag_FSIStencilTest_Shift, + kSysFlag_FSIDepthStencilEarlyWrite = + 1u << kSysFlag_FSIDepthStencilEarlyWrite_Shift, }; static_assert(kSysFlag_Count <= 32, "Too many flags in the system constants"); @@ -171,9 +203,55 @@ class SpirvShaderTranslator : public ShaderTranslator { uint32_t texture_swizzles[16]; float alpha_test_reference; - float padding_alpha_test_reference[3]; + uint32_t edram_32bpp_tile_pitch_dwords_scaled; + uint32_t edram_depth_base_dwords_scaled; + float padding_edram_depth_base_dwords_scaled; float color_exp_bias[4]; + + float edram_poly_offset_front_scale; + float edram_poly_offset_back_scale; + float 
edram_poly_offset_front_offset; + float edram_poly_offset_back_offset; + + union { + struct { + uint32_t edram_stencil_front_reference_masks; + uint32_t edram_stencil_front_func_ops; + + uint32_t edram_stencil_back_reference_masks; + uint32_t edram_stencil_back_func_ops; + }; + struct { + uint32_t edram_stencil_front[2]; + uint32_t edram_stencil_back[2]; + }; + }; + + uint32_t edram_rt_base_dwords_scaled[4]; + + // RT format combined with RenderTargetCache::kPSIColorFormatFlag values + // (pass via RenderTargetCache::AddPSIColorFormatFlags). + uint32_t edram_rt_format_flags[4]; + + // Render target blending options - RB_BLENDCONTROL, with only the relevant + // options (factors and operations - AND 0x1FFF1FFF). If 0x00010001 + // (1 * src + 0 * dst), blending is disabled for the render target. + uint32_t edram_rt_blend_factors_ops[4]; + + // Format info - mask to apply to the old packed RT data, and to apply as + // inverted to the new packed data, before storing (more or less the inverse + // of the write mask packed like render target channels). This can be used + // to bypass unpacking if blending is not used. If 0 and not blending, + // reading the old data from the EDRAM buffer is not required. + uint32_t edram_rt_keep_mask[4][2]; + + // Format info - values to clamp the color to before blending or storing. + // Low color, low alpha, high color, high alpha. + float edram_rt_clamp[4][4]; + + // The constant blend factor for the respective modes. 
+ float edram_blend_constant[4];
 };
 enum ConstantBuffer : uint32_t {
@@ -248,12 +326,22 @@ class SpirvShaderTranslator : public ShaderTranslator {
 uint32_t max_storage_buffer_range;
 bool clip_distance;
 bool cull_distance;
+ bool demote_to_helper_invocation;
+ bool fragment_shader_sample_interlock;
 bool full_draw_index_uint32;
 bool image_view_format_swizzle;
 bool signed_zero_inf_nan_preserve_float32;
 bool denorm_flush_to_zero_float32;
 };
- SpirvShaderTranslator(const Features& features);
+
+ SpirvShaderTranslator(const Features& features,
+ bool native_2x_msaa_with_attachments,
+ bool native_2x_msaa_no_attachments,
+ bool edram_fragment_shader_interlock)
+ : features_(features),
+ native_2x_msaa_with_attachments_(native_2x_msaa_with_attachments),
+ native_2x_msaa_no_attachments_(native_2x_msaa_no_attachments),
+ edram_fragment_shader_interlock_(edram_fragment_shader_interlock) {}
 uint64_t GetDefaultVertexShaderModification(
 uint32_t dynamic_addressable_register_count,
@@ -277,6 +365,10 @@ class SpirvShaderTranslator : public ShaderTranslator {
 features_.max_storage_buffer_range);
 }
+ // Creates a special fragment shader without color outputs - this resets the
+ // state of the translator.
+ std::vector<uint8_t> CreateDepthOnlyFragmentShader();
+
 // Common functions useful not only for the translator, but also for EDRAM
 // emulation via conventional render targets.
@@ -385,10 +477,10 @@ class SpirvShaderTranslator : public ShaderTranslator {
 }
 bool IsExecutionModeEarlyFragmentTests() const {
- // TODO(Triang3l): Not applicable to fragment shader interlock.
 return is_pixel_shader() &&
 GetSpirvShaderModification().pixel.depth_stencil_mode ==
 Modification::DepthStencilMode::kEarlyHint &&
+ !edram_fragment_shader_interlock_ &&
 current_shader().implicit_early_z_write_allowed();
 }
@@ -528,7 +620,72 @@ class SpirvShaderTranslator : public ShaderTranslator {
 spv::Id image_unsigned, spv::Id image_signed,
 spv::Id sampler, spv::Id is_all_signed);
+ spv::Id LoadMsaaSamplesFromFlags();
+ // Whether it's possible and worth skipping running the translated shader for
+ // 2x2 quads.
+ bool FSI_IsDepthStencilEarly() const {
+ assert_true(edram_fragment_shader_interlock_);
+ return !is_depth_only_fragment_shader_ &&
+ !current_shader().writes_depth() &&
+ !current_shader().is_valid_memexport_used();
+ }
+ void FSI_LoadSampleMask(spv::Id msaa_samples);
+ void FSI_LoadEdramOffsets(spv::Id msaa_samples);
+ // The address must be a signed int. Whether the render target is 64bpp, if
+ // present at all, must be a bool (if it's NoResult, 32bpp will be assumed).
+ spv::Id FSI_AddSampleOffset(spv::Id sample_0_address, uint32_t sample_index,
+ spv::Id is_64bpp = spv::NoResult);
+ // Updates main_fsi_sample_mask_. Must be called outside non-uniform control
+ // flow because of taking derivatives of the fragment depth.
+ void FSI_DepthStencilTest(spv::Id msaa_samples,
+ bool sample_mask_potentially_narrowed_previously);
+ // Returns the first and the second 32 bits as two uints.
+ std::array<spv::Id, 2> FSI_ClampAndPackColor(spv::Id color_float4,
+ spv::Id format_with_flags);
+ std::array<spv::Id, 2> FSI_UnpackColor(std::array<spv::Id, 2> color_packed,
+ spv::Id format_with_flags);
+ // The bounds must have the same number of components as the color or alpha.
+ spv::Id FSI_FlushNaNClampAndInBlending(spv::Id color_or_alpha, + spv::Id is_fixed_point, + spv::Id min_value, spv::Id max_value); + spv::Id FSI_ApplyColorBlendFactor(spv::Id value, spv::Id is_fixed_point, + spv::Id clamp_min_value, + spv::Id clamp_max_value, spv::Id factor, + spv::Id source_color, spv::Id source_alpha, + spv::Id dest_color, spv::Id dest_alpha, + spv::Id constant_color, + spv::Id constant_alpha); + spv::Id FSI_ApplyAlphaBlendFactor(spv::Id value, spv::Id is_fixed_point, + spv::Id clamp_min_value, + spv::Id clamp_max_value, spv::Id factor, + spv::Id source_alpha, spv::Id dest_alpha, + spv::Id constant_alpha); + // If source_color_clamped, dest_color, constant_color_clamped are + // spv::NoResult, will blend the alpha. Otherwise, will blend the color. + // The result will be unclamped (color packing is supposed to clamp it). + spv::Id FSI_BlendColorOrAlphaWithUnclampedResult( + spv::Id is_fixed_point, spv::Id clamp_min_value, spv::Id clamp_max_value, + spv::Id source_color_clamped, spv::Id source_alpha_clamped, + spv::Id dest_color, spv::Id dest_alpha, spv::Id constant_color_clamped, + spv::Id constant_alpha_clamped, spv::Id equation, spv::Id source_factor, + spv::Id dest_factor); + Features features_; + bool native_2x_msaa_with_attachments_; + bool native_2x_msaa_no_attachments_; + + // For safety with different drivers (even though fragment shader interlock in + // SPIR-V only has one control flow requirement - that both begin and end must + // be dynamically executed exactly once in this order), adhering to the more + // strict control flow limitations of OpenGL (GLSL) fragment shader interlock, + // that begin and end are called only on the outermost level of the control + // flow of the main function, and that there are no returns before either + // (there's a single return from the shader). 
+ bool edram_fragment_shader_interlock_;
+
+ // Is currently writing the empty depth-only pixel shader, such as for depth
+ // and stencil testing with fragment shader interlock.
+ bool is_depth_only_fragment_shader_ = false;
 std::unique_ptr<spv::Builder> builder_;
@@ -621,7 +778,23 @@ class SpirvShaderTranslator : public ShaderTranslator {
 kSystemConstantTextureSwizzledSigns,
 kSystemConstantTextureSwizzles,
 kSystemConstantAlphaTestReference,
+ kSystemConstantEdram32bppTilePitchDwordsScaled,
+ kSystemConstantEdramDepthBaseDwordsScaled,
 kSystemConstantColorExpBias,
+ kSystemConstantEdramPolyOffsetFrontScale,
+ kSystemConstantEdramPolyOffsetBackScale,
+ kSystemConstantEdramPolyOffsetFrontOffset,
+ kSystemConstantEdramPolyOffsetBackOffset,
+ kSystemConstantEdramStencilFront,
+ kSystemConstantEdramStencilBack,
+ kSystemConstantEdramRTBaseDwordsScaled,
+ kSystemConstantEdramRTFormatFlags,
+ kSystemConstantEdramRTBlendFactorsOps,
+ // Accessed as float4[2], not float2[4], due to std140 array stride
+ // alignment.
+ kSystemConstantEdramRTKeepMask,
+ kSystemConstantEdramRTClamp,
+ kSystemConstantEdramBlendConstant,
 };
 spv::Id uniform_system_constants_;
 spv::Id uniform_float_constants_;
@@ -629,6 +802,7 @@ class SpirvShaderTranslator : public ShaderTranslator {
 spv::Id uniform_fetch_constants_;
 spv::Id buffers_shared_memory_;
+ spv::Id buffer_edram_;
 // Not using combined images and samplers because
 // maxPerStageDescriptorSamplers is often lower than
@@ -647,6 +821,8 @@ class SpirvShaderTranslator : public ShaderTranslator {
 spv::Id input_fragment_coordinates_;
 // PS, only when needed - bool.
 spv::Id input_front_facing_;
+ // PS, only when needed - int[1].
+ spv::Id input_sample_mask_;
 // VS output or PS input, only the ones that are needed (spv::NoResult for the
 // unneeded interpolators), indexed by the guest interpolator index - float4.
@@ -671,7 +847,10 @@ class SpirvShaderTranslator : public ShaderTranslator {
 };
 spv::Id output_per_vertex_;
- std::array<spv::Id, xenos::kMaxColorRenderTargets> output_fragment_data_;
+ // With fragment shader interlock, variables in the main function.
+ // Otherwise, framebuffer color attachment outputs.
+ std::array<spv::Id, xenos::kMaxColorRenderTargets>
+ output_or_var_fragment_data_;
 std::vector<spv::Id> main_interface_;
 spv::Function* function_main_;
@@ -698,6 +877,40 @@ class SpirvShaderTranslator : public ShaderTranslator {
 spv::Id var_main_registers_;
 // VS only - float3 (special exports).
 spv::Id var_main_point_size_edge_flag_kill_vertex_;
+ // PS, only when needed - bool.
+ spv::Id var_main_kill_pixel_;
+ // PS, only when writing to color render targets with fragment shader
+ // interlock - uint.
+ // Whether color buffers have been written to, if not written on the taken
+ // execution path, don't export according to Direct3D 9 register documentation
+ // (some games rely on this behavior).
+ spv::Id var_main_fsi_color_written_;
+ // Loaded by FSI_LoadSampleMask.
+ // Can be modified on the outermost control flow level in the main function.
+ // 0:3 - Per-sample coverage at the current stage of the shader's execution.
+ // Affected by things like gl_SampleMaskIn, early or late depth /
+ // stencil (always resets bits for failing, no matter if need to defer
+ // writing), alpha to coverage.
+ // 4:7 - Depth write deferred mask - when early depth / stencil resulted in a
+ // different value for the sample (like different stencil if the test
+ // failed), but can't write it before running the shader because it's
+ // not known if the sample will be discarded by the shader, alphatest or
+ // AtoC.
+ // Early depth / stencil rejection of the pixel is possible when both 0:3 and
+ // 4:7 are zero.
+ spv::Id main_fsi_sample_mask_;
+ // Loaded by FSI_LoadEdramOffsets.
+ // Including the depth render target base.
+ spv::Id main_fsi_address_depth_;
+ // Not including the render target base.
+ spv::Id main_fsi_offset_32bpp_;
+ spv::Id main_fsi_offset_64bpp_;
+ // Loaded by FSI_DepthStencilTest for early depth / stencil, the depth /
+ // stencil values to write at the end of the shader if specified in
+ // main_fsi_sample_mask_ and if the samples were not discarded later after the
+ // early test.
+ std::array<spv::Id, 2> main_fsi_late_write_depth_stencil_;
+ spv::Block* main_fsi_early_depth_stencil_execute_quad_merge_;
 spv::Block* main_loop_header_;
 spv::Block* main_loop_continue_;
 spv::Block* main_loop_merge_;
diff --git a/src/xenia/gpu/spirv_shader_translator_alu.cc b/src/xenia/gpu/spirv_shader_translator_alu.cc
index 9dfbccb09..7188258e9 100644
--- a/src/xenia/gpu/spirv_shader_translator_alu.cc
+++ b/src/xenia/gpu/spirv_shader_translator_alu.cc
@@ -123,7 +123,7 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation(
 : spv::NoType;
 // In case the paired scalar instruction (if processed first) terminates the
- // block (like via OpKill).
+ // block.
 EnsureBuildPointAvailable();
 // Lookup table for variants of instructions with similar structure.
@@ -838,9 +838,15 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation(
 SpirvCreateSelectionMerge(merge_block.getId());
 builder_->createConditionalBranch(condition, &kill_block, &merge_block);
 builder_->setBuildPoint(&kill_block);
- // TODO(Triang3l): Demote to helper invocation to keep derivatives if
- // needed (and return 1 if killed in this case).
- builder_->createNoResultOp(spv::OpKill);
+ // Kill without influencing the control flow in the translated shader.
+ if (var_main_kill_pixel_ != spv::NoResult) { + builder_->createStore(builder_->makeBoolConstant(true), + var_main_kill_pixel_); + } + if (features_.demote_to_helper_invocation) { + builder_->createNoResultOp(spv::OpDemoteToHelperInvocationEXT); + } + builder_->createBranch(&merge_block); builder_->setBuildPoint(&merge_block); return const_float_0_; } @@ -938,7 +944,7 @@ spv::Id SpirvShaderTranslator::ProcessScalarAluOperation( } // In case the paired vector instruction (if processed first) terminates the - // block (like via OpKill). + // block. EnsureBuildPointAvailable(); // Lookup table for variants of instructions with similar structure. @@ -1393,9 +1399,15 @@ spv::Id SpirvShaderTranslator::ProcessScalarAluOperation( SpirvCreateSelectionMerge(merge_block.getId()); builder_->createConditionalBranch(condition, &kill_block, &merge_block); builder_->setBuildPoint(&kill_block); - // TODO(Triang3l): Demote to helper invocation to keep derivatives if - // needed (and return 1 if killed in this case). - builder_->createNoResultOp(spv::OpKill); + // Kill without influencing the control flow in the translated shader. + if (var_main_kill_pixel_ != spv::NoResult) { + builder_->createStore(builder_->makeBoolConstant(true), + var_main_kill_pixel_); + } + if (features_.demote_to_helper_invocation) { + builder_->createNoResultOp(spv::OpDemoteToHelperInvocationEXT); + } + builder_->createBranch(&merge_block); builder_->setBuildPoint(&merge_block); return const_float_0_; } diff --git a/src/xenia/gpu/spirv_shader_translator_fetch.cc b/src/xenia/gpu/spirv_shader_translator_fetch.cc index 88d3bd5ab..c9655c64f 100644 --- a/src/xenia/gpu/spirv_shader_translator_fetch.cc +++ b/src/xenia/gpu/spirv_shader_translator_fetch.cc @@ -1898,30 +1898,14 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( builder_->setBuildPoint(&block_dimension_stacked_start); if (use_computed_lod) { // Extract 2D gradients for stacked textures which are 2D arrays. 
- {
- std::unique_ptr<spv::Instruction> shuffle_op =
- std::make_unique<spv::Instruction>(builder_->getUniqueId(),
- type_float2_,
- spv::OpVectorShuffle);
- shuffle_op->addIdOperand(gradients_h);
- shuffle_op->addIdOperand(gradients_h);
- shuffle_op->addImmediateOperand(0);
- shuffle_op->addImmediateOperand(1);
- texture_parameters.gradX = shuffle_op->getResultId();
- builder_->getBuildPoint()->addInstruction(std::move(shuffle_op));
- }
- {
- std::unique_ptr<spv::Instruction> shuffle_op =
- std::make_unique<spv::Instruction>(builder_->getUniqueId(),
- type_float2_,
- spv::OpVectorShuffle);
- shuffle_op->addIdOperand(gradients_v);
- shuffle_op->addIdOperand(gradients_v);
- shuffle_op->addImmediateOperand(0);
- shuffle_op->addImmediateOperand(1);
- texture_parameters.gradY = shuffle_op->getResultId();
- builder_->getBuildPoint()->addInstruction(std::move(shuffle_op));
- }
+ uint_vector_temp_.clear();
+ uint_vector_temp_.reserve(2);
+ uint_vector_temp_.push_back(0);
+ uint_vector_temp_.push_back(1);
+ texture_parameters.gradX = builder_->createRvalueSwizzle(
+ spv::NoPrecision, type_float2_, gradients_h, uint_vector_temp_);
+ texture_parameters.gradY = builder_->createRvalueSwizzle(
+ spv::NoPrecision, type_float2_, gradients_v, uint_vector_temp_);
 }
 // Check if linear filtering is needed.
bool vol_mag_filter_is_fetch_const = diff --git a/src/xenia/gpu/spirv_shader_translator_rb.cc b/src/xenia/gpu/spirv_shader_translator_rb.cc index c594a902f..f710ad7a5 100644 --- a/src/xenia/gpu/spirv_shader_translator_rb.cc +++ b/src/xenia/gpu/spirv_shader_translator_rb.cc @@ -16,6 +16,8 @@ #include "third_party/glslang/SPIRV/GLSL.std.450.h" #include "xenia/base/assert.h" #include "xenia/base/math.h" +#include "xenia/gpu/draw_util.h" +#include "xenia/gpu/render_target_cache.h" namespace xe { namespace gpu { @@ -426,15 +428,102 @@ spv::Id SpirvShaderTranslator::Depth20e4To32(spv::Builder& builder, } void SpirvShaderTranslator::CompleteFragmentShaderInMain() { - id_vector_temp_.clear(); - id_vector_temp_.push_back(builder_->makeIntConstant(kSystemConstantFlags)); - spv::Id system_constant_flags = builder_->createLoad( - builder_->createAccessChain(spv::StorageClassUniform, - uniform_system_constants_, id_vector_temp_), - spv::NoPrecision); + // Loaded if needed. + spv::Id msaa_samples = spv::NoResult; + + if (edram_fragment_shader_interlock_ && !FSI_IsDepthStencilEarly()) { + if (msaa_samples == spv::NoResult) { + msaa_samples = LoadMsaaSamplesFromFlags(); + } + // Load the sample mask, which may be modified later by killing from + // different sources, if not loaded already. + FSI_LoadSampleMask(msaa_samples); + } + + bool fsi_pixel_potentially_killed = false; + + if (current_shader().kills_pixels()) { + if (edram_fragment_shader_interlock_) { + fsi_pixel_potentially_killed = true; + if (!features_.demote_to_helper_invocation) { + assert_true(var_main_kill_pixel_ != spv::NoResult); + main_fsi_sample_mask_ = builder_->createTriOp( + spv::OpSelect, type_uint_, + builder_->createLoad(var_main_kill_pixel_, spv::NoPrecision), + const_uint_0_, main_fsi_sample_mask_); + } + } else { + if (!features_.demote_to_helper_invocation) { + // Kill the pixel once the guest control flow and derivatives are not + // needed anymore. 
+ assert_true(var_main_kill_pixel_ != spv::NoResult); + // Load the condition before the OpSelectionMerge, which must be the + // penultimate instruction. + spv::Id kill_pixel = + builder_->createLoad(var_main_kill_pixel_, spv::NoPrecision); + spv::Block& block_kill = builder_->makeNewBlock(); + spv::Block& block_kill_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_kill_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(kill_pixel, &block_kill, + &block_kill_merge); + builder_->setBuildPoint(&block_kill); + // TODO(Triang3l): Use OpTerminateInvocation when SPIR-V 1.6 is + // targeted. + builder_->createNoResultOp(spv::OpKill); + // OpKill terminates the block. + builder_->setBuildPoint(&block_kill_merge); + } + } + } + + uint32_t color_targets_written = current_shader().writes_color_targets(); + + if ((color_targets_written & 0b1) && !IsExecutionModeEarlyFragmentTests()) { + spv::Id fsi_sample_mask_in_rt_0_alpha_tests = spv::NoResult; + spv::Block* block_fsi_rt_0_alpha_tests_rt_written_head = nullptr; + spv::Block* block_fsi_rt_0_alpha_tests_rt_written_merge = nullptr; + builder_->makeNewBlock(); + if (edram_fragment_shader_interlock_) { + // Skip the alpha test and alpha to coverage if the render target 0 is not + // written to dynamically. 
+ fsi_sample_mask_in_rt_0_alpha_tests = main_fsi_sample_mask_; + spv::Id rt_0_written = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, + builder_->createLoad(var_main_fsi_color_written_, + spv::NoPrecision), + builder_->makeUintConstant(0b1)), + const_uint_0_); + block_fsi_rt_0_alpha_tests_rt_written_head = builder_->getBuildPoint(); + spv::Block& block_fsi_rt_0_alpha_tests_rt_written = + builder_->makeNewBlock(); + block_fsi_rt_0_alpha_tests_rt_written_merge = &builder_->makeNewBlock(); + SpirvCreateSelectionMerge( + block_fsi_rt_0_alpha_tests_rt_written_merge->getId(), + spv::SelectionControlDontFlattenMask); + { + std::unique_ptr rt_0_written_branch_conditional_op = + std::make_unique(spv::OpBranchConditional); + rt_0_written_branch_conditional_op->addIdOperand(rt_0_written); + rt_0_written_branch_conditional_op->addIdOperand( + block_fsi_rt_0_alpha_tests_rt_written.getId()); + rt_0_written_branch_conditional_op->addIdOperand( + block_fsi_rt_0_alpha_tests_rt_written_merge->getId()); + // More likely to write to the render target 0 than not. + rt_0_written_branch_conditional_op->addImmediateOperand(2); + rt_0_written_branch_conditional_op->addImmediateOperand(1); + builder_->getBuildPoint()->addInstruction( + std::move(rt_0_written_branch_conditional_op)); + } + block_fsi_rt_0_alpha_tests_rt_written.addPredecessor( + block_fsi_rt_0_alpha_tests_rt_written_head); + block_fsi_rt_0_alpha_tests_rt_written_merge->addPredecessor( + block_fsi_rt_0_alpha_tests_rt_written_head); + builder_->setBuildPoint(&block_fsi_rt_0_alpha_tests_rt_written); + } - if (current_shader().writes_color_target(0) && - !IsExecutionModeEarlyFragmentTests()) { // Alpha test. // TODO(Triang3l): Check how alpha test works with NaN on Direct3D 9. // Extract the comparison function (less, equal, greater bits). 
@@ -458,11 +547,12 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { { id_vector_temp_.clear(); id_vector_temp_.push_back(builder_->makeIntConstant(3)); - spv::Id alpha_test_alpha = - builder_->createLoad(builder_->createAccessChain( - spv::StorageClassOutput, - output_fragment_data_[0], id_vector_temp_), - spv::NoPrecision); + spv::Id alpha_test_alpha = builder_->createLoad( + builder_->createAccessChain( + edram_fragment_shader_interlock_ ? spv::StorageClassFunction + : spv::StorageClassOutput, + output_or_var_fragment_data_[0], id_vector_temp_), + spv::NoPrecision); id_vector_temp_.clear(); id_vector_temp_.push_back( builder_->makeIntConstant(kSystemConstantAlphaTestReference)); @@ -522,126 +612,3297 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { builder_->createBranch(&block_alpha_test_not_equal_merge); } builder_->setBuildPoint(&block_alpha_test_not_equal_merge); - spv::Id alpha_test_result; - { - std::unique_ptr alpha_test_result_phi_op = - std::make_unique(builder_->getUniqueId(), - type_bool_, spv::OpPhi); - alpha_test_result_phi_op->addIdOperand(alpha_test_result_not_equal); - alpha_test_result_phi_op->addIdOperand( - block_alpha_test_not_equal.getId()); - alpha_test_result_phi_op->addIdOperand(alpha_test_result_non_not_equal); - alpha_test_result_phi_op->addIdOperand( - block_alpha_test_non_not_equal.getId()); - alpha_test_result = alpha_test_result_phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(alpha_test_result_phi_op)); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(alpha_test_result_not_equal); + id_vector_temp_.push_back(block_alpha_test_not_equal.getId()); + id_vector_temp_.push_back(alpha_test_result_non_not_equal); + id_vector_temp_.push_back(block_alpha_test_non_not_equal.getId()); + spv::Id alpha_test_result = + builder_->createOp(spv::OpPhi, type_bool_, id_vector_temp_); + // Discard the pixel if the alpha test has failed. 
+ if (edram_fragment_shader_interlock_ && + !features_.demote_to_helper_invocation) { + fsi_pixel_potentially_killed = true; + fsi_sample_mask_in_rt_0_alpha_tests = builder_->createTriOp( + spv::OpSelect, type_uint_, alpha_test_result, + fsi_sample_mask_in_rt_0_alpha_tests, const_uint_0_); + } else { + // Creating a merge block even though it will contain just one OpBranch + // since SPIR-V requires structured control flow in shaders. + spv::Block& block_alpha_test_kill = builder_->makeNewBlock(); + spv::Block& block_alpha_test_kill_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_alpha_test_kill_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(alpha_test_result, + &block_alpha_test_kill_merge, + &block_alpha_test_kill); + builder_->setBuildPoint(&block_alpha_test_kill); + if (edram_fragment_shader_interlock_) { + assert_true(features_.demote_to_helper_invocation); + fsi_pixel_potentially_killed = true; + // TODO(Triang3l): Promoted to SPIR-V 1.6 - don't add the extension + // there. + builder_->addExtension("SPV_EXT_demote_to_helper_invocation"); + builder_->addCapability(spv::CapabilityDemoteToHelperInvocationEXT); + builder_->createNoResultOp(spv::OpDemoteToHelperInvocationEXT); + builder_->createBranch(&block_alpha_test_kill_merge); + } else { + // TODO(Triang3l): Use OpTerminateInvocation when SPIR-V 1.6 is + // targeted. + builder_->createNoResultOp(spv::OpKill); + // OpKill terminates the block. + } + builder_->setBuildPoint(&block_alpha_test_kill_merge); + builder_->createBranch(&block_alpha_test_merge); } - // Discard the pixel if the alpha test has failed. Creating a merge block - // even though it will contain just one OpBranch since SPIR-V requires - // structured control flow in shaders. 
- spv::Block& block_alpha_test_kill = builder_->makeNewBlock(); - spv::Block& block_alpha_test_kill_merge = builder_->makeNewBlock(); - SpirvCreateSelectionMerge(block_alpha_test_kill_merge.getId(), - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(alpha_test_result, - &block_alpha_test_kill_merge, - &block_alpha_test_kill); - builder_->setBuildPoint(&block_alpha_test_kill); - builder_->createNoResultOp(spv::OpKill); - // OpKill terminates the block. - builder_->setBuildPoint(&block_alpha_test_kill_merge); - builder_->createBranch(&block_alpha_test_merge); } builder_->setBuildPoint(&block_alpha_test_merge); + + // TODO(Triang3l): Alpha to coverage. + + if (edram_fragment_shader_interlock_) { + // Close the render target 0 written check. + builder_->createBranch(block_fsi_rt_0_alpha_tests_rt_written_merge); + spv::Block& block_fsi_rt_0_alpha_tests_rt_written_end = + *builder_->getBuildPoint(); + builder_->setBuildPoint(block_fsi_rt_0_alpha_tests_rt_written_merge); + if (!features_.demote_to_helper_invocation) { + // The tests might have modified the sample mask via + // fsi_sample_mask_in_rt_0_alpha_tests. 
+ id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(fsi_sample_mask_in_rt_0_alpha_tests); + id_vector_temp_.push_back( + block_fsi_rt_0_alpha_tests_rt_written_end.getId()); + id_vector_temp_.push_back(main_fsi_sample_mask_); + id_vector_temp_.push_back( + block_fsi_rt_0_alpha_tests_rt_written_head->getId()); + main_fsi_sample_mask_ = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + } + } } - uint32_t color_targets_remaining = current_shader().writes_color_targets(); - uint32_t color_target_index; - while (xe::bit_scan_forward(color_targets_remaining, &color_target_index)) { - color_targets_remaining &= ~(UINT32_C(1) << color_target_index); - spv::Id color_variable = output_fragment_data_[color_target_index]; - spv::Id color = builder_->createLoad(color_variable, spv::NoPrecision); + spv::Block* block_fsi_if_after_kill = nullptr; + spv::Block* block_fsi_if_after_kill_merge = nullptr; - // Apply the exponent bias after the alpha test and alpha to coverage - // because they need the unbiased alpha from the shader. + spv::Block* block_fsi_if_after_depth_stencil = nullptr; + spv::Block* block_fsi_if_after_depth_stencil_merge = nullptr; + + if (edram_fragment_shader_interlock_) { + if (fsi_pixel_potentially_killed) { + if (features_.demote_to_helper_invocation) { + // Don't do anything related to writing to the EDRAM if the pixel was + // killed. + id_vector_temp_.clear(); + // TODO(Triang3l): Use HelperInvocation volatile load on SPIR-V 1.6. + main_fsi_sample_mask_ = builder_->createTriOp( + spv::OpSelect, type_uint_, + builder_->createOp(spv::OpIsHelperInvocationEXT, type_bool_, + id_vector_temp_), + const_uint_0_, main_fsi_sample_mask_); + } + // Check the condition before the OpSelectionMerge, which must be the + // penultimate instruction in a block. 
+ spv::Id pixel_not_killed = builder_->createBinOp( + spv::OpINotEqual, type_bool_, main_fsi_sample_mask_, const_uint_0_); + block_fsi_if_after_kill = &builder_->makeNewBlock(); + block_fsi_if_after_kill_merge = &builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_fsi_if_after_kill_merge->getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(pixel_not_killed, + block_fsi_if_after_kill, + block_fsi_if_after_kill_merge); + builder_->setBuildPoint(block_fsi_if_after_kill); + } + + spv::Id color_write_depth_stencil_condition = spv::NoResult; + if (FSI_IsDepthStencilEarly()) { + // Perform late depth / stencil writes for samples not discarded. + for (uint32_t i = 0; i < 4; ++i) { + spv::Id sample_late_depth_stencil_write_needed = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_fsi_sample_mask_, + builder_->makeUintConstant(uint32_t(1) << (4 + i))), + const_uint_0_); + spv::Block& block_sample_late_depth_stencil_write = + builder_->makeNewBlock(); + spv::Block& block_sample_late_depth_stencil_write_merge = + builder_->makeNewBlock(); + SpirvCreateSelectionMerge( + block_sample_late_depth_stencil_write_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch( + sample_late_depth_stencil_write_needed, + &block_sample_late_depth_stencil_write, + &block_sample_late_depth_stencil_write_merge); + builder_->setBuildPoint(&block_sample_late_depth_stencil_write); + spv::Id depth_stencil_sample_address = + FSI_AddSampleOffset(main_fsi_address_depth_, i); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + // First SSBO structure element. + id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(depth_stencil_sample_address); + builder_->createStore( + main_fsi_late_write_depth_stencil_[i], + builder_->createAccessChain(features_.spirv_version >= spv::Spv_1_3 + ? 
spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_)); + builder_->createBranch(&block_sample_late_depth_stencil_write_merge); + builder_->setBuildPoint(&block_sample_late_depth_stencil_write_merge); + } + if (color_targets_written) { + // Only take the remaining coverage bits, not the late depth / stencil + // write bits, into account in the check whether anything needs to be + // done for the color targets. + color_write_depth_stencil_condition = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_fsi_sample_mask_, + builder_->makeUintConstant((uint32_t(1) << 4) - 1)), + const_uint_0_); + } + } else { + if (msaa_samples == spv::NoResult) { + msaa_samples = LoadMsaaSamplesFromFlags(); + } + FSI_LoadEdramOffsets(msaa_samples); + // Begin the critical section on the outermost control flow level so it's + // entered exactly once on any control flow path as required by the SPIR-V + // extension specification. + builder_->createNoResultOp(spv::OpBeginInvocationInterlockEXT); + // Do the depth / stencil test. + // The sample mask might have been made narrower than the initially loaded + // mask by various conditions that discard the whole pixel, as well as by + // alpha to coverage. + FSI_DepthStencilTest(msaa_samples, fsi_pixel_potentially_killed || + (color_targets_written & 0b1)); + if (color_targets_written) { + // Only bits 0:3 of main_fsi_sample_mask_ are written by the late + // depth / stencil test. + color_write_depth_stencil_condition = builder_->createBinOp( + spv::OpINotEqual, type_bool_, main_fsi_sample_mask_, const_uint_0_); + } + } + + if (color_write_depth_stencil_condition != spv::NoResult) { + // Skip all color operations if the pixel has failed the tests entirely. 
+ block_fsi_if_after_depth_stencil = &builder_->makeNewBlock(); + block_fsi_if_after_depth_stencil_merge = &builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_fsi_if_after_depth_stencil_merge->getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(color_write_depth_stencil_condition, + block_fsi_if_after_depth_stencil, + block_fsi_if_after_depth_stencil_merge); + builder_->setBuildPoint(block_fsi_if_after_depth_stencil); + } + } + + if (color_targets_written) { + spv::Id fsi_color_targets_written = spv::NoResult; + spv::Id fsi_const_int_1 = spv::NoResult; + spv::Id fsi_const_edram_size_dwords = spv::NoResult; + spv::Id fsi_samples_covered[4] = {}; + if (edram_fragment_shader_interlock_) { + fsi_color_targets_written = + builder_->createLoad(var_main_fsi_color_written_, spv::NoPrecision); + fsi_const_int_1 = builder_->makeIntConstant(1); + // TODO(Triang3l): Resolution scaling. + fsi_const_edram_size_dwords = builder_->makeUintConstant( + xenos::kEdramTileWidthSamples * xenos::kEdramTileHeightSamples * + xenos::kEdramTileCount); + for (uint32_t i = 0; i < 4; ++i) { + fsi_samples_covered[i] = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + main_fsi_sample_mask_, + builder_->makeUintConstant(uint32_t(1) << i)), + const_uint_0_); + } + } + uint32_t color_targets_remaining = color_targets_written; + uint32_t color_target_index; + while (xe::bit_scan_forward(color_targets_remaining, &color_target_index)) { + color_targets_remaining &= ~(UINT32_C(1) << color_target_index); + spv::Id color_variable = output_or_var_fragment_data_[color_target_index]; + spv::Id color = builder_->createLoad(color_variable, spv::NoPrecision); + + // Apply the exponent bias after the alpha test and alpha to coverage + // because they need the unbiased alpha from the shader. 
+ id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantColorExpBias)); + id_vector_temp_.push_back( + builder_->makeIntConstant(int32_t(color_target_index))); + color = builder_->createBinOp( + spv::OpVectorTimesScalar, type_float4_, color, + builder_->createLoad(builder_->createAccessChain( + spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision)); + builder_->addDecoration(color, spv::DecorationNoContraction); + + if (edram_fragment_shader_interlock_) { + // Write the color to the target in the EDRAM only if it was written on the + // shader's execution path, according to the Direct3D 9 rules that games + // rely on. + spv::Id fsi_color_written = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, fsi_color_targets_written, + builder_->makeUintConstant(uint32_t(1) << color_target_index)), + const_uint_0_); + spv::Block& fsi_color_written_if_head = *builder_->getBuildPoint(); + spv::Block& fsi_color_written_if = builder_->makeNewBlock(); + spv::Block& fsi_color_written_if_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(fsi_color_written_if_merge.getId(), + spv::SelectionControlDontFlattenMask); + { + std::unique_ptr<spv::Instruction> rt_written_branch_conditional_op = + std::make_unique<spv::Instruction>(spv::OpBranchConditional); + rt_written_branch_conditional_op->addIdOperand(fsi_color_written); + rt_written_branch_conditional_op->addIdOperand( + fsi_color_written_if.getId()); + rt_written_branch_conditional_op->addIdOperand( + fsi_color_written_if_merge.getId()); + // More likely to write to the render target than not.
+ rt_written_branch_conditional_op->addImmediateOperand(2); + rt_written_branch_conditional_op->addImmediateOperand(1); + builder_->getBuildPoint()->addInstruction( + std::move(rt_written_branch_conditional_op)); + } + fsi_color_written_if.addPredecessor(&fsi_color_written_if_head); + fsi_color_written_if_merge.addPredecessor(&fsi_color_written_if_head); + builder_->setBuildPoint(&fsi_color_written_if); + + // For accessing uint2 arrays of per-render-target data which are passed + // as uint4 arrays due to std140 array element alignment. + spv::Id rt_uint2_index_array = + builder_->makeIntConstant(color_target_index >> 1); + spv::Id rt_uint2_index_element[] = { + builder_->makeIntConstant((color_target_index & 1) << 1), + builder_->makeIntConstant(((color_target_index & 1) << 1) + 1), + }; + + // Load the mask of the bits of the destination color that should be + // preserved (in 32-bit halves), which are 0, 0 if the color is fully + // overwritten, or UINT32_MAX, UINT32_MAX if writing to the target is + // disabled completely. + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramRTKeepMask)); + id_vector_temp_.push_back(rt_uint2_index_array); + id_vector_temp_.push_back(rt_uint2_index_element[0]); + spv::Id rt_keep_mask[2]; + rt_keep_mask[0] = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision); + id_vector_temp_.back() = rt_uint2_index_element[1]; + rt_keep_mask[1] = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision); + + // Check if writing to the render target is not disabled completely. 
+ spv::Id const_uint32_max = builder_->makeUintConstant(UINT32_MAX); + spv::Id rt_write_mask_not_empty = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, + builder_->createBinOp(spv::OpINotEqual, type_bool_, rt_keep_mask[0], + const_uint32_max), + builder_->createBinOp(spv::OpINotEqual, type_bool_, rt_keep_mask[1], + const_uint32_max)); + spv::Block& rt_write_mask_not_empty_if = builder_->makeNewBlock(); + spv::Block& rt_write_mask_not_empty_if_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(rt_write_mask_not_empty_if_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(rt_write_mask_not_empty, + &rt_write_mask_not_empty_if, + &rt_write_mask_not_empty_if_merge); + builder_->setBuildPoint(&rt_write_mask_not_empty_if); + + spv::Id const_int_rt_index = + builder_->makeIntConstant(color_target_index); + + // Load the information about the render target. + + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramRTFormatFlags)); + id_vector_temp_.push_back(const_int_rt_index); + spv::Id rt_format_with_flags = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision); + + spv::Id rt_is_64bpp = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, rt_format_with_flags, + builder_->makeUintConstant( + RenderTargetCache::kPSIColorFormatFlag_64bpp)), + const_uint_0_); + + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramRTBaseDwordsScaled)); + id_vector_temp_.push_back(const_int_rt_index); + // EDRAM addresses are wrapped on the Xenos (modulo the EDRAM size). 
+ spv::Id rt_sample_0_address = builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createBinOp( + spv::OpUMod, type_uint_, + builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision), + builder_->createTriOp(spv::OpSelect, type_uint_, + rt_is_64bpp, main_fsi_offset_64bpp_, + main_fsi_offset_32bpp_)), + fsi_const_edram_size_dwords)); + + // Load the blending parameters for the render target. + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramRTBlendFactorsOps)); + id_vector_temp_.push_back(const_int_rt_index); + spv::Id rt_blend_factors_equations = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision); + + // Check if blending (the blending is not 1 * source + 0 * destination). + spv::Id rt_blend_enabled = builder_->createBinOp( + spv::OpINotEqual, type_bool_, rt_blend_factors_equations, + builder_->makeUintConstant(0x00010001)); + spv::Block& rt_blend_enabled_if = builder_->makeNewBlock(); + spv::Block& rt_blend_enabled_else = builder_->makeNewBlock(); + spv::Block& rt_blend_enabled_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(rt_blend_enabled_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch( + rt_blend_enabled, &rt_blend_enabled_if, &rt_blend_enabled_else); + + // Blending path. + { + builder_->setBuildPoint(&rt_blend_enabled_if); + + // Get various parameters used in blending. 
+ spv::Id rt_color_is_fixed_point = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, rt_format_with_flags, + builder_->makeUintConstant( + RenderTargetCache::kPSIColorFormatFlag_FixedPointColor)), + const_uint_0_); + spv::Id rt_alpha_is_fixed_point = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, rt_format_with_flags, + builder_->makeUintConstant( + RenderTargetCache::kPSIColorFormatFlag_FixedPointAlpha)), + const_uint_0_); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramRTClamp)); + id_vector_temp_.push_back(const_int_rt_index); + spv::Id rt_clamp = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision); + spv::Id rt_clamp_color_min = builder_->smearScalar( + spv::NoPrecision, + builder_->createCompositeExtract(rt_clamp, type_float_, 0), + type_float3_); + spv::Id rt_clamp_alpha_min = + builder_->createCompositeExtract(rt_clamp, type_float_, 1); + spv::Id rt_clamp_color_max = builder_->smearScalar( + spv::NoPrecision, + builder_->createCompositeExtract(rt_clamp, type_float_, 2), + type_float3_); + spv::Id rt_clamp_alpha_max = + builder_->createCompositeExtract(rt_clamp, type_float_, 3); + + spv::Id blend_factor_width = builder_->makeUintConstant(5); + spv::Id blend_equation_width = builder_->makeUintConstant(3); + spv::Id rt_color_source_factor = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, rt_blend_factors_equations, + const_uint_0_, blend_factor_width); + spv::Id rt_color_equation = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, rt_blend_factors_equations, + blend_factor_width, blend_equation_width); + spv::Id rt_color_dest_factor = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, rt_blend_factors_equations, + 
builder_->makeUintConstant(8), blend_factor_width); + spv::Id rt_alpha_source_factor = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, rt_blend_factors_equations, + builder_->makeUintConstant(16), blend_factor_width); + spv::Id rt_alpha_equation = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, rt_blend_factors_equations, + builder_->makeUintConstant(21), blend_equation_width); + spv::Id rt_alpha_dest_factor = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, rt_blend_factors_equations, + builder_->makeUintConstant(24), blend_factor_width); + + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramBlendConstant)); + spv::Id blend_constant_unclamped = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision); + uint_vector_temp_.clear(); + uint_vector_temp_.reserve(3); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + uint_vector_temp_.push_back(2); + spv::Id blend_constant_color_unclamped = + builder_->createRvalueSwizzle(spv::NoPrecision, type_float3_, + blend_constant_unclamped, + uint_vector_temp_); + spv::Id blend_constant_color_clamped = FSI_FlushNaNClampAndInBlending( + blend_constant_color_unclamped, rt_color_is_fixed_point, + rt_clamp_color_min, rt_clamp_color_max); + spv::Id blend_constant_alpha_clamped = FSI_FlushNaNClampAndInBlending( + builder_->createCompositeExtract(blend_constant_unclamped, + type_float_, 3), + rt_alpha_is_fixed_point, rt_clamp_alpha_min, rt_clamp_alpha_max); + + uint_vector_temp_.clear(); + uint_vector_temp_.reserve(3); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + uint_vector_temp_.push_back(2); + spv::Id source_color_unclamped = builder_->createRvalueSwizzle( + spv::NoPrecision, type_float3_, color, uint_vector_temp_); + spv::Id source_color_clamped = FSI_FlushNaNClampAndInBlending( + source_color_unclamped, 
rt_color_is_fixed_point, + rt_clamp_color_min, rt_clamp_color_max); + spv::Id source_alpha_clamped = FSI_FlushNaNClampAndInBlending( + builder_->createCompositeExtract(color, type_float_, 3), + rt_alpha_is_fixed_point, rt_clamp_alpha_min, rt_clamp_alpha_max); + + std::array rt_replace_mask; + for (uint32_t i = 0; i < 2; ++i) { + rt_replace_mask[i] = builder_->createUnaryOp(spv::OpNot, type_uint_, + rt_keep_mask[i]); + } + + // Blend and mask each sample. + for (uint32_t i = 0; i < 4; ++i) { + spv::Block& block_sample_covered = builder_->makeNewBlock(); + spv::Block& block_sample_covered_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_sample_covered_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(fsi_samples_covered[i], + &block_sample_covered, + &block_sample_covered_merge); + builder_->setBuildPoint(&block_sample_covered); + + spv::Id rt_sample_address = + FSI_AddSampleOffset(rt_sample_0_address, i, rt_is_64bpp); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + // First SSBO structure element. + id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(rt_sample_address); + spv::Id rt_access_chain_0 = builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 + ? spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_); + id_vector_temp_.back() = builder_->createBinOp( + spv::OpIAdd, type_int_, rt_sample_address, fsi_const_int_1); + spv::Id rt_access_chain_1 = builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 + ? spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_); + + // Load the destination color. 
+ std::array dest_packed; + dest_packed[0] = + builder_->createLoad(rt_access_chain_0, spv::NoPrecision); + { + spv::Block& block_load_64bpp_head = *builder_->getBuildPoint(); + spv::Block& block_load_64bpp = builder_->makeNewBlock(); + spv::Block& block_load_64bpp_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_load_64bpp_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(rt_is_64bpp, &block_load_64bpp, + &block_load_64bpp_merge); + builder_->setBuildPoint(&block_load_64bpp); + spv::Id dest_packed_64bpp_high = + builder_->createLoad(rt_access_chain_1, spv::NoPrecision); + builder_->createBranch(&block_load_64bpp_merge); + builder_->setBuildPoint(&block_load_64bpp_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(dest_packed_64bpp_high); + id_vector_temp_.push_back(block_load_64bpp.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_load_64bpp_head.getId()); + dest_packed[1] = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + } + std::array dest_unpacked = + FSI_UnpackColor(dest_packed, rt_format_with_flags); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back(dest_unpacked[0]); + id_vector_temp_.push_back(dest_unpacked[1]); + id_vector_temp_.push_back(dest_unpacked[2]); + spv::Id dest_color = builder_->createCompositeConstruct( + type_float3_, id_vector_temp_); + + // Blend the components. 
+ spv::Id result_color = FSI_BlendColorOrAlphaWithUnclampedResult( + rt_color_is_fixed_point, rt_clamp_color_min, rt_clamp_color_max, + source_color_clamped, source_alpha_clamped, dest_color, + dest_unpacked[3], blend_constant_color_clamped, + blend_constant_alpha_clamped, rt_color_equation, + rt_color_source_factor, rt_color_dest_factor); + spv::Id result_alpha = FSI_BlendColorOrAlphaWithUnclampedResult( + rt_alpha_is_fixed_point, rt_clamp_alpha_min, rt_clamp_alpha_max, + spv::NoResult, source_alpha_clamped, spv::NoResult, + dest_unpacked[3], spv::NoResult, blend_constant_alpha_clamped, + rt_alpha_equation, rt_alpha_source_factor, + rt_alpha_dest_factor); + + // Pack and store the result. + // Bypass the `getNumTypeConstituents(typeId) == + // (int)constituents.size()` assertion in createCompositeConstruct, + // OpCompositeConstruct can construct vectors not only from scalars, + // but also from other vectors. + spv::Id result_float4; + { + std::unique_ptr result_composite_construct_op = + std::make_unique(builder_->getUniqueId(), + type_float4_, + spv::OpCompositeConstruct); + result_composite_construct_op->addIdOperand(result_color); + result_composite_construct_op->addIdOperand(result_alpha); + result_float4 = result_composite_construct_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(result_composite_construct_op)); + } + std::array result_packed = + FSI_ClampAndPackColor(result_float4, rt_format_with_flags); + builder_->createStore( + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + dest_packed[0], rt_keep_mask[0]), + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + result_packed[0], + rt_replace_mask[0])), + rt_access_chain_0); + spv::Block& block_store_64bpp = builder_->makeNewBlock(); + spv::Block& block_store_64bpp_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_store_64bpp_merge.getId(), + spv::SelectionControlDontFlattenMask); + 
builder_->createConditionalBranch(rt_is_64bpp, &block_store_64bpp, + &block_store_64bpp_merge); + builder_->setBuildPoint(&block_store_64bpp); + builder_->createStore( + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + dest_packed[1], rt_keep_mask[1]), + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + result_packed[1], + rt_replace_mask[1])), + rt_access_chain_0); + builder_->createBranch(&block_store_64bpp_merge); + builder_->setBuildPoint(&block_store_64bpp_merge); + + builder_->createBranch(&block_sample_covered_merge); + builder_->setBuildPoint(&block_sample_covered_merge); + } + + builder_->createBranch(&rt_blend_enabled_merge); + } + + // Non-blending paths. + { + builder_->setBuildPoint(&rt_blend_enabled_else); + + // Pack the new color for all samples. + std::array color_packed = + FSI_ClampAndPackColor(color, rt_format_with_flags); + + // Check if need to load the original contents. + spv::Id rt_keep_mask_not_empty = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, + builder_->createBinOp(spv::OpINotEqual, type_bool_, + rt_keep_mask[0], const_uint_0_), + builder_->createBinOp(spv::OpINotEqual, type_bool_, + rt_keep_mask[1], const_uint_0_)); + spv::Block& rt_keep_mask_not_empty_if = builder_->makeNewBlock(); + spv::Block& rt_keep_mask_not_empty_if_else = builder_->makeNewBlock(); + spv::Block& rt_keep_mask_not_empty_if_merge = + builder_->makeNewBlock(); + SpirvCreateSelectionMerge(rt_keep_mask_not_empty_if_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(rt_keep_mask_not_empty, + &rt_keep_mask_not_empty_if, + &rt_keep_mask_not_empty_if_else); + + // Loading and masking path. 
+ { + builder_->setBuildPoint(&rt_keep_mask_not_empty_if); + std::array color_packed_masked; + for (uint32_t i = 0; i < 2; ++i) { + color_packed_masked[i] = builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, color_packed[i], + builder_->createUnaryOp(spv::OpNot, type_uint_, + rt_keep_mask[i])); + } + for (uint32_t i = 0; i < 4; ++i) { + spv::Block& block_sample_covered = builder_->makeNewBlock(); + spv::Block& block_sample_covered_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_sample_covered_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(fsi_samples_covered[i], + &block_sample_covered, + &block_sample_covered_merge); + builder_->setBuildPoint(&block_sample_covered); + spv::Id rt_sample_address = + FSI_AddSampleOffset(rt_sample_0_address, i, rt_is_64bpp); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + // First SSBO structure element. + id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(rt_sample_address); + spv::Id rt_access_chain_0 = builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 + ? 
spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_); + builder_->createStore( + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, + builder_->createLoad(rt_access_chain_0, + spv::NoPrecision), + rt_keep_mask[0]), + color_packed_masked[0]), + rt_access_chain_0); + spv::Block& block_store_64bpp = builder_->makeNewBlock(); + spv::Block& block_store_64bpp_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_store_64bpp_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(rt_is_64bpp, &block_store_64bpp, + &block_store_64bpp_merge); + builder_->setBuildPoint(&block_store_64bpp); + id_vector_temp_.back() = builder_->createBinOp( + spv::OpIAdd, type_int_, rt_sample_address, fsi_const_int_1); + spv::Id rt_access_chain_1 = builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 + ? spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_); + builder_->createStore( + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, + builder_->createLoad(rt_access_chain_1, + spv::NoPrecision), + rt_keep_mask[1]), + color_packed_masked[1]), + rt_access_chain_1); + builder_->createBranch(&block_store_64bpp_merge); + builder_->setBuildPoint(&block_store_64bpp_merge); + builder_->createBranch(&block_sample_covered_merge); + builder_->setBuildPoint(&block_sample_covered_merge); + } + builder_->createBranch(&rt_keep_mask_not_empty_if_merge); + } + + // Fully overwriting path. 
+ { + builder_->setBuildPoint(&rt_keep_mask_not_empty_if_else); + for (uint32_t i = 0; i < 4; ++i) { + spv::Block& block_sample_covered = builder_->makeNewBlock(); + spv::Block& block_sample_covered_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_sample_covered_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(fsi_samples_covered[i], + &block_sample_covered, + &block_sample_covered_merge); + builder_->setBuildPoint(&block_sample_covered); + spv::Id rt_sample_address = + FSI_AddSampleOffset(rt_sample_0_address, i, rt_is_64bpp); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + // First SSBO structure element. + id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(rt_sample_address); + builder_->createStore(color_packed[0], + builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 + ? spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_)); + spv::Block& block_store_64bpp = builder_->makeNewBlock(); + spv::Block& block_store_64bpp_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_store_64bpp_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(rt_is_64bpp, &block_store_64bpp, + &block_store_64bpp_merge); + builder_->setBuildPoint(&block_store_64bpp); + id_vector_temp_.back() = builder_->createBinOp( + spv::OpIAdd, type_int_, id_vector_temp_.back(), + fsi_const_int_1); + builder_->createStore(color_packed[1], + builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 + ? 
spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_)); + builder_->createBranch(&block_store_64bpp_merge); + builder_->setBuildPoint(&block_store_64bpp_merge); + builder_->createBranch(&block_sample_covered_merge); + builder_->setBuildPoint(&block_sample_covered_merge); + } + builder_->createBranch(&rt_keep_mask_not_empty_if_merge); + } + + builder_->setBuildPoint(&rt_keep_mask_not_empty_if_merge); + builder_->createBranch(&rt_blend_enabled_merge); + } + + builder_->setBuildPoint(&rt_blend_enabled_merge); + builder_->createBranch(&rt_write_mask_not_empty_if_merge); + builder_->setBuildPoint(&rt_write_mask_not_empty_if_merge); + builder_->createBranch(&fsi_color_written_if_merge); + builder_->setBuildPoint(&fsi_color_written_if_merge); + } else { + // Convert to gamma space - this is incorrect, since it must be done + // after blending on the Xbox 360, but this is just one of many blending + // issues in the host render target path. + // TODO(Triang3l): Gamma as sRGB check. 
+ uint_vector_temp_.clear(); + uint_vector_temp_.reserve(3); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + uint_vector_temp_.push_back(2); + spv::Id color_rgb = builder_->createRvalueSwizzle( + spv::NoPrecision, type_float3_, color, uint_vector_temp_); + spv::Id is_gamma = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_ConvertColor0ToGamma + << color_target_index)), + const_uint_0_); + spv::Block& block_gamma_head = *builder_->getBuildPoint(); + spv::Block& block_gamma = builder_->makeNewBlock(); + spv::Block& block_gamma_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_gamma_merge.getId()); + builder_->createConditionalBranch(is_gamma, &block_gamma, + &block_gamma_merge); + builder_->setBuildPoint(&block_gamma); + spv::Id color_rgb_gamma = LinearToPWLGamma(color_rgb, false); + builder_->createBranch(&block_gamma_merge); + builder_->setBuildPoint(&block_gamma_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(color_rgb_gamma); + id_vector_temp_.push_back(block_gamma.getId()); + id_vector_temp_.push_back(color_rgb); + id_vector_temp_.push_back(block_gamma_head.getId()); + color_rgb = + builder_->createOp(spv::OpPhi, type_float3_, id_vector_temp_); + { + std::unique_ptr color_rgba_shuffle_op = + std::make_unique( + builder_->getUniqueId(), type_float4_, spv::OpVectorShuffle); + color_rgba_shuffle_op->addIdOperand(color_rgb); + color_rgba_shuffle_op->addIdOperand(color); + color_rgba_shuffle_op->addImmediateOperand(0); + color_rgba_shuffle_op->addImmediateOperand(1); + color_rgba_shuffle_op->addImmediateOperand(2); + color_rgba_shuffle_op->addImmediateOperand(3 + 3); + color = color_rgba_shuffle_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(color_rgba_shuffle_op)); + } + + builder_->createStore(color, color_variable); + 
} + } + } + + if (edram_fragment_shader_interlock_) { + if (block_fsi_if_after_depth_stencil_merge) { + builder_->createBranch(block_fsi_if_after_depth_stencil_merge); + builder_->setBuildPoint(block_fsi_if_after_depth_stencil_merge); + } + + if (block_fsi_if_after_kill_merge) { + builder_->createBranch(block_fsi_if_after_kill_merge); + builder_->setBuildPoint(block_fsi_if_after_kill_merge); + } + + if (FSI_IsDepthStencilEarly()) { + builder_->createBranch(main_fsi_early_depth_stencil_execute_quad_merge_); + builder_->setBuildPoint(main_fsi_early_depth_stencil_execute_quad_merge_); + } + + builder_->createNoResultOp(spv::OpEndInvocationInterlockEXT); + } +} + +spv::Id SpirvShaderTranslator::LoadMsaaSamplesFromFlags() { + return builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_MsaaSamples_Shift), + builder_->makeUintConstant(2)); +} + +void SpirvShaderTranslator::FSI_LoadSampleMask(spv::Id msaa_samples) { + // On the Xbox 360, 2x MSAA doubles the storage height, 4x MSAA doubles the + // storage width. + // Vulkan standard 2x samples are bottom, top. + // Vulkan standard 4x samples are TL, TR, BL, BR. + // Remap to T, B for 2x, and to TL, BL, TR, BR for 4x. + // 2x corresponds to 1, 0 with native 2x MSAA on Vulkan, 0, 3 with 2x as 4x. + // 4x corresponds to 0, 2, 1, 3 on Vulkan. 
+ + spv::Id const_uint_1 = builder_->makeUintConstant(1); + spv::Id const_uint_2 = builder_->makeUintConstant(2); + + assert_true(input_sample_mask_ != spv::NoResult); + id_vector_temp_.clear(); + id_vector_temp_.push_back(const_int_0_); + spv::Id input_sample_mask_value = builder_->createUnaryOp( + spv::OpBitcast, type_uint_, + builder_->createLoad( + builder_->createAccessChain(spv::StorageClassInput, + input_sample_mask_, id_vector_temp_), + spv::NoPrecision)); + + spv::Block& block_msaa_head = *builder_->getBuildPoint(); + spv::Block& block_msaa_1x = builder_->makeNewBlock(); + spv::Block& block_msaa_2x = builder_->makeNewBlock(); + spv::Block& block_msaa_4x = builder_->makeNewBlock(); + spv::Block& block_msaa_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_msaa_merge.getId()); + { + std::unique_ptr msaa_switch_op = + std::make_unique(spv::OpSwitch); + msaa_switch_op->addIdOperand(msaa_samples); + // Make 1x the default. + msaa_switch_op->addIdOperand(block_msaa_1x.getId()); + msaa_switch_op->addImmediateOperand(int32_t(xenos::MsaaSamples::k2X)); + msaa_switch_op->addIdOperand(block_msaa_2x.getId()); + msaa_switch_op->addImmediateOperand(int32_t(xenos::MsaaSamples::k4X)); + msaa_switch_op->addIdOperand(block_msaa_4x.getId()); + builder_->getBuildPoint()->addInstruction(std::move(msaa_switch_op)); + } + block_msaa_1x.addPredecessor(&block_msaa_head); + block_msaa_2x.addPredecessor(&block_msaa_head); + block_msaa_4x.addPredecessor(&block_msaa_head); + + // 1x MSAA - pass input_sample_mask_value through. + builder_->setBuildPoint(&block_msaa_1x); + builder_->createBranch(&block_msaa_merge); + + // 2x MSAA. + builder_->setBuildPoint(&block_msaa_2x); + spv::Id sample_mask_2x; + if (native_2x_msaa_no_attachments_) { + // 1 and 0 to 0 and 1. 
+ sample_mask_2x = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, + builder_->createUnaryOp(spv::OpBitReverse, type_uint_, + input_sample_mask_value), + builder_->makeUintConstant(32 - 2)); + } else { + // 0 and 3 to 0 and 1. + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(input_sample_mask_value); + id_vector_temp_.push_back(builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, input_sample_mask_value, + const_uint_2, const_uint_1)); + id_vector_temp_.push_back(const_uint_1); + id_vector_temp_.push_back(builder_->makeUintConstant(32 - 1)); + sample_mask_2x = + builder_->createOp(spv::OpBitFieldInsert, type_uint_, id_vector_temp_); + } + builder_->createBranch(&block_msaa_merge); + + // 4x MSAA. + builder_->setBuildPoint(&block_msaa_4x); + // Flip samples in bits 0:1 by reversing the whole coverage mask and inserting + // the reversing bits. + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(input_sample_mask_value); + id_vector_temp_.push_back(builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, + builder_->createUnaryOp(spv::OpBitReverse, type_uint_, + input_sample_mask_value), + builder_->makeUintConstant(32 - 1 - 2))); + id_vector_temp_.push_back(const_uint_1); + id_vector_temp_.push_back(const_uint_2); + spv::Id sample_mask_4x = + builder_->createOp(spv::OpBitFieldInsert, type_uint_, id_vector_temp_); + builder_->createBranch(&block_msaa_merge); + + // Select the result depending on the MSAA sample count. 
+ builder_->setBuildPoint(&block_msaa_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 3); + id_vector_temp_.push_back(input_sample_mask_value); + id_vector_temp_.push_back(block_msaa_1x.getId()); + id_vector_temp_.push_back(sample_mask_2x); + id_vector_temp_.push_back(block_msaa_2x.getId()); + id_vector_temp_.push_back(sample_mask_4x); + id_vector_temp_.push_back(block_msaa_4x.getId()); + main_fsi_sample_mask_ = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); +} + +void SpirvShaderTranslator::FSI_LoadEdramOffsets(spv::Id msaa_samples) { + // Convert the floating-point pixel coordinates to integer sample 0 + // coordinates. + assert_true(input_fragment_coordinates_ != spv::NoResult); + spv::Id axes_have_two_msaa_samples[2]; + spv::Id sample_coordinates[2]; + spv::Id const_uint_1 = builder_->makeUintConstant(1); + for (uint32_t i = 0; i < 2; ++i) { + spv::Id axis_has_two_msaa_samples = builder_->createBinOp( + spv::OpUGreaterThanEqual, type_bool_, msaa_samples, + builder_->makeUintConstant( + uint32_t(i ? xenos::MsaaSamples::k2X : xenos::MsaaSamples::k4X))); + axes_have_two_msaa_samples[i] = axis_has_two_msaa_samples; + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeIntConstant(int32_t(i))); + sample_coordinates[i] = builder_->createBinOp( + spv::OpShiftLeftLogical, type_uint_, + builder_->createUnaryOp( + spv::OpConvertFToU, type_uint_, + builder_->createLoad( + builder_->createAccessChain(spv::StorageClassInput, + input_fragment_coordinates_, + id_vector_temp_), + spv::NoPrecision)), + builder_->createTriOp(spv::OpSelect, type_uint_, + axis_has_two_msaa_samples, const_uint_1, + const_uint_0_)); + } + + // Get 40 x 16 x resolution scale 32bpp half-tile or 40x16 64bpp tile index. + // Working with 40x16-sample portions for 64bpp and for swapping for depth - + // dividing by 40, not by 80. + // TODO(Triang3l): Resolution scaling. 
+ uint32_t tile_width = xenos::kEdramTileWidthSamples; + spv::Id const_tile_half_width = builder_->makeUintConstant(tile_width >> 1); + uint32_t tile_height = xenos::kEdramTileHeightSamples; + spv::Id const_tile_height = builder_->makeUintConstant(tile_height); + spv::Id tile_half_index[2], tile_half_sample_coordinates[2]; + for (uint32_t i = 0; i < 2; ++i) { + spv::Id sample_x_or_y = sample_coordinates[i]; + spv::Id tile_half_width_or_height = + i ? const_tile_height : const_tile_half_width; + tile_half_index[i] = builder_->createBinOp( + spv::OpUDiv, type_uint_, sample_x_or_y, tile_half_width_or_height); + tile_half_sample_coordinates[i] = builder_->createBinOp( + spv::OpUMod, type_uint_, sample_x_or_y, tile_half_width_or_height); + } + + // Convert the Y sample 0 position within the half-tile or tile to the dword + // offset of the row within a 80x16 32bpp tile or a 40x16 64bpp half-tile. + spv::Id const_tile_width = builder_->makeUintConstant(tile_width); + spv::Id row_offset_in_tile_at_32bpp = + builder_->createBinOp(spv::OpIMul, type_uint_, + tile_half_sample_coordinates[1], const_tile_width); + + // Multiply the Y tile position by the surface tile pitch in dwords at 32bpp + // to get the address of the origin of the row of tiles within a 32bpp surface + // in dwords (later it needs to be multiplied by 2 for 64bpp). + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeIntConstant( + kSystemConstantEdram32bppTilePitchDwordsScaled)); + spv::Id tile_row_offset_at_32bpp = builder_->createBinOp( + spv::OpIMul, type_uint_, + builder_->createLoad(builder_->createAccessChain( + spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision), + tile_half_index[1]); + + uint32_t tile_size = tile_width * tile_height; + spv::Id const_tile_size = builder_->makeUintConstant(tile_size); + + // Get the dword offset of the sample 0 in the first half-tile in the tile + // within a 32bpp surface. 
+ spv::Id offset_in_first_tile_half_at_32bpp = builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp( + spv::OpIAdd, type_uint_, tile_row_offset_at_32bpp, + builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp( + spv::OpIMul, type_uint_, const_tile_size, + builder_->createBinOp(spv::OpShiftRightLogical, type_uint_, + tile_half_index[0], const_uint_1)), + row_offset_in_tile_at_32bpp)), + tile_half_sample_coordinates[0]); + + // Get whether the sample is in the second half-tile in a 32bpp surface. + spv::Id is_second_tile_half = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, tile_half_index[0], + const_uint_1), + const_uint_0_); + + // Get the offset of the sample 0 within a depth / stencil surface, with + // samples 40...79 in the first half-tile, 0...39 in the second (flipped as + // opposed to color). Then add the EDRAM base for depth / stencil, and wrap + // addressing. + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramDepthBaseDwordsScaled)); + main_fsi_address_depth_ = builder_->createBinOp( + spv::OpUMod, type_uint_, + builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createLoad(builder_->createAccessChain( + spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision), + builder_->createBinOp( + spv::OpIAdd, type_uint_, offset_in_first_tile_half_at_32bpp, + builder_->createTriOp(spv::OpSelect, type_uint_, + is_second_tile_half, const_uint_0_, + const_tile_half_width))), + builder_->makeUintConstant(tile_size * xenos::kEdramTileCount)); + + if (current_shader().writes_color_targets()) { + // Get the offset of the sample 0 within a 32bpp surface, with samples + // 0...39 in the first half-tile, 40...79 in the second. 
+ main_fsi_offset_32bpp_ = builder_->createBinOp( + spv::OpIAdd, type_uint_, offset_in_first_tile_half_at_32bpp, + builder_->createTriOp(spv::OpSelect, type_uint_, is_second_tile_half, + const_tile_half_width, const_uint_0_)); + + // Get the offset of the sample 0 within a 64bpp surface. + main_fsi_offset_64bpp_ = builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp(spv::OpShiftLeftLogical, type_uint_, + tile_row_offset_at_32bpp, const_uint_1), + builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp(spv::OpIMul, type_uint_, const_tile_size, + tile_half_index[0]), + row_offset_in_tile_at_32bpp)), + builder_->createBinOp(spv::OpShiftLeftLogical, type_uint_, + tile_half_sample_coordinates[0], const_uint_1)); + } +} + +spv::Id SpirvShaderTranslator::FSI_AddSampleOffset(spv::Id sample_0_address, + uint32_t sample_index, + spv::Id is_64bpp) { + if (!sample_index) { + return sample_0_address; + } + spv::Id sample_offset; + // TODO(Triang3l): Resolution scaling. 
+ uint32_t tile_width = xenos::kEdramTileWidthSamples; + if (sample_index == 1) { + sample_offset = builder_->makeIntConstant(tile_width); + } else { + spv::Id sample_offset_32bpp = builder_->makeIntConstant( + tile_width * (sample_index & 1) + (sample_index >> 1)); + if (is_64bpp != spv::NoResult) { + sample_offset = builder_->createTriOp( + spv::OpSelect, type_int_, is_64bpp, + builder_->makeIntConstant(tile_width * (sample_index & 1) + + 2 * (sample_index >> 1)), + sample_offset_32bpp); + } else { + sample_offset = sample_offset_32bpp; + } + } + return builder_->createBinOp(spv::OpIAdd, type_int_, sample_0_address, + sample_offset); +} + +void SpirvShaderTranslator::FSI_DepthStencilTest( + spv::Id msaa_samples, bool sample_mask_potentially_narrowed_previouly) { + bool is_early = FSI_IsDepthStencilEarly(); + bool implicit_early_z_write_allowed = + current_shader().implicit_early_z_write_allowed(); + spv::Id const_uint_1 = builder_->makeUintConstant(1); + spv::Id const_uint_8 = builder_->makeUintConstant(8); + + // Check if depth or stencil testing is needed. + spv::Id depth_stencil_enabled = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_FSIDepthStencil)), + const_uint_0_); + spv::Block& block_depth_stencil_enabled_head = *builder_->getBuildPoint(); + spv::Block& block_depth_stencil_enabled = builder_->makeNewBlock(); + spv::Block& block_depth_stencil_enabled_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_depth_stencil_enabled_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(depth_stencil_enabled, + &block_depth_stencil_enabled, + &block_depth_stencil_enabled_merge); + builder_->setBuildPoint(&block_depth_stencil_enabled); + + // Load the depth in the center of the pixel and calculate the derivatives of + // the depth outside non-uniform control flow. 
+ assert_true(input_fragment_coordinates_ != spv::NoResult); + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeIntConstant(2)); + spv::Id center_depth32_unbiased = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassInput, + input_fragment_coordinates_, id_vector_temp_), + spv::NoPrecision); + builder_->addCapability(spv::CapabilityDerivativeControl); + std::array depth_dxy; + depth_dxy[0] = builder_->createUnaryOp(spv::OpDPdxCoarse, type_float_, + center_depth32_unbiased); + depth_dxy[1] = builder_->createUnaryOp(spv::OpDPdyCoarse, type_float_, + center_depth32_unbiased); + + // Skip everything if potentially discarded all the samples previously in the + // shader. + spv::Block* block_any_sample_covered_head = nullptr; + spv::Block* block_any_sample_covered = nullptr; + spv::Block* block_any_sample_covered_merge = nullptr; + if (sample_mask_potentially_narrowed_previouly) { + spv::Id any_sample_covered = builder_->createBinOp( + spv::OpINotEqual, type_bool_, main_fsi_sample_mask_, const_uint_0_); + block_any_sample_covered_head = builder_->getBuildPoint(); + block_any_sample_covered = &builder_->makeNewBlock(); + block_any_sample_covered_merge = &builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_any_sample_covered_merge->getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(any_sample_covered, + block_any_sample_covered, + block_any_sample_covered_merge); + builder_->setBuildPoint(block_any_sample_covered); + } + + // Load values involved in depth and stencil testing. 
+ spv::Id msaa_is_2x_4x = builder_->createBinOp( + spv::OpUGreaterThanEqual, type_bool_, msaa_samples, + builder_->makeUintConstant(uint32_t(xenos::MsaaSamples::k2X))); + spv::Id msaa_is_4x = builder_->createBinOp( + spv::OpUGreaterThanEqual, type_bool_, msaa_samples, + builder_->makeUintConstant(uint32_t(xenos::MsaaSamples::k4X))); + spv::Id depth_is_float24 = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_DepthFloat24)), + const_uint_0_); + spv::Id depth_pass_if_less = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_FSIDepthPassIfLess)), + const_uint_0_); + spv::Id depth_pass_if_equal = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_FSIDepthPassIfEqual)), + const_uint_0_); + spv::Id depth_pass_if_greater = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_FSIDepthPassIfGreater)), + const_uint_0_); + spv::Id depth_write = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_FSIDepthWrite)), + const_uint_0_); + spv::Id stencil_enabled = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(kSysFlag_FSIStencilTest)), + const_uint_0_); + spv::Id early_write = + (is_early && implicit_early_z_write_allowed) + ? 
builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + main_system_constant_flags_, + builder_->makeUintConstant( + kSysFlag_FSIDepthStencilEarlyWrite)), + const_uint_0_) + : spv::NoResult; + spv::Id not_early_write = + (is_early && implicit_early_z_write_allowed) + ? builder_->createUnaryOp(spv::OpLogicalNot, type_bool_, early_write) + : spv::NoResult; + assert_true(input_front_facing_ != spv::NoResult); + spv::Id front_facing = + builder_->createLoad(input_front_facing_, spv::NoPrecision); + spv::Id poly_offset_scale, poly_offset_offset, stencil_parameters; + { + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramPolyOffsetFrontScale)); + spv::Id poly_offset_front_scale = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision); + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramPolyOffsetBackScale)); + spv::Id poly_offset_back_scale = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision); + poly_offset_scale = + builder_->createTriOp(spv::OpSelect, type_float_, front_facing, + poly_offset_front_scale, poly_offset_back_scale); + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramPolyOffsetFrontOffset)); + spv::Id poly_offset_front_offset = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision); + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramPolyOffsetBackOffset)); + spv::Id poly_offset_back_offset = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + 
spv::NoPrecision); + poly_offset_offset = builder_->createTriOp( + spv::OpSelect, type_float_, front_facing, poly_offset_front_offset, + poly_offset_back_offset); + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramStencilFront)); + spv::Id stencil_parameters_front = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision); + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantEdramStencilBack)); + spv::Id stencil_parameters_back = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision); + stencil_parameters = builder_->createTriOp( + spv::OpSelect, type_uint2_, + builder_->smearScalar(spv::NoPrecision, front_facing, type_bool2_), + stencil_parameters_front, stencil_parameters_back); + } + spv::Id stencil_reference_masks = + builder_->createCompositeExtract(stencil_parameters, type_uint_, 0); + spv::Id stencil_reference = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, stencil_reference_masks, + const_uint_0_, const_uint_8); + spv::Id stencil_read_mask = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, stencil_reference_masks, + const_uint_8, const_uint_8); + spv::Id stencil_reference_read_masked = builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, stencil_reference, stencil_read_mask); + spv::Id stencil_write_mask = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, stencil_reference_masks, + builder_->makeUintConstant(16), const_uint_8); + spv::Id stencil_write_keep_mask = + builder_->createUnaryOp(spv::OpNot, type_uint_, stencil_write_mask); + spv::Id stencil_func_ops = + builder_->createCompositeExtract(stencil_parameters, type_uint_, 1); + spv::Id stencil_pass_if_less = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + 
builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, stencil_func_ops, + builder_->makeUintConstant(uint32_t(1) << 0)), + const_uint_0_); + spv::Id stencil_pass_if_equal = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, stencil_func_ops, + builder_->makeUintConstant(uint32_t(1) << 1)), + const_uint_0_); + spv::Id stencil_pass_if_greater = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, stencil_func_ops, + builder_->makeUintConstant(uint32_t(1) << 2)), + const_uint_0_); + + // Get the maximum depth slope for the polygon offset. + // https://docs.microsoft.com/en-us/windows/desktop/direct3d9/depth-bias + std::array depth_dxy_abs; + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(depth_dxy[i]); + depth_dxy_abs[i] = builder_->createBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450FAbs, id_vector_temp_); + } + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(depth_dxy_abs[0]); + id_vector_temp_.push_back(depth_dxy_abs[1]); + spv::Id depth_max_slope = builder_->createBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450FMax, id_vector_temp_); + // Calculate the polygon offset. + spv::Id slope_scaled_poly_offset = builder_->createBinOp( + spv::OpFMul, type_float_, poly_offset_scale, depth_max_slope); + builder_->addDecoration(slope_scaled_poly_offset, + spv::DecorationNoContraction); + spv::Id poly_offset = builder_->createBinOp( + spv::OpFAdd, type_float_, slope_scaled_poly_offset, poly_offset_offset); + builder_->addDecoration(poly_offset, spv::DecorationNoContraction); + // Apply the post-clip and post-viewport polygon offset to the fragment's + // depth. 
Not clamping yet as this is at the center, which is not necessarily + // covered and not necessarily inside the bounds - derivatives scaled by + // sample locations will be added to this value, and it must be linear. + spv::Id center_depth32_biased = builder_->createBinOp( + spv::OpFAdd, type_float_, center_depth32_unbiased, poly_offset); + builder_->addDecoration(center_depth32_biased, spv::DecorationNoContraction); + + // Perform depth and stencil testing for each covered sample. + spv::Id new_sample_mask = main_fsi_sample_mask_; + std::array late_write_depth_stencil{}; + for (uint32_t i = 0; i < 4; ++i) { + spv::Id sample_covered = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, new_sample_mask, + builder_->makeUintConstant(uint32_t(1) << i)), + const_uint_0_); + spv::Block& block_sample_covered_head = *builder_->getBuildPoint(); + spv::Block& block_sample_covered = builder_->makeNewBlock(); + spv::Block& block_sample_covered_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_sample_covered_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(sample_covered, &block_sample_covered, + &block_sample_covered_merge); + builder_->setBuildPoint(&block_sample_covered); + + // Load the original depth and stencil for the sample. + spv::Id sample_address = FSI_AddSampleOffset(main_fsi_address_depth_, i); id_vector_temp_.clear(); id_vector_temp_.reserve(2); - id_vector_temp_.push_back( - builder_->makeIntConstant(kSystemConstantColorExpBias)); - id_vector_temp_.push_back( - builder_->makeIntConstant(int32_t(color_target_index))); - color = builder_->createBinOp( - spv::OpVectorTimesScalar, type_float4_, color, - builder_->createLoad(builder_->createAccessChain( - spv::StorageClassUniform, - uniform_system_constants_, id_vector_temp_), - spv::NoPrecision)); - builder_->addDecoration(color, spv::DecorationNoContraction); + // First SSBO structure element. 
+ id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(sample_address); + spv::Id sample_access_chain = builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 ? spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_); + spv::Id old_depth_stencil = + builder_->createLoad(sample_access_chain, spv::NoPrecision); - // Convert to gamma space - this is incorrect, since it must be done after - // blending on the Xbox 360, but this is just one of many blending issues in - // the host render target path. - // TODO(Triang3l): Gamma as sRGB check. - spv::Id color_rgb; - { - std::unique_ptr color_rgb_shuffle_op = - std::make_unique( - builder_->getUniqueId(), type_float3_, spv::OpVectorShuffle); - color_rgb_shuffle_op->addIdOperand(color); - color_rgb_shuffle_op->addIdOperand(color); - color_rgb_shuffle_op->addImmediateOperand(0); - color_rgb_shuffle_op->addImmediateOperand(1); - color_rgb_shuffle_op->addImmediateOperand(2); - color_rgb = color_rgb_shuffle_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(color_rgb_shuffle_op)); + // Calculate the new depth at the sample. + // interpolateAtSample(gl_FragCoord) is not valid in GLSL because + // gl_FragCoord is not an interpolator, calculating the depths at the + // samples manually. + std::array sample_location; + switch (i) { + case 0: { + // Center sample for no MSAA. + // Top-left sample for native 2x (top - 1 in Vulkan), 2x as 4x, 4x + // (0 in Vulkan). + // 4x on the host case. + for (uint32_t j = 0; j < 2; ++j) { + sample_location[j] = builder_->makeFloatConstant( + draw_util::kD3D10StandardSamplePositions4x[0][j] * + (1.0f / 16.0f)); + } + if (native_2x_msaa_no_attachments_) { + // 2x on the host case. 
+ for (uint32_t j = 0; j < 2; ++j) { + sample_location[j] = builder_->createTriOp( + spv::OpSelect, type_float_, msaa_is_4x, sample_location[j], + builder_->makeFloatConstant( + draw_util::kD3D10StandardSamplePositions2x[1][j] * + (1.0f / 16.0f))); + } + } + // 1x case. + for (uint32_t j = 0; j < 2; ++j) { + sample_location[j] = + builder_->createTriOp(spv::OpSelect, type_float_, msaa_is_2x_4x, + sample_location[j], const_float_0_); + } + } break; + case 1: { + // For guest 2x: bottom-right sample (bottom - 0 in Vulkan - for native + // 2x, bottom-right - 3 in Vulkan - for 2x as 4x). + // For guest 4x: bottom-left sample (2 in Vulkan). + for (uint32_t j = 0; j < 2; ++j) { + sample_location[j] = builder_->createTriOp( + spv::OpSelect, type_float_, msaa_is_4x, + builder_->makeFloatConstant( + draw_util::kD3D10StandardSamplePositions4x[2][j] * + (1.0f / 16.0f)), + builder_->makeFloatConstant( + (native_2x_msaa_no_attachments_ + ? draw_util::kD3D10StandardSamplePositions2x[0][j] + : draw_util::kD3D10StandardSamplePositions4x[3][j]) * + (1.0f / 16.0f))); + } + } break; + default: { + // Xenia samples 2 and 3 (top-right and bottom-right) -> Vulkan samples + // 1 and 3. 
+ const int8_t* sample_location_int = draw_util:: + kD3D10StandardSamplePositions4x[i ^ (((i & 1) ^ (i >> 1)) * 0b11)]; + for (uint32_t j = 0; j < 2; ++j) { + sample_location[j] = builder_->makeFloatConstant( + sample_location_int[j] * (1.0f / 16.0f)); + } + } break; } - spv::Id is_gamma = builder_->createBinOp( - spv::OpINotEqual, type_bool_, + std::array sample_depth_dxy; + for (uint32_t j = 0; j < 2; ++j) { + sample_depth_dxy[j] = builder_->createBinOp( + spv::OpFMul, type_float_, sample_location[j], depth_dxy[j]); + builder_->addDecoration(sample_depth_dxy[j], + spv::DecorationNoContraction); + } + spv::Id sample_depth32 = builder_->createBinOp( + spv::OpFAdd, type_float_, sample_depth_dxy[0], sample_depth_dxy[1]); + builder_->addDecoration(sample_depth32, spv::DecorationNoContraction); + sample_depth32 = builder_->createBinOp( + spv::OpFAdd, type_float_, center_depth32_biased, sample_depth32); + builder_->addDecoration(sample_depth32, spv::DecorationNoContraction); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back(sample_depth32); + id_vector_temp_.push_back(const_float_0_); + id_vector_temp_.push_back(const_float_1_); + sample_depth32 = builder_->createBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450NClamp, id_vector_temp_); + + // Convert the new depth to 24-bit. + spv::Block& block_depth_format_float = builder_->makeNewBlock(); + spv::Block& block_depth_format_unorm = builder_->makeNewBlock(); + spv::Block& block_depth_format_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_depth_format_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch( + depth_is_float24, &block_depth_format_float, &block_depth_format_unorm); + // Float24 case. 
+ builder_->setBuildPoint(&block_depth_format_float); + spv::Id sample_depth_float24 = SpirvShaderTranslator::PreClampedDepthTo20e4( + *builder_, sample_depth32, true, false, ext_inst_glsl_std_450_); + builder_->createBranch(&block_depth_format_merge); + spv::Block& block_depth_format_float_end = *builder_->getBuildPoint(); + // Unorm24 case. + builder_->setBuildPoint(&block_depth_format_unorm); + // Round to the nearest even integer. This seems to be the correct + // conversion, adding +0.5 and rounding towards zero results in red instead + // of black in the 4D5307E6 clear shader. + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->createBinOp(spv::OpFMul, type_float_, sample_depth32, + builder_->makeFloatConstant(float(0xFFFFFF)))); + builder_->addDecoration(id_vector_temp_.back(), + spv::DecorationNoContraction); + spv::Id sample_depth_unorm24 = builder_->createUnaryOp( + spv::OpConvertFToU, type_uint_, + builder_->createBuiltinCall(type_float_, ext_inst_glsl_std_450_, + GLSLstd450RoundEven, id_vector_temp_)); + builder_->createBranch(&block_depth_format_merge); + spv::Block& block_depth_format_unorm_end = *builder_->getBuildPoint(); + // Merge between the two formats. + builder_->setBuildPoint(&block_depth_format_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(sample_depth_float24); + id_vector_temp_.push_back(block_depth_format_float_end.getId()); + id_vector_temp_.push_back(sample_depth_unorm24); + id_vector_temp_.push_back(block_depth_format_unorm_end.getId()); + spv::Id sample_depth24 = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + + // Perform the depth test. 
+ spv::Id old_depth = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, old_depth_stencil, const_uint_8); + spv::Id depth_passed = builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, depth_pass_if_less, + builder_->createBinOp(spv::OpULessThan, type_bool_, sample_depth24, + old_depth)); + depth_passed = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, depth_passed, builder_->createBinOp( - spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, - builder_->makeUintConstant(kSysFlag_ConvertColor0ToGamma - << color_target_index)), - const_uint_0_); - spv::Block& block_gamma_head = *builder_->getBuildPoint(); - spv::Block& block_gamma = builder_->makeNewBlock(); - spv::Block& block_gamma_merge = builder_->makeNewBlock(); - SpirvCreateSelectionMerge(block_gamma_merge.getId()); - builder_->createConditionalBranch(is_gamma, &block_gamma, - &block_gamma_merge); - builder_->setBuildPoint(&block_gamma); - spv::Id color_rgb_gamma = LinearToPWLGamma(color_rgb, false); - builder_->createBranch(&block_gamma_merge); - builder_->setBuildPoint(&block_gamma_merge); + spv::OpLogicalAnd, type_bool_, depth_pass_if_equal, + builder_->createBinOp(spv::OpIEqual, type_bool_, sample_depth24, + old_depth))); + depth_passed = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, depth_passed, + builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, depth_pass_if_greater, + builder_->createBinOp(spv::OpUGreaterThan, type_bool_, + sample_depth24, old_depth))); + + // Begin the stencil test. 
+ spv::Block& block_stencil_enabled_head = *builder_->getBuildPoint(); + spv::Block& block_stencil_enabled = builder_->makeNewBlock(); + spv::Block& block_stencil_enabled_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_stencil_enabled_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(stencil_enabled, &block_stencil_enabled, + &block_stencil_enabled_merge); + builder_->setBuildPoint(&block_stencil_enabled); + + // Perform the stencil test. + // The read mask has zeros in the upper bits, applying it to the combined + // stencil and depth will remove the depth part. + spv::Id old_stencil_read_masked = builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, old_depth_stencil, stencil_read_mask); + spv::Id stencil_passed_if_enabled = builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, stencil_pass_if_less, + builder_->createBinOp(spv::OpULessThan, type_bool_, + stencil_reference_read_masked, + old_stencil_read_masked)); + stencil_passed_if_enabled = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, stencil_passed_if_enabled, + builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, stencil_pass_if_equal, + builder_->createBinOp(spv::OpIEqual, type_bool_, + stencil_reference_read_masked, + old_stencil_read_masked))); + stencil_passed_if_enabled = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, stencil_passed_if_enabled, + builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, stencil_pass_if_greater, + builder_->createBinOp(spv::OpUGreaterThan, type_bool_, + stencil_reference_read_masked, + old_stencil_read_masked))); + spv::Id stencil_op = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, stencil_func_ops, + builder_->createTriOp( + spv::OpSelect, type_uint_, stencil_passed_if_enabled, + builder_->createTriOp(spv::OpSelect, type_uint_, depth_passed, + builder_->makeUintConstant(6), + builder_->makeUintConstant(9)), + builder_->makeUintConstant(3)), + 
builder_->makeUintConstant(3)); + spv::Block& block_stencil_op_head = *builder_->getBuildPoint(); + spv::Block& block_stencil_op_keep = builder_->makeNewBlock(); + spv::Block& block_stencil_op_zero = builder_->makeNewBlock(); + spv::Block& block_stencil_op_replace = builder_->makeNewBlock(); + spv::Block& block_stencil_op_increment_clamp = builder_->makeNewBlock(); + spv::Block& block_stencil_op_decrement_clamp = builder_->makeNewBlock(); + spv::Block& block_stencil_op_invert = builder_->makeNewBlock(); + spv::Block& block_stencil_op_increment_wrap = builder_->makeNewBlock(); + spv::Block& block_stencil_op_decrement_wrap = builder_->makeNewBlock(); + spv::Block& block_stencil_op_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_stencil_op_merge.getId(), + spv::SelectionControlDontFlattenMask); { - std::unique_ptr gamma_phi_op = - std::make_unique(builder_->getUniqueId(), - type_float3_, spv::OpPhi); - gamma_phi_op->addIdOperand(color_rgb_gamma); - gamma_phi_op->addIdOperand(block_gamma.getId()); - gamma_phi_op->addIdOperand(color_rgb); - gamma_phi_op->addIdOperand(block_gamma_head.getId()); - color_rgb = gamma_phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction(std::move(gamma_phi_op)); - } - { - std::unique_ptr color_rgba_shuffle_op = - std::make_unique( - builder_->getUniqueId(), type_float4_, spv::OpVectorShuffle); - color_rgba_shuffle_op->addIdOperand(color_rgb); - color_rgba_shuffle_op->addIdOperand(color); - color_rgba_shuffle_op->addImmediateOperand(0); - color_rgba_shuffle_op->addImmediateOperand(1); - color_rgba_shuffle_op->addImmediateOperand(2); - color_rgba_shuffle_op->addImmediateOperand(3 + 3); - color = color_rgba_shuffle_op->getResultId(); + std::unique_ptr stencil_op_switch_op = + std::make_unique(spv::OpSwitch); + stencil_op_switch_op->addIdOperand(stencil_op); + // Make keep the default. 
+ stencil_op_switch_op->addIdOperand(block_stencil_op_keep.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kZero)); + stencil_op_switch_op->addIdOperand(block_stencil_op_zero.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kReplace)); + stencil_op_switch_op->addIdOperand(block_stencil_op_replace.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kIncrementClamp)); + stencil_op_switch_op->addIdOperand( + block_stencil_op_increment_clamp.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kDecrementClamp)); + stencil_op_switch_op->addIdOperand( + block_stencil_op_decrement_clamp.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kInvert)); + stencil_op_switch_op->addIdOperand(block_stencil_op_invert.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kIncrementWrap)); + stencil_op_switch_op->addIdOperand( + block_stencil_op_increment_wrap.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kDecrementWrap)); + stencil_op_switch_op->addIdOperand( + block_stencil_op_decrement_wrap.getId()); builder_->getBuildPoint()->addInstruction( - std::move(color_rgba_shuffle_op)); + std::move(stencil_op_switch_op)); + } + block_stencil_op_keep.addPredecessor(&block_stencil_op_head); + block_stencil_op_zero.addPredecessor(&block_stencil_op_head); + block_stencil_op_replace.addPredecessor(&block_stencil_op_head); + block_stencil_op_increment_clamp.addPredecessor(&block_stencil_op_head); + block_stencil_op_decrement_clamp.addPredecessor(&block_stencil_op_head); + block_stencil_op_invert.addPredecessor(&block_stencil_op_head); + block_stencil_op_increment_wrap.addPredecessor(&block_stencil_op_head); + block_stencil_op_decrement_wrap.addPredecessor(&block_stencil_op_head); + // Keep - will use the old stencil in the phi. 
+ builder_->setBuildPoint(&block_stencil_op_keep); + builder_->createBranch(&block_stencil_op_merge); + // Zero - will use the zero constant in the phi. + builder_->setBuildPoint(&block_stencil_op_zero); + builder_->createBranch(&block_stencil_op_merge); + // Replace - will use the stencil reference in the phi. + builder_->setBuildPoint(&block_stencil_op_replace); + builder_->createBranch(&block_stencil_op_merge); + // Increment and clamp. + builder_->setBuildPoint(&block_stencil_op_increment_clamp); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(builder_->makeUintConstant(UINT8_MAX - 1)); + id_vector_temp_.push_back( + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, old_depth_stencil, + builder_->makeUintConstant(UINT8_MAX))); + spv::Id new_stencil_in_low_bits_increment_clamp = builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBuiltinCall(type_uint_, ext_inst_glsl_std_450_, + GLSLstd450UMin, id_vector_temp_), + const_uint_1); + builder_->createBranch(&block_stencil_op_merge); + // Decrement and clamp. + builder_->setBuildPoint(&block_stencil_op_decrement_clamp); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(const_uint_1); + id_vector_temp_.push_back( + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, old_depth_stencil, + builder_->makeUintConstant(UINT8_MAX))); + spv::Id new_stencil_in_low_bits_decrement_clamp = builder_->createBinOp( + spv::OpISub, type_uint_, + builder_->createBuiltinCall(type_uint_, ext_inst_glsl_std_450_, + GLSLstd450UMax, id_vector_temp_), + const_uint_1); + builder_->createBranch(&block_stencil_op_merge); + // Invert. + builder_->setBuildPoint(&block_stencil_op_invert); + spv::Id new_stencil_in_low_bits_invert = + builder_->createUnaryOp(spv::OpNot, type_uint_, old_depth_stencil); + builder_->createBranch(&block_stencil_op_merge); + // Increment and wrap. + // The upper bits containing the old depth have no effect on the behavior. 
+ builder_->setBuildPoint(&block_stencil_op_increment_wrap); + spv::Id new_stencil_in_low_bits_increment_wrap = builder_->createBinOp( + spv::OpIAdd, type_uint_, old_depth_stencil, const_uint_1); + builder_->createBranch(&block_stencil_op_merge); + // Decrement and wrap. + // The upper bits containing the old depth have no effect on the behavior. + builder_->setBuildPoint(&block_stencil_op_decrement_wrap); + spv::Id new_stencil_in_low_bits_decrement_wrap = builder_->createBinOp( + spv::OpISub, type_uint_, old_depth_stencil, const_uint_1); + builder_->createBranch(&block_stencil_op_merge); + // Select the new stencil (with undefined data in bits starting from 8) + // based on the stencil operation. + builder_->setBuildPoint(&block_stencil_op_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 8); + id_vector_temp_.push_back(old_depth_stencil); + id_vector_temp_.push_back(block_stencil_op_keep.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_stencil_op_zero.getId()); + id_vector_temp_.push_back(stencil_reference); + id_vector_temp_.push_back(block_stencil_op_replace.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_increment_clamp); + id_vector_temp_.push_back(block_stencil_op_increment_clamp.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_decrement_clamp); + id_vector_temp_.push_back(block_stencil_op_decrement_clamp.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_invert); + id_vector_temp_.push_back(block_stencil_op_invert.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_increment_wrap); + id_vector_temp_.push_back(block_stencil_op_increment_wrap.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_decrement_wrap); + id_vector_temp_.push_back(block_stencil_op_decrement_wrap.getId()); + spv::Id new_stencil_in_low_bits_if_enabled = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + // Merge the old depth / stencil (old depth kept from the old 
depth / + // stencil so the separate old depth register is not needed anymore after + // the depth test) and the new stencil based on the write mask. + spv::Id new_stencil_and_old_depth_if_stencil_enabled = + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + old_depth_stencil, stencil_write_keep_mask), + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + new_stencil_in_low_bits_if_enabled, + stencil_write_mask)); + + // Choose the result based on whether the stencil test was done. + // All phi operations must be the first in the block. + builder_->createBranch(&block_stencil_enabled_merge); + spv::Block& block_stencil_enabled_end = *builder_->getBuildPoint(); + builder_->setBuildPoint(&block_stencil_enabled_merge); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.clear(); + id_vector_temp_.push_back(stencil_passed_if_enabled); + id_vector_temp_.push_back(block_stencil_enabled_end.getId()); + id_vector_temp_.push_back(builder_->makeBoolConstant(true)); + id_vector_temp_.push_back(block_stencil_enabled_head.getId()); + spv::Id stencil_passed = + builder_->createOp(spv::OpPhi, type_bool_, id_vector_temp_); + id_vector_temp_.clear(); + id_vector_temp_.push_back(new_stencil_and_old_depth_if_stencil_enabled); + id_vector_temp_.push_back(block_stencil_enabled_end.getId()); + id_vector_temp_.push_back(old_depth_stencil); + id_vector_temp_.push_back(block_stencil_enabled_head.getId()); + spv::Id new_stencil_and_old_depth = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + + // Check whether the tests have passed, and exclude the bit from the + // coverage if not. 
+ spv::Id depth_stencil_passed = builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, depth_passed, stencil_passed); + spv::Id new_sample_mask_after_sample = builder_->createTriOp( + spv::OpSelect, type_uint_, depth_stencil_passed, new_sample_mask, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, new_sample_mask, + builder_->makeUintConstant(~(uint32_t(1) << i)))); + + // Combine the new depth and the new stencil taking into account whether the + // new depth should be written. + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(new_stencil_and_old_depth); + id_vector_temp_.push_back(sample_depth24); + id_vector_temp_.push_back(const_uint_8); + id_vector_temp_.push_back(builder_->makeUintConstant(24)); + spv::Id new_stencil_and_unconditional_new_depth = + builder_->createOp(spv::OpBitFieldInsert, type_uint_, id_vector_temp_); + spv::Id new_depth_stencil = builder_->createTriOp( + spv::OpSelect, type_uint_, + builder_->createBinOp(spv::OpLogicalAnd, type_bool_, + depth_stencil_passed, depth_write), + new_stencil_and_unconditional_new_depth, new_stencil_and_old_depth); + + // Write (or defer writing if the test is early, but may discard samples + // later still) the new depth and stencil if they're different. 
+ spv::Id new_depth_stencil_different = builder_->createBinOp( + spv::OpINotEqual, type_bool_, new_depth_stencil, old_depth_stencil); + spv::Id new_depth_stencil_write_condition = spv::NoResult; + if (is_early) { + if (implicit_early_z_write_allowed) { + new_sample_mask_after_sample = builder_->createTriOp( + spv::OpSelect, type_uint_, + builder_->createBinOp(spv::OpLogicalAnd, type_bool_, + new_depth_stencil_different, not_early_write), + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, new_sample_mask_after_sample, + builder_->makeUintConstant(uint32_t(1) << (4 + i))), + new_sample_mask_after_sample); + new_depth_stencil_write_condition = + builder_->createBinOp(spv::OpLogicalAnd, type_bool_, + new_depth_stencil_different, early_write); + } else { + // Always need to write late in this shader, as it may do something like + // explicitly killing pixels. + new_sample_mask_after_sample = builder_->createTriOp( + spv::OpSelect, type_uint_, new_depth_stencil_different, + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, new_sample_mask_after_sample, + builder_->makeUintConstant(uint32_t(1) << (4 + i))), + new_sample_mask_after_sample); + } + } else { + new_depth_stencil_write_condition = new_depth_stencil_different; + } + if (new_depth_stencil_write_condition != spv::NoResult) { + spv::Block& block_depth_stencil_write = builder_->makeNewBlock(); + spv::Block& block_depth_stencil_write_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_depth_stencil_write_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(new_depth_stencil_write_condition, + &block_depth_stencil_write, + &block_depth_stencil_write_merge); + builder_->setBuildPoint(&block_depth_stencil_write); + builder_->createStore(new_depth_stencil, sample_access_chain); + builder_->createBranch(&block_depth_stencil_write_merge); + builder_->setBuildPoint(&block_depth_stencil_write_merge); } - builder_->createStore(color, color_variable); + 
builder_->createBranch(&block_sample_covered_merge); + spv::Block& block_sample_covered_end = *builder_->getBuildPoint(); + builder_->setBuildPoint(&block_sample_covered_merge); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.clear(); + id_vector_temp_.push_back(new_sample_mask_after_sample); + id_vector_temp_.push_back(block_sample_covered_end.getId()); + id_vector_temp_.push_back(new_sample_mask); + id_vector_temp_.push_back(block_sample_covered_head.getId()); + new_sample_mask = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + if (is_early) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(new_depth_stencil); + id_vector_temp_.push_back(block_sample_covered_end.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_sample_covered_head.getId()); + late_write_depth_stencil[i] = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + } } + + // Close the conditionals for whether depth / stencil testing is needed. + if (block_any_sample_covered_merge) { + builder_->createBranch(block_any_sample_covered_merge); + spv::Block& block_any_sample_covered_end = *builder_->getBuildPoint(); + builder_->setBuildPoint(block_any_sample_covered_merge); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.clear(); + id_vector_temp_.push_back(new_sample_mask); + id_vector_temp_.push_back(block_any_sample_covered_end.getId()); + id_vector_temp_.push_back(main_fsi_sample_mask_); + id_vector_temp_.push_back(block_any_sample_covered_head->getId()); + new_sample_mask = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + if (is_early) { + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(late_write_depth_stencil[i]); + id_vector_temp_.push_back(block_any_sample_covered_end.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_any_sample_covered_head->getId()); + late_write_depth_stencil[i] = + builder_->createOp(spv::OpPhi, 
type_uint_, id_vector_temp_); + } + } + } + builder_->createBranch(&block_depth_stencil_enabled_merge); + spv::Block& block_depth_stencil_enabled_end = *builder_->getBuildPoint(); + builder_->setBuildPoint(&block_depth_stencil_enabled_merge); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.clear(); + id_vector_temp_.push_back(new_sample_mask); + id_vector_temp_.push_back(block_depth_stencil_enabled_end.getId()); + id_vector_temp_.push_back(main_fsi_sample_mask_); + id_vector_temp_.push_back(block_depth_stencil_enabled_head.getId()); + main_fsi_sample_mask_ = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + if (is_early) { + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(late_write_depth_stencil[i]); + id_vector_temp_.push_back(block_depth_stencil_enabled_end.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_depth_stencil_enabled_head.getId()); + main_fsi_late_write_depth_stencil_[i] = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + } + } +} + +std::array SpirvShaderTranslator::FSI_ClampAndPackColor( + spv::Id color_float4, spv::Id format_with_flags) { + spv::Block& block_format_head = *builder_->getBuildPoint(); + spv::Block& block_format_8_8_8_8 = builder_->makeNewBlock(); + spv::Block& block_format_8_8_8_8_gamma = builder_->makeNewBlock(); + spv::Block& block_format_2_10_10_10 = builder_->makeNewBlock(); + spv::Block& block_format_2_10_10_10_float = builder_->makeNewBlock(); + spv::Block& block_format_16 = builder_->makeNewBlock(); + spv::Block& block_format_16_float = builder_->makeNewBlock(); + spv::Block& block_format_32_float = builder_->makeNewBlock(); + spv::Block& block_format_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_format_merge.getId()); + { + std::unique_ptr format_switch_op = + std::make_unique(spv::OpSwitch); + format_switch_op->addIdOperand(format_with_flags); + // Make k_32_FLOAT or k_32_32_FLOAT the 
default. + format_switch_op->addIdOperand(block_format_32_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_8_8_8_8))); + format_switch_op->addIdOperand(block_format_8_8_8_8.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA))); + format_switch_op->addIdOperand(block_format_8_8_8_8_gamma.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10))); + format_switch_op->addIdOperand(block_format_2_10_10_10.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10))); + format_switch_op->addIdOperand(block_format_2_10_10_10.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT))); + format_switch_op->addIdOperand(block_format_2_10_10_10_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat :: + k_2_10_10_10_FLOAT_AS_16_16_16_16))); + format_switch_op->addIdOperand(block_format_2_10_10_10_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16))); + format_switch_op->addIdOperand(block_format_16.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16_16_16))); + format_switch_op->addIdOperand(block_format_16.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16_FLOAT))); + 
format_switch_op->addIdOperand(block_format_16_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT))); + format_switch_op->addIdOperand(block_format_16_float.getId()); + builder_->getBuildPoint()->addInstruction(std::move(format_switch_op)); + } + block_format_8_8_8_8.addPredecessor(&block_format_head); + block_format_8_8_8_8_gamma.addPredecessor(&block_format_head); + block_format_2_10_10_10.addPredecessor(&block_format_head); + block_format_2_10_10_10_float.addPredecessor(&block_format_head); + block_format_16.addPredecessor(&block_format_head); + block_format_16_float.addPredecessor(&block_format_head); + block_format_32_float.addPredecessor(&block_format_head); + + spv::Id unorm_round_offset_float = builder_->makeFloatConstant(0.5f); + id_vector_temp_.clear(); + id_vector_temp_.resize(4, unorm_round_offset_float); + spv::Id unorm_round_offset_float4 = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + + // *************************************************************************** + // k_8_8_8_8 + // *************************************************************************** + spv::Id packed_8_8_8_8; + { + builder_->setBuildPoint(&block_format_8_8_8_8); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back(color_float4); + id_vector_temp_.push_back(const_float4_0_); + id_vector_temp_.push_back(const_float4_1_); + spv::Id color_scaled = builder_->createBinOp( + spv::OpVectorTimesScalar, type_float4_, + builder_->createBuiltinCall(type_float4_, ext_inst_glsl_std_450_, + GLSLstd450NClamp, id_vector_temp_), + builder_->makeFloatConstant(255.0f)); + builder_->addDecoration(color_scaled, spv::DecorationNoContraction); + spv::Id color_offset = builder_->createBinOp( + spv::OpFAdd, type_float4_, color_scaled, unorm_round_offset_float4); + builder_->addDecoration(color_offset, spv::DecorationNoContraction); + 
spv::Id color_uint4 = + builder_->createUnaryOp(spv::OpConvertFToU, type_uint4_, color_offset); + packed_8_8_8_8 = + builder_->createCompositeExtract(color_uint4, type_uint_, 0); + spv::Id component_width = builder_->makeUintConstant(8); + for (uint32_t i = 1; i < 4; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(packed_8_8_8_8); + id_vector_temp_.push_back( + builder_->createCompositeExtract(color_uint4, type_uint_, i)); + id_vector_temp_.push_back(builder_->makeUintConstant(8 * i)); + id_vector_temp_.push_back(component_width); + packed_8_8_8_8 = builder_->createOp(spv::OpBitFieldInsert, type_uint_, + id_vector_temp_); + } + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_8_8_8_8_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // k_8_8_8_8_GAMMA + // *************************************************************************** + spv::Id packed_8_8_8_8_gamma; + { + builder_->setBuildPoint(&block_format_8_8_8_8_gamma); + uint_vector_temp_.clear(); + uint_vector_temp_.reserve(3); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + uint_vector_temp_.push_back(2); + spv::Id color_rgb = builder_->createRvalueSwizzle( + spv::NoPrecision, type_float3_, color_float4, uint_vector_temp_); + spv::Id rgb_gamma = LinearToPWLGamma( + builder_->createRvalueSwizzle(spv::NoPrecision, type_float3_, + color_float4, uint_vector_temp_), + false); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back( + builder_->createCompositeExtract(color_float4, type_float_, 3)); + id_vector_temp_.push_back(const_float_0_); + id_vector_temp_.push_back(const_float_1_); + spv::Id alpha_clamped = builder_->createBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450NClamp, id_vector_temp_); + // Bypass the `getNumTypeConstituents(typeId) == (int)constituents.size()` + // assertion in 
createCompositeConstruct, OpCompositeConstruct can + // construct vectors not only from scalars, but also from other vectors. + spv::Id color_gamma; + { + std::unique_ptr color_gamma_composite_construct_op = + std::make_unique( + builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct); + color_gamma_composite_construct_op->addIdOperand(rgb_gamma); + color_gamma_composite_construct_op->addIdOperand(alpha_clamped); + color_gamma = color_gamma_composite_construct_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(color_gamma_composite_construct_op)); + } + spv::Id color_scaled = + builder_->createBinOp(spv::OpVectorTimesScalar, type_float4_, + color_gamma, builder_->makeFloatConstant(255.0f)); + builder_->addDecoration(color_scaled, spv::DecorationNoContraction); + spv::Id color_offset = builder_->createBinOp( + spv::OpFAdd, type_float4_, color_scaled, unorm_round_offset_float4); + builder_->addDecoration(color_offset, spv::DecorationNoContraction); + spv::Id color_uint4 = + builder_->createUnaryOp(spv::OpConvertFToU, type_uint4_, color_offset); + packed_8_8_8_8_gamma = + builder_->createCompositeExtract(color_uint4, type_uint_, 0); + spv::Id component_width = builder_->makeUintConstant(8); + for (uint32_t i = 1; i < 4; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(packed_8_8_8_8_gamma); + id_vector_temp_.push_back( + builder_->createCompositeExtract(color_uint4, type_uint_, i)); + id_vector_temp_.push_back(builder_->makeUintConstant(8 * i)); + id_vector_temp_.push_back(component_width); + packed_8_8_8_8_gamma = builder_->createOp(spv::OpBitFieldInsert, + type_uint_, id_vector_temp_); + } + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_8_8_8_8_gamma_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // k_2_10_10_10 + // k_2_10_10_10_AS_10_10_10_10 + // 
*************************************************************************** + spv::Id packed_2_10_10_10; + { + builder_->setBuildPoint(&block_format_2_10_10_10); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back(color_float4); + id_vector_temp_.push_back(const_float4_0_); + id_vector_temp_.push_back(const_float4_1_); + spv::Id color_clamped = + builder_->createBuiltinCall(type_float4_, ext_inst_glsl_std_450_, + GLSLstd450NClamp, id_vector_temp_); + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.resize(3, builder_->makeFloatConstant(1023.0f)); + id_vector_temp_.push_back(builder_->makeFloatConstant(3.0f)); + spv::Id color_scaled = builder_->createBinOp( + spv::OpFMul, type_float4_, color_clamped, + builder_->makeCompositeConstant(type_float4_, id_vector_temp_)); + builder_->addDecoration(color_scaled, spv::DecorationNoContraction); + spv::Id color_offset = builder_->createBinOp( + spv::OpFAdd, type_float4_, color_scaled, unorm_round_offset_float4); + builder_->addDecoration(color_offset, spv::DecorationNoContraction); + spv::Id color_uint4 = + builder_->createUnaryOp(spv::OpConvertFToU, type_uint4_, color_offset); + packed_2_10_10_10 = + builder_->createCompositeExtract(color_uint4, type_uint_, 0); + spv::Id rgb_width = builder_->makeUintConstant(10); + spv::Id alpha_width = builder_->makeUintConstant(2); + for (uint32_t i = 1; i < 4; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(packed_2_10_10_10); + id_vector_temp_.push_back( + builder_->createCompositeExtract(color_uint4, type_uint_, i)); + id_vector_temp_.push_back(builder_->makeUintConstant(10 * i)); + id_vector_temp_.push_back(i == 3 ? 
alpha_width : rgb_width); + packed_2_10_10_10 = builder_->createOp(spv::OpBitFieldInsert, type_uint_, + id_vector_temp_); + } + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_2_10_10_10_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // k_2_10_10_10_FLOAT + // k_2_10_10_10_FLOAT_AS_16_16_16_16 + // *************************************************************************** + spv::Id packed_2_10_10_10_float; + { + builder_->setBuildPoint(&block_format_2_10_10_10_float); + std::array color_components; + // RGB. + for (uint32_t i = 0; i < 3; ++i) { + color_components[i] = UnclampedFloat32To7e3( + *builder_, + builder_->createCompositeExtract(color_float4, type_float_, i), + ext_inst_glsl_std_450_); + } + // Alpha. + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back( + builder_->createCompositeExtract(color_float4, type_float_, 3)); + id_vector_temp_.push_back(const_float_0_); + id_vector_temp_.push_back(const_float_1_); + spv::Id alpha_scaled = builder_->createBinOp( + spv::OpFMul, type_float_, + builder_->createBuiltinCall(type_float_, ext_inst_glsl_std_450_, + GLSLstd450NClamp, id_vector_temp_), + builder_->makeFloatConstant(3.0f)); + builder_->addDecoration(alpha_scaled, spv::DecorationNoContraction); + spv::Id alpha_offset = builder_->createBinOp( + spv::OpFAdd, type_float_, alpha_scaled, unorm_round_offset_float); + builder_->addDecoration(alpha_offset, spv::DecorationNoContraction); + color_components[3] = + builder_->createUnaryOp(spv::OpConvertFToU, type_uint_, alpha_offset); + // Pack. 
+ packed_2_10_10_10_float = color_components[0]; + spv::Id rgb_width = builder_->makeUintConstant(10); + for (uint32_t i = 1; i < 3; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(packed_2_10_10_10_float); + id_vector_temp_.push_back(color_components[i]); + id_vector_temp_.push_back(builder_->makeUintConstant(10 * i)); + id_vector_temp_.push_back(rgb_width); + packed_2_10_10_10_float = builder_->createOp(spv::OpBitFieldInsert, + type_uint_, id_vector_temp_); + } + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back(packed_2_10_10_10_float); + id_vector_temp_.push_back(color_components[3]); + id_vector_temp_.push_back(builder_->makeUintConstant(30)); + id_vector_temp_.push_back(builder_->makeUintConstant(2)); + packed_2_10_10_10_float = + builder_->createOp(spv::OpBitFieldInsert, type_uint_, id_vector_temp_); + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_2_10_10_10_float_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // k_16_16 + // k_16_16_16_16 + // *************************************************************************** + std::array packed_16; + { + builder_->setBuildPoint(&block_format_16); + id_vector_temp_.clear(); + id_vector_temp_.resize(4, builder_->makeFloatConstant(-32.0f)); + spv::Id const_float4_minus_32 = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + id_vector_temp_.clear(); + id_vector_temp_.resize(4, builder_->makeFloatConstant(32.0f)); + spv::Id const_float4_32 = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + // NaN to 0, not to -32. 
+ id_vector_temp_.push_back(builder_->createTriOp( + spv::OpSelect, type_float4_, + builder_->createUnaryOp(spv::OpIsNan, type_bool4_, color_float4), + const_float4_0_, color_float4)); + id_vector_temp_.push_back(const_float4_minus_32); + id_vector_temp_.push_back(const_float4_32); + spv::Id color_scaled = builder_->createBinOp( + spv::OpVectorTimesScalar, type_float4_, + builder_->createBuiltinCall(type_float4_, ext_inst_glsl_std_450_, + GLSLstd450FClamp, id_vector_temp_), + builder_->makeFloatConstant(32767.0f / 32.0f)); + builder_->addDecoration(color_scaled, spv::DecorationNoContraction); + id_vector_temp_.clear(); + id_vector_temp_.resize(4, builder_->makeFloatConstant(-0.5f)); + spv::Id unorm_round_offset_negative_float4 = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + spv::Id color_offset = builder_->createBinOp( + spv::OpFAdd, type_float4_, color_scaled, + builder_->createTriOp( + spv::OpSelect, type_float4_, + builder_->createBinOp(spv::OpFOrdLessThan, type_bool4_, + color_scaled, const_float4_0_), + unorm_round_offset_negative_float4, unorm_round_offset_float4)); + builder_->addDecoration(color_offset, spv::DecorationNoContraction); + spv::Id color_uint4 = builder_->createUnaryOp( + spv::OpBitcast, type_uint4_, + builder_->createUnaryOp(spv::OpConvertFToS, type_int4_, color_offset)); + spv::Id component_offset_width = builder_->makeUintConstant(16); + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.reserve(4); + id_vector_temp_.push_back( + builder_->createCompositeExtract(color_uint4, type_uint_, 2 * i)); + id_vector_temp_.push_back( + builder_->createCompositeExtract(color_uint4, type_uint_, 2 * i + 1)); + id_vector_temp_.push_back(component_offset_width); + id_vector_temp_.push_back(component_offset_width); + packed_16[i] = builder_->createOp(spv::OpBitFieldInsert, type_uint_, + id_vector_temp_); + } + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_16_end = 
*builder_->getBuildPoint(); + + // *************************************************************************** + // k_16_16_FLOAT + // k_16_16_16_16_FLOAT + // *************************************************************************** + std::array packed_16_float; + { + builder_->setBuildPoint(&block_format_16_float); + // TODO(Triang3l): Xenos extended-range float16. + id_vector_temp_.clear(); + id_vector_temp_.resize(4, builder_->makeFloatConstant(-65504.0f)); + spv::Id const_float4_minus_float16_max = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + id_vector_temp_.clear(); + id_vector_temp_.resize(4, builder_->makeFloatConstant(65504.0f)); + spv::Id const_float4_float16_max = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + // NaN to 0, not to -max. + id_vector_temp_.push_back(builder_->createTriOp( + spv::OpSelect, type_float4_, + builder_->createUnaryOp(spv::OpIsNan, type_bool4_, color_float4), + const_float4_0_, color_float4)); + id_vector_temp_.push_back(const_float4_minus_float16_max); + id_vector_temp_.push_back(const_float4_float16_max); + spv::Id color_clamped = + builder_->createBuiltinCall(type_float4_, ext_inst_glsl_std_450_, + GLSLstd450FClamp, id_vector_temp_); + for (uint32_t i = 0; i < 2; ++i) { + uint_vector_temp_.clear(); + uint_vector_temp_.reserve(2); + uint_vector_temp_.push_back(2 * i); + uint_vector_temp_.push_back(2 * i + 1); + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->createRvalueSwizzle( + spv::NoPrecision, type_float2_, color_clamped, uint_vector_temp_)); + packed_16_float[i] = + builder_->createBuiltinCall(type_uint_, ext_inst_glsl_std_450_, + GLSLstd450PackHalf2x16, id_vector_temp_); + } + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_16_float_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // k_32_FLOAT + // 
k_32_32_FLOAT + // *************************************************************************** + std::array packed_32_float; + { + builder_->setBuildPoint(&block_format_32_float); + for (uint32_t i = 0; i < 2; ++i) { + packed_32_float[i] = builder_->createUnaryOp( + spv::OpBitcast, type_uint_, + builder_->createCompositeExtract(color_float4, type_float_, i)); + } + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_32_float_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // Selection of the result depending on the format. + // *************************************************************************** + + builder_->setBuildPoint(&block_format_merge); + std::array packed; + id_vector_temp_.reserve(2 * 7); + // Low 32 bits. + id_vector_temp_.clear(); + id_vector_temp_.push_back(packed_8_8_8_8); + id_vector_temp_.push_back(block_format_8_8_8_8_end.getId()); + id_vector_temp_.push_back(packed_8_8_8_8_gamma); + id_vector_temp_.push_back(block_format_8_8_8_8_gamma_end.getId()); + id_vector_temp_.push_back(packed_2_10_10_10); + id_vector_temp_.push_back(block_format_2_10_10_10_end.getId()); + id_vector_temp_.push_back(packed_2_10_10_10_float); + id_vector_temp_.push_back(block_format_2_10_10_10_float_end.getId()); + id_vector_temp_.push_back(packed_16[0]); + id_vector_temp_.push_back(block_format_16_end.getId()); + id_vector_temp_.push_back(packed_16_float[0]); + id_vector_temp_.push_back(block_format_16_float_end.getId()); + id_vector_temp_.push_back(packed_32_float[0]); + id_vector_temp_.push_back(block_format_32_float_end.getId()); + packed[0] = builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + // High 32 bits. 
+ id_vector_temp_.clear(); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_format_8_8_8_8_end.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_format_8_8_8_8_gamma_end.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_format_2_10_10_10_end.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_format_2_10_10_10_float_end.getId()); + id_vector_temp_.push_back(packed_16[1]); + id_vector_temp_.push_back(block_format_16_end.getId()); + id_vector_temp_.push_back(packed_16_float[1]); + id_vector_temp_.push_back(block_format_16_float_end.getId()); + id_vector_temp_.push_back(packed_32_float[1]); + id_vector_temp_.push_back(block_format_32_float_end.getId()); + packed[1] = builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + return packed; +} + +std::array SpirvShaderTranslator::FSI_UnpackColor( + std::array color_packed, spv::Id format_with_flags) { + spv::Block& block_format_head = *builder_->getBuildPoint(); + spv::Block& block_format_8_8_8_8 = builder_->makeNewBlock(); + spv::Block& block_format_8_8_8_8_gamma = builder_->makeNewBlock(); + spv::Block& block_format_2_10_10_10 = builder_->makeNewBlock(); + spv::Block& block_format_2_10_10_10_float = builder_->makeNewBlock(); + spv::Block& block_format_16_16 = builder_->makeNewBlock(); + spv::Block& block_format_16_16_16_16 = builder_->makeNewBlock(); + spv::Block& block_format_16_16_float = builder_->makeNewBlock(); + spv::Block& block_format_16_16_16_16_float = builder_->makeNewBlock(); + spv::Block& block_format_32_float = builder_->makeNewBlock(); + spv::Block& block_format_32_32_float = builder_->makeNewBlock(); + spv::Block& block_format_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_format_merge.getId()); + { + std::unique_ptr format_switch_op = + std::make_unique(spv::OpSwitch); + format_switch_op->addIdOperand(format_with_flags); + // Make 
k_32_FLOAT the default. + format_switch_op->addIdOperand(block_format_32_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_8_8_8_8))); + format_switch_op->addIdOperand(block_format_8_8_8_8.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA))); + format_switch_op->addIdOperand(block_format_8_8_8_8_gamma.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10))); + format_switch_op->addIdOperand(block_format_2_10_10_10.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10))); + format_switch_op->addIdOperand(block_format_2_10_10_10.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT))); + format_switch_op->addIdOperand(block_format_2_10_10_10_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat :: + k_2_10_10_10_FLOAT_AS_16_16_16_16))); + format_switch_op->addIdOperand(block_format_2_10_10_10_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16))); + format_switch_op->addIdOperand(block_format_16_16.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16_16_16))); + format_switch_op->addIdOperand(block_format_16_16_16_16.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16_FLOAT))); + 
format_switch_op->addIdOperand(block_format_16_16_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT))); + format_switch_op->addIdOperand(block_format_16_16_16_16_float.getId()); + format_switch_op->addImmediateOperand( + int32_t(RenderTargetCache::AddPSIColorFormatFlags( + xenos::ColorRenderTargetFormat::k_32_32_FLOAT))); + format_switch_op->addIdOperand(block_format_32_32_float.getId()); + builder_->getBuildPoint()->addInstruction(std::move(format_switch_op)); + } + block_format_8_8_8_8.addPredecessor(&block_format_head); + block_format_8_8_8_8_gamma.addPredecessor(&block_format_head); + block_format_2_10_10_10.addPredecessor(&block_format_head); + block_format_2_10_10_10_float.addPredecessor(&block_format_head); + block_format_16_16.addPredecessor(&block_format_head); + block_format_16_16_16_16.addPredecessor(&block_format_head); + block_format_16_16_float.addPredecessor(&block_format_head); + block_format_16_16_16_16_float.addPredecessor(&block_format_head); + block_format_32_float.addPredecessor(&block_format_head); + block_format_32_32_float.addPredecessor(&block_format_head); + + // *************************************************************************** + // k_8_8_8_8 + // k_8_8_8_8_GAMMA + // *************************************************************************** + + std::array, 2> unpacked_8_8_8_8_and_gamma; + std::array block_format_8_8_8_8_and_gamma_end; + { + spv::Id component_width = builder_->makeUintConstant(8); + spv::Id component_scale = builder_->makeFloatConstant(1.0f / 255.0f); + for (uint32_t i = 0; i < 2; ++i) { + builder_->setBuildPoint(i ? 
&block_format_8_8_8_8_gamma + : &block_format_8_8_8_8); + for (uint32_t j = 0; j < 4; ++j) { + spv::Id component = builder_->createBinOp( + spv::OpFMul, type_float_, + builder_->createUnaryOp( + spv::OpConvertUToF, type_float_, + builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, color_packed[0], + builder_->makeUintConstant(8 * j), component_width)), + component_scale); + builder_->addDecoration(component, spv::DecorationNoContraction); + if (i && j <= 2) { + component = PWLGammaToLinear(component, true); + } + unpacked_8_8_8_8_and_gamma[i][j] = component; + } + builder_->createBranch(&block_format_merge); + block_format_8_8_8_8_and_gamma_end[i] = builder_->getBuildPoint(); + } + } + + // *************************************************************************** + // k_2_10_10_10 + // k_2_10_10_10_AS_10_10_10_10 + // *************************************************************************** + + std::array unpacked_2_10_10_10; + { + builder_->setBuildPoint(&block_format_2_10_10_10); + spv::Id rgb_width = builder_->makeUintConstant(10); + spv::Id alpha_width = builder_->makeUintConstant(2); + spv::Id rgb_scale = builder_->makeFloatConstant(1.0f / 1023.0f); + spv::Id alpha_scale = builder_->makeFloatConstant(1.0f / 3.0f); + for (uint32_t i = 0; i < 4; ++i) { + spv::Id component = builder_->createBinOp( + spv::OpFMul, type_float_, + builder_->createUnaryOp( + spv::OpConvertUToF, type_float_, + builder_->createTriOp(spv::OpBitFieldUExtract, type_uint_, + color_packed[0], + builder_->makeUintConstant(10 * i), + i == 3 ? alpha_width : rgb_width)), + i == 3 ? 
alpha_scale : rgb_scale); + builder_->addDecoration(component, spv::DecorationNoContraction); + unpacked_2_10_10_10[i] = component; + } + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_2_10_10_10_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // k_2_10_10_10_FLOAT + // k_2_10_10_10_FLOAT_AS_16_16_16_16 + // *************************************************************************** + + std::array unpacked_2_10_10_10_float; + { + builder_->setBuildPoint(&block_format_2_10_10_10_float); + spv::Id rgb_width = builder_->makeUintConstant(10); + for (uint32_t i = 0; i < 3; ++i) { + unpacked_2_10_10_10_float[i] = + Float7e3To32(*builder_, + builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, color_packed[0], + builder_->makeUintConstant(10 * i), rgb_width), + 0, false, ext_inst_glsl_std_450_); + } + spv::Id alpha = builder_->createBinOp( + spv::OpFMul, type_float_, + builder_->createUnaryOp( + spv::OpConvertUToF, type_float_, + builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, color_packed[0], + builder_->makeUintConstant(30), builder_->makeUintConstant(2))), + builder_->makeFloatConstant(1.0f / 3.0f)); + builder_->addDecoration(alpha, spv::DecorationNoContraction); + unpacked_2_10_10_10_float[3] = alpha; + builder_->createBranch(&block_format_merge); + } + spv::Block& block_format_2_10_10_10_float_end = *builder_->getBuildPoint(); + + // *************************************************************************** + // k_16_16 + // k_16_16_16_16 + // *************************************************************************** + + std::array, 2> unpacked_16; + unpacked_16[0][2] = const_float_0_; + unpacked_16[0][3] = const_float_1_; + std::array block_format_16_end; + { + spv::Id component_width = builder_->makeUintConstant(16); + spv::Id component_scale = builder_->makeFloatConstant(32.0f / 32767.0f); + spv::Id component_min = 
builder_->makeFloatConstant(-1.0f); + for (uint32_t i = 0; i < 2; ++i) { + builder_->setBuildPoint(i ? &block_format_16_16_16_16 + : &block_format_16_16); + std::array color_packed_signed; + for (uint32_t j = 0; j <= i; ++j) { + color_packed_signed[j] = + builder_->createUnaryOp(spv::OpBitcast, type_int_, color_packed[j]); + } + for (uint32_t j = 0; j < uint32_t(i ? 4 : 2); ++j) { + spv::Id component = builder_->createBinOp( + spv::OpFMul, type_float_, + builder_->createUnaryOp( + spv::OpConvertSToF, type_float_, + builder_->createTriOp(spv::OpBitFieldSExtract, type_int_, + color_packed_signed[j >> 1], + builder_->makeUintConstant(16 * (j & 1)), + component_width)), + component_scale); + builder_->addDecoration(component, spv::DecorationNoContraction); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(component_min); + id_vector_temp_.push_back(component); + component = + builder_->createBuiltinCall(type_float_, ext_inst_glsl_std_450_, + GLSLstd450FMax, id_vector_temp_); + unpacked_16[i][j] = component; + } + builder_->createBranch(&block_format_merge); + block_format_16_end[i] = builder_->getBuildPoint(); + } + } + + // *************************************************************************** + // k_16_16_FLOAT + // k_16_16_16_16_FLOAT + // *************************************************************************** + + std::array, 2> unpacked_16_float; + unpacked_16_float[0][2] = const_float_0_; + unpacked_16_float[0][3] = const_float_1_; + std::array block_format_16_float_end; + { + for (uint32_t i = 0; i < 2; ++i) { + builder_->setBuildPoint(i ? &block_format_16_16_16_16_float + : &block_format_16_16_float); + // TODO(Triang3l): Xenos extended-range float16. 
+ for (uint32_t j = 0; j <= i; ++j) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(color_packed[j]); + spv::Id components_float2 = builder_->createBuiltinCall( + type_float2_, ext_inst_glsl_std_450_, GLSLstd450UnpackHalf2x16, + id_vector_temp_); + for (uint32_t k = 0; k < 2; ++k) { + unpacked_16_float[i][2 * j + k] = builder_->createCompositeExtract( + components_float2, type_float_, k); + } + } + builder_->createBranch(&block_format_merge); + block_format_16_float_end[i] = builder_->getBuildPoint(); + } + } + + // *************************************************************************** + // k_32_FLOAT + // k_32_32_FLOAT + // *************************************************************************** + + std::array, 2> unpacked_32_float; + unpacked_32_float[0][1] = const_float_0_; + unpacked_32_float[0][2] = const_float_0_; + unpacked_32_float[0][3] = const_float_1_; + unpacked_32_float[1][2] = const_float_0_; + unpacked_32_float[1][3] = const_float_1_; + std::array block_format_32_float_end; + { + for (uint32_t i = 0; i < 2; ++i) { + builder_->setBuildPoint(i ? &block_format_32_32_float + : &block_format_32_float); + for (uint32_t j = 0; j <= i; ++j) { + unpacked_32_float[i][j] = builder_->createUnaryOp( + spv::OpBitcast, type_float_, color_packed[j]); + } + builder_->createBranch(&block_format_merge); + block_format_32_float_end[i] = builder_->getBuildPoint(); + } + } + + // *************************************************************************** + // Selection of the result depending on the format. 
+ // *************************************************************************** + + builder_->setBuildPoint(&block_format_merge); + std::array unpacked; + id_vector_temp_.reserve(2 * 10); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(unpacked_8_8_8_8_and_gamma[0][i]); + id_vector_temp_.push_back(block_format_8_8_8_8_and_gamma_end[0]->getId()); + id_vector_temp_.push_back(unpacked_8_8_8_8_and_gamma[1][i]); + id_vector_temp_.push_back(block_format_8_8_8_8_and_gamma_end[1]->getId()); + id_vector_temp_.push_back(unpacked_2_10_10_10[i]); + id_vector_temp_.push_back(block_format_2_10_10_10_end.getId()); + id_vector_temp_.push_back(unpacked_2_10_10_10_float[i]); + id_vector_temp_.push_back(block_format_2_10_10_10_float_end.getId()); + id_vector_temp_.push_back(unpacked_16[0][i]); + id_vector_temp_.push_back(block_format_16_end[0]->getId()); + id_vector_temp_.push_back(unpacked_16[1][i]); + id_vector_temp_.push_back(block_format_16_end[1]->getId()); + id_vector_temp_.push_back(unpacked_16_float[0][i]); + id_vector_temp_.push_back(block_format_16_float_end[0]->getId()); + id_vector_temp_.push_back(unpacked_16_float[1][i]); + id_vector_temp_.push_back(block_format_16_float_end[1]->getId()); + id_vector_temp_.push_back(unpacked_32_float[0][i]); + id_vector_temp_.push_back(block_format_32_float_end[0]->getId()); + id_vector_temp_.push_back(unpacked_32_float[1][i]); + id_vector_temp_.push_back(block_format_32_float_end[1]->getId()); + unpacked[i] = builder_->createOp(spv::OpPhi, type_float_, id_vector_temp_); + } + return unpacked; +} + +spv::Id SpirvShaderTranslator::FSI_FlushNaNClampAndInBlending( + spv::Id color_or_alpha, spv::Id is_fixed_point, spv::Id min_value, + spv::Id max_value) { + spv::Id color_or_alpha_type = builder_->getTypeId(color_or_alpha); + uint32_t component_count = + uint32_t(builder_->getNumTypeConstituents(color_or_alpha_type)); + assert_true(builder_->isScalarType(color_or_alpha_type) || + 
builder_->isVectorType(color_or_alpha_type)); + assert_true( + builder_->isFloatType(builder_->getScalarTypeId(color_or_alpha_type))); + assert_true(builder_->getTypeId(min_value) == color_or_alpha_type); + assert_true(builder_->getTypeId(max_value) == color_or_alpha_type); + + spv::Block& block_is_fixed_point_head = *builder_->getBuildPoint(); + spv::Block& block_is_fixed_point_if = builder_->makeNewBlock(); + spv::Block& block_is_fixed_point_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_is_fixed_point_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(is_fixed_point, &block_is_fixed_point_if, + &block_is_fixed_point_merge); + builder_->setBuildPoint(&block_is_fixed_point_if); + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + // Flush NaN to 0 even for signed (NMax would flush it to the minimum value). + id_vector_temp_.push_back(builder_->createTriOp( + spv::OpSelect, color_or_alpha_type, + builder_->createUnaryOp(spv::OpIsNan, + type_bool_vectors_[component_count - 1], + color_or_alpha), + const_float_vectors_0_[component_count - 1], color_or_alpha)); + id_vector_temp_.push_back(min_value); + id_vector_temp_.push_back(max_value); + spv::Id color_or_alpha_clamped = + builder_->createBuiltinCall(color_or_alpha_type, ext_inst_glsl_std_450_, + GLSLstd450FClamp, id_vector_temp_); + builder_->createBranch(&block_is_fixed_point_merge); + builder_->setBuildPoint(&block_is_fixed_point_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(color_or_alpha_clamped); + id_vector_temp_.push_back(block_is_fixed_point_if.getId()); + id_vector_temp_.push_back(color_or_alpha); + id_vector_temp_.push_back(block_is_fixed_point_head.getId()); + return builder_->createOp(spv::OpPhi, color_or_alpha_type, id_vector_temp_); +} + +spv::Id SpirvShaderTranslator::FSI_ApplyColorBlendFactor( + spv::Id value, spv::Id is_fixed_point, spv::Id clamp_min_value, + spv::Id 
clamp_max_value, spv::Id factor, spv::Id source_color, + spv::Id source_alpha, spv::Id dest_color, spv::Id dest_alpha, + spv::Id constant_color, spv::Id constant_alpha) { + // If the factor is zero, don't use it in the multiplication at all, so that + // infinity and NaN are not potentially involved in the multiplication. + // Calculate the condition before the selection merge, which must be the + // penultimate instruction in the block. + spv::Id factor_not_zero = builder_->createBinOp( + spv::OpINotEqual, type_bool_, factor, + builder_->makeUintConstant(uint32_t(xenos::BlendFactor::kZero))); + spv::Block& block_not_zero_head = *builder_->getBuildPoint(); + spv::Block& block_not_zero_if = builder_->makeNewBlock(); + spv::Block& block_not_zero_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_not_zero_merge.getId()); + builder_->createConditionalBranch(factor_not_zero, &block_not_zero_if, + &block_not_zero_merge); + + // Non-zero factor case. + + builder_->setBuildPoint(&block_not_zero_if); + + spv::Block& block_factor_head = *builder_->getBuildPoint(); + spv::Block& block_factor_one = builder_->makeNewBlock(); + std::array color_factor_blocks; + std::array one_minus_color_factor_blocks; + std::array alpha_factor_blocks; + std::array one_minus_alpha_factor_blocks; + color_factor_blocks[0] = &builder_->makeNewBlock(); + one_minus_color_factor_blocks[0] = &builder_->makeNewBlock(); + alpha_factor_blocks[0] = &builder_->makeNewBlock(); + one_minus_alpha_factor_blocks[0] = &builder_->makeNewBlock(); + color_factor_blocks[1] = &builder_->makeNewBlock(); + one_minus_color_factor_blocks[1] = &builder_->makeNewBlock(); + alpha_factor_blocks[1] = &builder_->makeNewBlock(); + one_minus_alpha_factor_blocks[1] = &builder_->makeNewBlock(); + color_factor_blocks[2] = &builder_->makeNewBlock(); + one_minus_color_factor_blocks[2] = &builder_->makeNewBlock(); + alpha_factor_blocks[2] = &builder_->makeNewBlock(); + one_minus_alpha_factor_blocks[2] = 
&builder_->makeNewBlock(); + spv::Block& block_factor_source_alpha_saturate = builder_->makeNewBlock(); + spv::Block& block_factor_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_factor_merge.getId(), + spv::SelectionControlDontFlattenMask); + { + std::unique_ptr factor_switch_op = + std::make_unique(spv::OpSwitch); + factor_switch_op->addIdOperand(factor); + // Make one the default factor. + factor_switch_op->addIdOperand(block_factor_one.getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kSrcColor)); + factor_switch_op->addIdOperand(color_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusSrcColor)); + factor_switch_op->addIdOperand(one_minus_color_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kSrcAlpha)); + factor_switch_op->addIdOperand(alpha_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusSrcAlpha)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kDstColor)); + factor_switch_op->addIdOperand(color_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusDstColor)); + factor_switch_op->addIdOperand(one_minus_color_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kDstAlpha)); + factor_switch_op->addIdOperand(alpha_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusDstAlpha)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kConstantColor)); + factor_switch_op->addIdOperand(color_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + 
int32_t(xenos::BlendFactor::kOneMinusConstantColor)); + factor_switch_op->addIdOperand(one_minus_color_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kConstantAlpha)); + factor_switch_op->addIdOperand(alpha_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusConstantAlpha)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kSrcAlphaSaturate)); + factor_switch_op->addIdOperand(block_factor_source_alpha_saturate.getId()); + builder_->getBuildPoint()->addInstruction(std::move(factor_switch_op)); + } + block_factor_one.addPredecessor(&block_factor_head); + for (uint32_t i = 0; i < 3; ++i) { + color_factor_blocks[i]->addPredecessor(&block_factor_head); + one_minus_color_factor_blocks[i]->addPredecessor(&block_factor_head); + alpha_factor_blocks[i]->addPredecessor(&block_factor_head); + one_minus_alpha_factor_blocks[i]->addPredecessor(&block_factor_head); + } + block_factor_source_alpha_saturate.addPredecessor(&block_factor_head); + + // kOne + builder_->setBuildPoint(&block_factor_one); + // The result is the value itself. 
+ builder_->createBranch(&block_factor_merge); + + // k[OneMinus]Src/Dest/ConstantColor/Alpha + std::array color_factors = { + source_color, + dest_color, + constant_color, + }; + std::array alpha_factors = { + source_alpha, + dest_alpha, + constant_alpha, + }; + std::array color_factor_results; + std::array one_minus_color_factor_results; + std::array alpha_factor_results; + std::array one_minus_alpha_factor_results; + for (uint32_t i = 0; i < 3; ++i) { + spv::Id color_factor = color_factors[i]; + spv::Id alpha_factor = alpha_factors[i]; + + // kSrc/Dst/ConstantColor + { + builder_->setBuildPoint(color_factor_blocks[i]); + spv::Id result_color = + builder_->createBinOp(spv::OpFMul, type_float3_, value, color_factor); + builder_->addDecoration(result_color, spv::DecorationNoContraction); + color_factor_results[i] = result_color; + builder_->createBranch(&block_factor_merge); + } + + // kOneMinusSrc/Dst/ConstantColor + { + builder_->setBuildPoint(one_minus_color_factor_blocks[i]); + spv::Id one_minus_color_factor = builder_->createBinOp( + spv::OpFSub, type_float3_, const_float3_1_, color_factor); + builder_->addDecoration(one_minus_color_factor, + spv::DecorationNoContraction); + spv::Id result_one_minus_color = builder_->createBinOp( + spv::OpFMul, type_float3_, value, one_minus_color_factor); + builder_->addDecoration(result_one_minus_color, + spv::DecorationNoContraction); + one_minus_color_factor_results[i] = result_one_minus_color; + builder_->createBranch(&block_factor_merge); + } + + // kSrc/Dst/ConstantAlpha + { + builder_->setBuildPoint(alpha_factor_blocks[i]); + spv::Id result_alpha = builder_->createBinOp( + spv::OpVectorTimesScalar, type_float3_, value, alpha_factor); + builder_->addDecoration(result_alpha, spv::DecorationNoContraction); + alpha_factor_results[i] = result_alpha; + builder_->createBranch(&block_factor_merge); + } + + // kOneMinusSrc/Dst/ConstantAlpha + { + builder_->setBuildPoint(one_minus_alpha_factor_blocks[i]); + spv::Id 
one_minus_alpha_factor = builder_->createBinOp( + spv::OpFSub, type_float_, const_float_1_, alpha_factor); + builder_->addDecoration(one_minus_alpha_factor, + spv::DecorationNoContraction); + spv::Id result_one_minus_alpha = + builder_->createBinOp(spv::OpVectorTimesScalar, type_float3_, value, + one_minus_alpha_factor); + builder_->addDecoration(result_one_minus_alpha, + spv::DecorationNoContraction); + one_minus_alpha_factor_results[i] = result_one_minus_alpha; + builder_->createBranch(&block_factor_merge); + } + } + + // kSrcAlphaSaturate + spv::Id result_source_alpha_saturate; + { + builder_->setBuildPoint(&block_factor_source_alpha_saturate); + spv::Id one_minus_dest_alpha = builder_->createBinOp( + spv::OpFSub, type_float_, const_float_1_, dest_alpha); + builder_->addDecoration(one_minus_dest_alpha, spv::DecorationNoContraction); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(source_alpha); + id_vector_temp_.push_back(one_minus_dest_alpha); + spv::Id factor_source_alpha_saturate = builder_->createBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450NMin, id_vector_temp_); + result_source_alpha_saturate = + builder_->createBinOp(spv::OpVectorTimesScalar, type_float3_, value, + factor_source_alpha_saturate); + builder_->addDecoration(result_source_alpha_saturate, + spv::DecorationNoContraction); + builder_->createBranch(&block_factor_merge); + } + + // Select the term for the non-zero factor. 
+ builder_->setBuildPoint(&block_factor_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 14); + id_vector_temp_.push_back(value); + id_vector_temp_.push_back(block_factor_one.getId()); + for (uint32_t i = 0; i < 3; ++i) { + id_vector_temp_.push_back(color_factor_results[i]); + id_vector_temp_.push_back(color_factor_blocks[i]->getId()); + id_vector_temp_.push_back(one_minus_color_factor_results[i]); + id_vector_temp_.push_back(one_minus_color_factor_blocks[i]->getId()); + id_vector_temp_.push_back(alpha_factor_results[i]); + id_vector_temp_.push_back(alpha_factor_blocks[i]->getId()); + id_vector_temp_.push_back(one_minus_alpha_factor_results[i]); + id_vector_temp_.push_back(one_minus_alpha_factor_blocks[i]->getId()); + } + id_vector_temp_.push_back(result_source_alpha_saturate); + id_vector_temp_.push_back(block_factor_source_alpha_saturate.getId()); + spv::Id result_unclamped = + builder_->createOp(spv::OpPhi, type_float3_, id_vector_temp_); + spv::Id result = FSI_FlushNaNClampAndInBlending( + result_unclamped, is_fixed_point, clamp_min_value, clamp_max_value); + builder_->createBranch(&block_not_zero_merge); + // Get the latest block for a non-zero factor after all the control flow. + spv::Block& block_not_zero_if_end = *builder_->getBuildPoint(); + + // Make the result zero if the factor is zero. 
+ builder_->setBuildPoint(&block_not_zero_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(result); + id_vector_temp_.push_back(block_not_zero_if_end.getId()); + id_vector_temp_.push_back(const_float3_0_); + id_vector_temp_.push_back(block_not_zero_head.getId()); + return builder_->createOp(spv::OpPhi, type_float3_, id_vector_temp_); +} + +spv::Id SpirvShaderTranslator::FSI_ApplyAlphaBlendFactor( + spv::Id value, spv::Id is_fixed_point, spv::Id clamp_min_value, + spv::Id clamp_max_value, spv::Id factor, spv::Id source_alpha, + spv::Id dest_alpha, spv::Id constant_alpha) { + // If the factor is zero, don't use it in the multiplication at all, so that + // infinity and NaN are not potentially involved in the multiplication. + // Calculate the condition before the selection merge, which must be the + // penultimate instruction in the block. + spv::Id factor_not_zero = builder_->createBinOp( + spv::OpINotEqual, type_bool_, factor, + builder_->makeUintConstant(uint32_t(xenos::BlendFactor::kZero))); + spv::Block& block_not_zero_head = *builder_->getBuildPoint(); + spv::Block& block_not_zero_if = builder_->makeNewBlock(); + spv::Block& block_not_zero_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_not_zero_merge.getId()); + builder_->createConditionalBranch(factor_not_zero, &block_not_zero_if, + &block_not_zero_merge); + + // Non-zero factor case. 
+ + builder_->setBuildPoint(&block_not_zero_if); + + spv::Block& block_factor_head = *builder_->getBuildPoint(); + spv::Block& block_factor_one = builder_->makeNewBlock(); + std::array alpha_factor_blocks; + std::array one_minus_alpha_factor_blocks; + alpha_factor_blocks[0] = &builder_->makeNewBlock(); + one_minus_alpha_factor_blocks[0] = &builder_->makeNewBlock(); + alpha_factor_blocks[1] = &builder_->makeNewBlock(); + one_minus_alpha_factor_blocks[1] = &builder_->makeNewBlock(); + alpha_factor_blocks[2] = &builder_->makeNewBlock(); + one_minus_alpha_factor_blocks[2] = &builder_->makeNewBlock(); + spv::Block& block_factor_source_alpha_saturate = builder_->makeNewBlock(); + spv::Block& block_factor_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_factor_merge.getId(), + spv::SelectionControlDontFlattenMask); + { + std::unique_ptr factor_switch_op = + std::make_unique(spv::OpSwitch); + factor_switch_op->addIdOperand(factor); + // Make one the default factor. + factor_switch_op->addIdOperand(block_factor_one.getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kSrcColor)); + factor_switch_op->addIdOperand(alpha_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusSrcColor)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kSrcAlpha)); + factor_switch_op->addIdOperand(alpha_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusSrcAlpha)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[0]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kDstColor)); + factor_switch_op->addIdOperand(alpha_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusDstColor)); + 
factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kDstAlpha)); + factor_switch_op->addIdOperand(alpha_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusDstAlpha)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[1]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kConstantColor)); + factor_switch_op->addIdOperand(alpha_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusConstantColor)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kConstantAlpha)); + factor_switch_op->addIdOperand(alpha_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kOneMinusConstantAlpha)); + factor_switch_op->addIdOperand(one_minus_alpha_factor_blocks[2]->getId()); + factor_switch_op->addImmediateOperand( + int32_t(xenos::BlendFactor::kSrcAlphaSaturate)); + factor_switch_op->addIdOperand(block_factor_source_alpha_saturate.getId()); + builder_->getBuildPoint()->addInstruction(std::move(factor_switch_op)); + } + block_factor_one.addPredecessor(&block_factor_head); + for (uint32_t i = 0; i < 3; ++i) { + alpha_factor_blocks[i]->addPredecessor(&block_factor_head); + one_minus_alpha_factor_blocks[i]->addPredecessor(&block_factor_head); + } + block_factor_source_alpha_saturate.addPredecessor(&block_factor_head); + + // kOne + builder_->setBuildPoint(&block_factor_one); + // The result is the value itself. 
+ builder_->createBranch(&block_factor_merge); + + // k[OneMinus]Src/Dest/ConstantColor/Alpha + std::array alpha_factors = { + source_alpha, + dest_alpha, + constant_alpha, + }; + std::array alpha_factor_results; + std::array one_minus_alpha_factor_results; + for (uint32_t i = 0; i < 3; ++i) { + spv::Id alpha_factor = alpha_factors[i]; + + // kSrc/Dst/ConstantColor/Alpha + { + builder_->setBuildPoint(alpha_factor_blocks[i]); + spv::Id result_alpha = + builder_->createBinOp(spv::OpFMul, type_float_, value, alpha_factor); + builder_->addDecoration(result_alpha, spv::DecorationNoContraction); + alpha_factor_results[i] = result_alpha; + builder_->createBranch(&block_factor_merge); + } + + // kOneMinusSrc/Dst/ConstantColor/Alpha + { + builder_->setBuildPoint(one_minus_alpha_factor_blocks[i]); + spv::Id one_minus_alpha_factor = builder_->createBinOp( + spv::OpFSub, type_float_, const_float_1_, alpha_factor); + builder_->addDecoration(one_minus_alpha_factor, + spv::DecorationNoContraction); + spv::Id result_one_minus_alpha = builder_->createBinOp( + spv::OpFMul, type_float_, value, one_minus_alpha_factor); + builder_->addDecoration(result_one_minus_alpha, + spv::DecorationNoContraction); + one_minus_alpha_factor_results[i] = result_one_minus_alpha; + builder_->createBranch(&block_factor_merge); + } + } + + // kSrcAlphaSaturate + spv::Id result_source_alpha_saturate; + { + builder_->setBuildPoint(&block_factor_source_alpha_saturate); + spv::Id one_minus_dest_alpha = builder_->createBinOp( + spv::OpFSub, type_float_, const_float_1_, dest_alpha); + builder_->addDecoration(one_minus_dest_alpha, spv::DecorationNoContraction); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(source_alpha); + id_vector_temp_.push_back(one_minus_dest_alpha); + spv::Id factor_source_alpha_saturate = builder_->createBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450NMin, id_vector_temp_); + result_source_alpha_saturate = builder_->createBinOp( + 
spv::OpFMul, type_float_, value, factor_source_alpha_saturate); + builder_->addDecoration(result_source_alpha_saturate, + spv::DecorationNoContraction); + builder_->createBranch(&block_factor_merge); + } + + // Select the term for the non-zero factor. + builder_->setBuildPoint(&block_factor_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 8); + id_vector_temp_.push_back(value); + id_vector_temp_.push_back(block_factor_one.getId()); + for (uint32_t i = 0; i < 3; ++i) { + id_vector_temp_.push_back(alpha_factor_results[i]); + id_vector_temp_.push_back(alpha_factor_blocks[i]->getId()); + id_vector_temp_.push_back(one_minus_alpha_factor_results[i]); + id_vector_temp_.push_back(one_minus_alpha_factor_blocks[i]->getId()); + } + id_vector_temp_.push_back(result_source_alpha_saturate); + id_vector_temp_.push_back(block_factor_source_alpha_saturate.getId()); + spv::Id result_unclamped = + builder_->createOp(spv::OpPhi, type_float_, id_vector_temp_); + spv::Id result = FSI_FlushNaNClampAndInBlending( + result_unclamped, is_fixed_point, clamp_min_value, clamp_max_value); + builder_->createBranch(&block_not_zero_merge); + // Get the latest block for a non-zero factor after all the control flow. + spv::Block& block_not_zero_if_end = *builder_->getBuildPoint(); + + // Make the result zero if the factor is zero. 
+ builder_->setBuildPoint(&block_not_zero_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 2); + id_vector_temp_.push_back(result); + id_vector_temp_.push_back(block_not_zero_if_end.getId()); + id_vector_temp_.push_back(const_float_0_); + id_vector_temp_.push_back(block_not_zero_head.getId()); + return builder_->createOp(spv::OpPhi, type_float_, id_vector_temp_); +} + +spv::Id SpirvShaderTranslator::FSI_BlendColorOrAlphaWithUnclampedResult( + spv::Id is_fixed_point, spv::Id clamp_min_value, spv::Id clamp_max_value, + spv::Id source_color_clamped, spv::Id source_alpha_clamped, + spv::Id dest_color, spv::Id dest_alpha, spv::Id constant_color_clamped, + spv::Id constant_alpha_clamped, spv::Id equation, spv::Id source_factor, + spv::Id dest_factor) { + bool is_alpha = source_color_clamped == spv::NoResult; + assert_false(!is_alpha && (dest_color == spv::NoResult || + constant_color_clamped == spv::NoResult)); + assert_false(is_alpha && (dest_color != spv::NoResult || + constant_color_clamped != spv::NoResult)); + spv::Id value_type = is_alpha ? type_float_ : type_float3_; + + // Handle min and max blend operations, which don't involve the factors. 
+ spv::Block& block_min_max_head = *builder_->getBuildPoint(); + spv::Block& block_min_max_min = builder_->makeNewBlock(); + spv::Block& block_min_max_max = builder_->makeNewBlock(); + spv::Block& block_min_max_default = builder_->makeNewBlock(); + spv::Block& block_min_max_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_min_max_merge.getId(), + spv::SelectionControlDontFlattenMask); + { + std::unique_ptr min_max_switch_op = + std::make_unique(spv::OpSwitch); + min_max_switch_op->addIdOperand(equation); + min_max_switch_op->addIdOperand(block_min_max_default.getId()); + min_max_switch_op->addImmediateOperand(int32_t(xenos::BlendOp::kMin)); + min_max_switch_op->addIdOperand(block_min_max_min.getId()); + min_max_switch_op->addImmediateOperand(int32_t(xenos::BlendOp::kMax)); + min_max_switch_op->addIdOperand(block_min_max_max.getId()); + builder_->getBuildPoint()->addInstruction(std::move(min_max_switch_op)); + } + block_min_max_default.addPredecessor(&block_min_max_head); + block_min_max_min.addPredecessor(&block_min_max_head); + block_min_max_max.addPredecessor(&block_min_max_head); + + // Min case. + builder_->setBuildPoint(&block_min_max_min); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(is_alpha ? source_alpha_clamped + : source_color_clamped); + id_vector_temp_.push_back(is_alpha ? dest_alpha : dest_color); + spv::Id result_min = builder_->createBuiltinCall( + value_type, ext_inst_glsl_std_450_, GLSLstd450FMin, id_vector_temp_); + builder_->createBranch(&block_min_max_merge); + + // Max case. + builder_->setBuildPoint(&block_min_max_max); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(is_alpha ? source_alpha_clamped + : source_color_clamped); + id_vector_temp_.push_back(is_alpha ? 
dest_alpha : dest_color); + spv::Id result_max = builder_->createBuiltinCall( + value_type, ext_inst_glsl_std_450_, GLSLstd450FMax, id_vector_temp_); + builder_->createBranch(&block_min_max_merge); + + // Blending with factors. + spv::Id result_factors; + { + builder_->setBuildPoint(&block_min_max_default); + + spv::Id term_source, term_dest; + if (is_alpha) { + term_source = FSI_ApplyAlphaBlendFactor( + source_alpha_clamped, is_fixed_point, clamp_min_value, + clamp_max_value, source_factor, source_alpha_clamped, dest_alpha, + constant_alpha_clamped); + term_dest = FSI_ApplyAlphaBlendFactor(dest_alpha, is_fixed_point, + clamp_min_value, clamp_max_value, + dest_factor, source_alpha_clamped, + dest_alpha, constant_alpha_clamped); + } else { + term_source = FSI_ApplyColorBlendFactor( + source_color_clamped, is_fixed_point, clamp_min_value, + clamp_max_value, source_factor, source_color_clamped, + source_alpha_clamped, dest_color, dest_alpha, constant_color_clamped, + constant_alpha_clamped); + term_dest = FSI_ApplyColorBlendFactor( + dest_color, is_fixed_point, clamp_min_value, clamp_max_value, + dest_factor, source_color_clamped, source_alpha_clamped, dest_color, + dest_alpha, constant_color_clamped, constant_alpha_clamped); + } + + spv::Block& block_signs_head = *builder_->getBuildPoint(); + spv::Block& block_signs_add = builder_->makeNewBlock(); + spv::Block& block_signs_subtract = builder_->makeNewBlock(); + spv::Block& block_signs_reverse_subtract = builder_->makeNewBlock(); + spv::Block& block_signs_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_signs_merge.getId(), + spv::SelectionControlDontFlattenMask); + { + std::unique_ptr signs_switch_op = + std::make_unique(spv::OpSwitch); + signs_switch_op->addIdOperand(equation); + // Make addition the default. 
+ signs_switch_op->addIdOperand(block_signs_add.getId()); + signs_switch_op->addImmediateOperand(int32_t(xenos::BlendOp::kSubtract)); + signs_switch_op->addIdOperand(block_signs_subtract.getId()); + signs_switch_op->addImmediateOperand( + int32_t(xenos::BlendOp::kRevSubtract)); + signs_switch_op->addIdOperand(block_signs_reverse_subtract.getId()); + builder_->getBuildPoint()->addInstruction(std::move(signs_switch_op)); + } + block_signs_add.addPredecessor(&block_signs_head); + block_signs_subtract.addPredecessor(&block_signs_head); + block_signs_reverse_subtract.addPredecessor(&block_signs_head); + + // Addition case. + builder_->setBuildPoint(&block_signs_add); + spv::Id result_add = + builder_->createBinOp(spv::OpFAdd, value_type, term_source, term_dest); + builder_->addDecoration(result_add, spv::DecorationNoContraction); + builder_->createBranch(&block_signs_merge); + + // Subtraction case. + builder_->setBuildPoint(&block_signs_subtract); + spv::Id result_subtract = + builder_->createBinOp(spv::OpFSub, value_type, term_source, term_dest); + builder_->addDecoration(result_subtract, spv::DecorationNoContraction); + builder_->createBranch(&block_signs_merge); + + // Reverse subtraction case. + builder_->setBuildPoint(&block_signs_reverse_subtract); + spv::Id result_reverse_subtract = + builder_->createBinOp(spv::OpFSub, value_type, term_dest, term_source); + builder_->addDecoration(result_reverse_subtract, + spv::DecorationNoContraction); + builder_->createBranch(&block_signs_merge); + + // Selection between the signs involved in the addition. 
+ builder_->setBuildPoint(&block_signs_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 3); + id_vector_temp_.push_back(result_add); + id_vector_temp_.push_back(block_signs_add.getId()); + id_vector_temp_.push_back(result_subtract); + id_vector_temp_.push_back(block_signs_subtract.getId()); + id_vector_temp_.push_back(result_reverse_subtract); + id_vector_temp_.push_back(block_signs_reverse_subtract.getId()); + result_factors = + builder_->createOp(spv::OpPhi, value_type, id_vector_temp_); + builder_->createBranch(&block_min_max_merge); + } + // Get the latest block for blending with factors after all the control flow. + spv::Block& block_min_max_default_end = *builder_->getBuildPoint(); + + builder_->setBuildPoint(&block_min_max_merge); + // Choose out of min, max, and blending with factors. + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 3); + id_vector_temp_.push_back(result_min); + id_vector_temp_.push_back(block_min_max_min.getId()); + id_vector_temp_.push_back(result_max); + id_vector_temp_.push_back(block_min_max_max.getId()); + id_vector_temp_.push_back(result_factors); + id_vector_temp_.push_back(block_min_max_default_end.getId()); + return builder_->createOp(spv::OpPhi, value_type, id_vector_temp_); } } // namespace gpu diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 68a00cbe8..e48115894 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -67,9 +67,6 @@ const VkDescriptorPoolSize {VK_DESCRIPTOR_TYPE_SAMPLER, kLinkedTypeDescriptorPoolSetCount}, }; -// No specific reason for 32768 descriptors, just the "too much" amount from -// Direct3D 12 PIX warnings. 2x descriptors for textures because of unsigned and -// signed bindings. 
VulkanCommandProcessor::VulkanCommandProcessor( VulkanGraphicsSystem* graphics_system, kernel::KernelState* kernel_state) : CommandProcessor(graphics_system, kernel_state), @@ -106,6 +103,32 @@ void VulkanCommandProcessor::TracePlaybackWroteMemory(uint32_t base_ptr, void VulkanCommandProcessor::RestoreEdramSnapshot(const void* snapshot) {} +std::string VulkanCommandProcessor::GetWindowTitleText() const { + std::ostringstream title; + title << "Vulkan"; + if (render_target_cache_) { + switch (render_target_cache_->GetPath()) { + case RenderTargetCache::Path::kHostRenderTargets: + title << " - FBO"; + break; + case RenderTargetCache::Path::kPixelShaderInterlock: + title << " - FSI"; + break; + default: + break; + } + uint32_t draw_resolution_scale_x = + texture_cache_ ? texture_cache_->draw_resolution_scale_x() : 1; + uint32_t draw_resolution_scale_y = + texture_cache_ ? texture_cache_->draw_resolution_scale_y() : 1; + if (draw_resolution_scale_x > 1 || draw_resolution_scale_y > 1) { + title << ' ' << draw_resolution_scale_x << 'x' << draw_resolution_scale_y; + } + } + title << " - HEAVILY INCOMPLETE, early development"; + return title.str(); +} + bool VulkanCommandProcessor::SetupContext() { if (!CommandProcessor::SetupContext()) { XELOGE("Failed to initialize base command processor context"); @@ -146,7 +169,7 @@ bool VulkanCommandProcessor::SetupContext() { size_t(16384)), size_t(uniform_buffer_alignment))); - // Descriptor set layouts. + // Descriptor set layouts that don't depend on the setup of other subsystems. VkShaderStageFlags guest_shader_stages = guest_shader_vertex_stages_ | VK_SHADER_STAGE_FRAGMENT_BIT; // Empty. @@ -163,37 +186,6 @@ bool VulkanCommandProcessor::SetupContext() { XELOGE("Failed to create an empty Vulkan descriptor set layout"); return false; } - // Shared memory and EDRAM. 
- uint32_t shared_memory_binding_count_log2 = - SpirvShaderTranslator::GetSharedMemoryStorageBufferCountLog2( - provider.device_properties().limits.maxStorageBufferRange); - uint32_t shared_memory_binding_count = UINT32_C(1) - << shared_memory_binding_count_log2; - VkDescriptorSetLayoutBinding - descriptor_set_layout_bindings_shared_memory_and_edram[1]; - descriptor_set_layout_bindings_shared_memory_and_edram[0].binding = 0; - descriptor_set_layout_bindings_shared_memory_and_edram[0].descriptorType = - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - descriptor_set_layout_bindings_shared_memory_and_edram[0].descriptorCount = - shared_memory_binding_count; - descriptor_set_layout_bindings_shared_memory_and_edram[0].stageFlags = - guest_shader_stages; - descriptor_set_layout_bindings_shared_memory_and_edram[0].pImmutableSamplers = - nullptr; - // TODO(Triang3l): EDRAM storage image binding for the fragment shader - // interlocks case. - descriptor_set_layout_create_info.bindingCount = uint32_t( - xe::countof(descriptor_set_layout_bindings_shared_memory_and_edram)); - descriptor_set_layout_create_info.pBindings = - descriptor_set_layout_bindings_shared_memory_and_edram; - if (dfn.vkCreateDescriptorSetLayout( - device, &descriptor_set_layout_create_info, nullptr, - &descriptor_set_layout_shared_memory_and_edram_) != VK_SUCCESS) { - XELOGE( - "Failed to create a Vulkan descriptor set layout for the shared memory " - "and the EDRAM"); - return false; - } // Guest draw constants. 
VkDescriptorSetLayoutBinding descriptor_set_layout_bindings_constants [SpirvShaderTranslator::kConstantBufferCount] = {}; @@ -289,16 +281,70 @@ bool VulkanCommandProcessor::SetupContext() { return false; } + uint32_t shared_memory_binding_count_log2 = + SpirvShaderTranslator::GetSharedMemoryStorageBufferCountLog2( + provider.device_properties().limits.maxStorageBufferRange); + uint32_t shared_memory_binding_count = UINT32_C(1) + << shared_memory_binding_count_log2; + // Requires the transient descriptor set layouts. // TODO(Triang3l): Get the actual draw resolution scale when the texture cache // supports resolution scaling. render_target_cache_ = std::make_unique( *register_file_, *memory_, trace_writer_, 1, 1, *this); - if (!render_target_cache_->Initialize()) { + if (!render_target_cache_->Initialize(shared_memory_binding_count)) { XELOGE("Failed to initialize the render target cache"); return false; } + // Shared memory and EDRAM descriptor set layout. + bool edram_fragment_shader_interlock = + render_target_cache_->GetPath() == + RenderTargetCache::Path::kPixelShaderInterlock; + VkDescriptorSetLayoutBinding + shared_memory_and_edram_descriptor_set_layout_bindings[2]; + shared_memory_and_edram_descriptor_set_layout_bindings[0].binding = 0; + shared_memory_and_edram_descriptor_set_layout_bindings[0].descriptorType = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + shared_memory_and_edram_descriptor_set_layout_bindings[0].descriptorCount = + shared_memory_binding_count; + shared_memory_and_edram_descriptor_set_layout_bindings[0].stageFlags = + guest_shader_stages; + shared_memory_and_edram_descriptor_set_layout_bindings[0].pImmutableSamplers = + nullptr; + VkDescriptorSetLayoutCreateInfo + shared_memory_and_edram_descriptor_set_layout_create_info; + shared_memory_and_edram_descriptor_set_layout_create_info.sType = + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + shared_memory_and_edram_descriptor_set_layout_create_info.pNext = nullptr; + 
shared_memory_and_edram_descriptor_set_layout_create_info.flags = 0; + shared_memory_and_edram_descriptor_set_layout_create_info.pBindings = + shared_memory_and_edram_descriptor_set_layout_bindings; + if (edram_fragment_shader_interlock) { + // EDRAM. + shared_memory_and_edram_descriptor_set_layout_bindings[1].binding = 1; + shared_memory_and_edram_descriptor_set_layout_bindings[1].descriptorType = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + shared_memory_and_edram_descriptor_set_layout_bindings[1].descriptorCount = + 1; + shared_memory_and_edram_descriptor_set_layout_bindings[1].stageFlags = + VK_SHADER_STAGE_FRAGMENT_BIT; + shared_memory_and_edram_descriptor_set_layout_bindings[1] + .pImmutableSamplers = nullptr; + shared_memory_and_edram_descriptor_set_layout_create_info.bindingCount = 2; + } else { + shared_memory_and_edram_descriptor_set_layout_create_info.bindingCount = 1; + } + if (dfn.vkCreateDescriptorSetLayout( + device, &shared_memory_and_edram_descriptor_set_layout_create_info, + nullptr, + &descriptor_set_layout_shared_memory_and_edram_) != VK_SUCCESS) { + XELOGE( + "Failed to create a Vulkan descriptor set layout for the shared memory " + "and the EDRAM"); + return false; + } + pipeline_cache_ = std::make_unique( *this, *register_file_, *render_target_cache_, guest_shader_vertex_stages_); @@ -320,9 +366,8 @@ bool VulkanCommandProcessor::SetupContext() { // Shared memory and EDRAM common bindings. VkDescriptorPoolSize descriptor_pool_sizes[1]; descriptor_pool_sizes[0].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - descriptor_pool_sizes[0].descriptorCount = shared_memory_binding_count; - // TODO(Triang3l): EDRAM storage image binding for the fragment shader - // interlocks case. 
+ descriptor_pool_sizes[0].descriptorCount = + shared_memory_binding_count + uint32_t(edram_fragment_shader_interlock); VkDescriptorPoolCreateInfo descriptor_pool_create_info; descriptor_pool_create_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; @@ -369,20 +414,45 @@ bool VulkanCommandProcessor::SetupContext() { shared_memory_binding_range * i; shared_memory_descriptor_buffer_info.range = shared_memory_binding_range; } - VkWriteDescriptorSet write_descriptor_sets[1]; - write_descriptor_sets[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - write_descriptor_sets[0].pNext = nullptr; - write_descriptor_sets[0].dstSet = shared_memory_and_edram_descriptor_set_; - write_descriptor_sets[0].dstBinding = 0; - write_descriptor_sets[0].dstArrayElement = 0; - write_descriptor_sets[0].descriptorCount = shared_memory_binding_count; - write_descriptor_sets[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - write_descriptor_sets[0].pImageInfo = nullptr; - write_descriptor_sets[0].pBufferInfo = shared_memory_descriptor_buffers_info; - write_descriptor_sets[0].pTexelBufferView = nullptr; - // TODO(Triang3l): EDRAM storage image binding for the fragment shader - // interlocks case. 
- dfn.vkUpdateDescriptorSets(device, 1, write_descriptor_sets, 0, nullptr); + VkWriteDescriptorSet write_descriptor_sets[2]; + VkWriteDescriptorSet& write_descriptor_set_shared_memory = + write_descriptor_sets[0]; + write_descriptor_set_shared_memory.sType = + VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + write_descriptor_set_shared_memory.pNext = nullptr; + write_descriptor_set_shared_memory.dstSet = + shared_memory_and_edram_descriptor_set_; + write_descriptor_set_shared_memory.dstBinding = 0; + write_descriptor_set_shared_memory.dstArrayElement = 0; + write_descriptor_set_shared_memory.descriptorCount = + shared_memory_binding_count; + write_descriptor_set_shared_memory.descriptorType = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + write_descriptor_set_shared_memory.pImageInfo = nullptr; + write_descriptor_set_shared_memory.pBufferInfo = + shared_memory_descriptor_buffers_info; + write_descriptor_set_shared_memory.pTexelBufferView = nullptr; + VkDescriptorBufferInfo edram_descriptor_buffer_info; + if (edram_fragment_shader_interlock) { + edram_descriptor_buffer_info.buffer = render_target_cache_->edram_buffer(); + edram_descriptor_buffer_info.offset = 0; + edram_descriptor_buffer_info.range = VK_WHOLE_SIZE; + VkWriteDescriptorSet& write_descriptor_set_edram = write_descriptor_sets[1]; + write_descriptor_set_edram.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + write_descriptor_set_edram.pNext = nullptr; + write_descriptor_set_edram.dstSet = shared_memory_and_edram_descriptor_set_; + write_descriptor_set_edram.dstBinding = 1; + write_descriptor_set_edram.dstArrayElement = 0; + write_descriptor_set_edram.descriptorCount = 1; + write_descriptor_set_edram.descriptorType = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + write_descriptor_set_edram.pImageInfo = nullptr; + write_descriptor_set_edram.pBufferInfo = &edram_descriptor_buffer_info; + write_descriptor_set_edram.pTexelBufferView = nullptr; + } + dfn.vkUpdateDescriptorSets(device, + 1 + uint32_t(edram_fragment_shader_interlock), 
+ write_descriptor_sets, 0, nullptr); // Swap objects. @@ -1041,6 +1111,9 @@ void VulkanCommandProcessor::ShutdownContext() { } descriptor_set_layouts_textures_.clear(); + ui::vulkan::util::DestroyAndNullHandle( + dfn.vkDestroyDescriptorSetLayout, device, + descriptor_set_layout_shared_memory_and_edram_); for (VkDescriptorSetLayout& descriptor_set_layout_single_transient : descriptor_set_layouts_single_transient_) { ui::vulkan::util::DestroyAndNullHandle( @@ -1050,9 +1123,6 @@ void VulkanCommandProcessor::ShutdownContext() { ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorSetLayout, device, descriptor_set_layout_constants_); - ui::vulkan::util::DestroyAndNullHandle( - dfn.vkDestroyDescriptorSetLayout, device, - descriptor_set_layout_shared_memory_and_edram_); ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorSetLayout, device, descriptor_set_layout_empty_); @@ -2401,7 +2471,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // Update system constants before uploading them. UpdateSystemConstantValues(primitive_polygonal, primitive_processing_result, shader_32bit_index_dma, viewport_info, - used_texture_mask); + used_texture_mask, normalized_depth_control, + normalized_color_mask); // Update uniform buffers and descriptor sets after binding the pipeline with // the new layout. @@ -2461,6 +2532,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // After all commands that may dispatch, copy or insert barriers, submit the // barriers (may end the render pass), and (re)enter the render pass before // drawing. + // TODO(Triang3l): Handle disabled variableMultisampleRate by restarting the + // render pass with no attachments if the sample count becomes different. 
SubmitBarriersAndEnterRenderTargetCacheRenderPass( render_target_cache_->last_update_render_pass(), render_target_cache_->last_update_framebuffer()); @@ -3180,175 +3253,180 @@ void VulkanCommandProcessor::UpdateDynamicState( scissor_rect.extent.height = scissor.extent[1]; SetScissor(scissor_rect); - // Depth bias. - // TODO(Triang3l): Disable the depth bias for the fragment shader interlock RB - // implementation. - float depth_bias_constant_factor, depth_bias_slope_factor; - draw_util::GetPreferredFacePolygonOffset(regs, primitive_polygonal, - depth_bias_slope_factor, - depth_bias_constant_factor); - depth_bias_constant_factor *= - regs.Get().depth_format == - xenos::DepthRenderTargetFormat::kD24S8 - ? draw_util::kD3D10PolygonOffsetFactorUnorm24 - : draw_util::kD3D10PolygonOffsetFactorFloat24; - // With non-square resolution scaling, make sure the worst-case impact is - // reverted (slope only along the scaled axis), thus max. More bias is better - // than less bias, because less bias means Z fighting with the background is - // more likely. - depth_bias_slope_factor *= - xenos::kPolygonOffsetScaleSubpixelUnit * - float(std::max(render_target_cache_->draw_resolution_scale_x(), - render_target_cache_->draw_resolution_scale_y())); - // std::memcmp instead of != so in case of NaN, every draw won't be - // invalidating it. 
- dynamic_depth_bias_update_needed_ |= - std::memcmp(&dynamic_depth_bias_constant_factor_, - &depth_bias_constant_factor, sizeof(float)) != 0; - dynamic_depth_bias_update_needed_ |= - std::memcmp(&dynamic_depth_bias_slope_factor_, &depth_bias_slope_factor, - sizeof(float)) != 0; - if (dynamic_depth_bias_update_needed_) { - dynamic_depth_bias_constant_factor_ = depth_bias_constant_factor; - dynamic_depth_bias_slope_factor_ = depth_bias_slope_factor; - deferred_command_buffer_.CmdVkSetDepthBias( - dynamic_depth_bias_constant_factor_, 0.0f, - dynamic_depth_bias_slope_factor_); - dynamic_depth_bias_update_needed_ = false; - } + if (render_target_cache_->GetPath() == + RenderTargetCache::Path::kHostRenderTargets) { + // Depth bias. + float depth_bias_constant_factor, depth_bias_slope_factor; + draw_util::GetPreferredFacePolygonOffset(regs, primitive_polygonal, + depth_bias_slope_factor, + depth_bias_constant_factor); + depth_bias_constant_factor *= + regs.Get().depth_format == + xenos::DepthRenderTargetFormat::kD24S8 + ? draw_util::kD3D10PolygonOffsetFactorUnorm24 + : draw_util::kD3D10PolygonOffsetFactorFloat24; + // With non-square resolution scaling, make sure the worst-case impact is + // reverted (slope only along the scaled axis), thus max. More bias is + // better than less bias, because less bias means Z fighting with the + // background is more likely. + depth_bias_slope_factor *= + xenos::kPolygonOffsetScaleSubpixelUnit * + float(std::max(render_target_cache_->draw_resolution_scale_x(), + render_target_cache_->draw_resolution_scale_y())); + // std::memcmp instead of != so in case of NaN, every draw won't be + // invalidating it. 
+ dynamic_depth_bias_update_needed_ |= + std::memcmp(&dynamic_depth_bias_constant_factor_, + &depth_bias_constant_factor, sizeof(float)) != 0; + dynamic_depth_bias_update_needed_ |= + std::memcmp(&dynamic_depth_bias_slope_factor_, &depth_bias_slope_factor, + sizeof(float)) != 0; + if (dynamic_depth_bias_update_needed_) { + dynamic_depth_bias_constant_factor_ = depth_bias_constant_factor; + dynamic_depth_bias_slope_factor_ = depth_bias_slope_factor; + deferred_command_buffer_.CmdVkSetDepthBias( + dynamic_depth_bias_constant_factor_, 0.0f, + dynamic_depth_bias_slope_factor_); + dynamic_depth_bias_update_needed_ = false; + } - // Blend constants. - float blend_constants[] = { - regs[XE_GPU_REG_RB_BLEND_RED].f32, - regs[XE_GPU_REG_RB_BLEND_GREEN].f32, - regs[XE_GPU_REG_RB_BLEND_BLUE].f32, - regs[XE_GPU_REG_RB_BLEND_ALPHA].f32, - }; - dynamic_blend_constants_update_needed_ |= - std::memcmp(dynamic_blend_constants_, blend_constants, - sizeof(float) * 4) != 0; - if (dynamic_blend_constants_update_needed_) { - std::memcpy(dynamic_blend_constants_, blend_constants, sizeof(float) * 4); - deferred_command_buffer_.CmdVkSetBlendConstants(dynamic_blend_constants_); - dynamic_blend_constants_update_needed_ = false; - } + // Blend constants. + float blend_constants[] = { + regs[XE_GPU_REG_RB_BLEND_RED].f32, + regs[XE_GPU_REG_RB_BLEND_GREEN].f32, + regs[XE_GPU_REG_RB_BLEND_BLUE].f32, + regs[XE_GPU_REG_RB_BLEND_ALPHA].f32, + }; + dynamic_blend_constants_update_needed_ |= + std::memcmp(dynamic_blend_constants_, blend_constants, + sizeof(float) * 4) != 0; + if (dynamic_blend_constants_update_needed_) { + std::memcpy(dynamic_blend_constants_, blend_constants, sizeof(float) * 4); + deferred_command_buffer_.CmdVkSetBlendConstants(dynamic_blend_constants_); + dynamic_blend_constants_update_needed_ = false; + } - // Stencil masks and references. 
- // Due to pretty complex conditions involving registers not directly related - // to stencil (primitive type, culling), changing the values only when stencil - // is actually needed. However, due to the way dynamic state needs to be set - // in Vulkan, which doesn't take into account whether the state actually has - // effect on drawing, and because the masks and the references are always - // dynamic in Xenia guest pipelines, they must be set in the command buffer - // before any draw. - if (normalized_depth_control.stencil_enable) { - Register stencil_ref_mask_front_reg, stencil_ref_mask_back_reg; - if (primitive_polygonal && normalized_depth_control.backface_enable) { - const ui::vulkan::VulkanProvider& provider = GetVulkanProvider(); - const VkPhysicalDevicePortabilitySubsetFeaturesKHR* - device_portability_subset_features = - provider.device_portability_subset_features(); - if (!device_portability_subset_features || - device_portability_subset_features->separateStencilMaskRef) { - // Choose the back face values only if drawing only back faces. - stencil_ref_mask_front_reg = - regs.Get().cull_front - ? XE_GPU_REG_RB_STENCILREFMASK_BF - : XE_GPU_REG_RB_STENCILREFMASK; - stencil_ref_mask_back_reg = stencil_ref_mask_front_reg; + // Stencil masks and references. + // Due to pretty complex conditions involving registers not directly related + // to stencil (primitive type, culling), changing the values only when + // stencil is actually needed. However, due to the way dynamic state needs + // to be set in Vulkan, which doesn't take into account whether the state + // actually has effect on drawing, and because the masks and the references + // are always dynamic in Xenia guest pipelines, they must be set in the + // command buffer before any draw. 
+ if (normalized_depth_control.stencil_enable) { + Register stencil_ref_mask_front_reg, stencil_ref_mask_back_reg; + if (primitive_polygonal && normalized_depth_control.backface_enable) { + const ui::vulkan::VulkanProvider& provider = GetVulkanProvider(); + const VkPhysicalDevicePortabilitySubsetFeaturesKHR* + device_portability_subset_features = + provider.device_portability_subset_features(); + if (!device_portability_subset_features || + device_portability_subset_features->separateStencilMaskRef) { + // Choose the back face values only if drawing only back faces. + stencil_ref_mask_front_reg = + regs.Get().cull_front + ? XE_GPU_REG_RB_STENCILREFMASK_BF + : XE_GPU_REG_RB_STENCILREFMASK; + stencil_ref_mask_back_reg = stencil_ref_mask_front_reg; + } else { + stencil_ref_mask_front_reg = XE_GPU_REG_RB_STENCILREFMASK; + stencil_ref_mask_back_reg = XE_GPU_REG_RB_STENCILREFMASK_BF; + } } else { stencil_ref_mask_front_reg = XE_GPU_REG_RB_STENCILREFMASK; - stencil_ref_mask_back_reg = XE_GPU_REG_RB_STENCILREFMASK_BF; + stencil_ref_mask_back_reg = XE_GPU_REG_RB_STENCILREFMASK; } - } else { - stencil_ref_mask_front_reg = XE_GPU_REG_RB_STENCILREFMASK; - stencil_ref_mask_back_reg = XE_GPU_REG_RB_STENCILREFMASK; + auto stencil_ref_mask_front = + regs.Get(stencil_ref_mask_front_reg); + auto stencil_ref_mask_back = + regs.Get(stencil_ref_mask_back_reg); + // Compare mask. + dynamic_stencil_compare_mask_front_update_needed_ |= + dynamic_stencil_compare_mask_front_ != + stencil_ref_mask_front.stencilmask; + dynamic_stencil_compare_mask_front_ = stencil_ref_mask_front.stencilmask; + dynamic_stencil_compare_mask_back_update_needed_ |= + dynamic_stencil_compare_mask_back_ != + stencil_ref_mask_back.stencilmask; + dynamic_stencil_compare_mask_back_ = stencil_ref_mask_back.stencilmask; + // Write mask. 
+ dynamic_stencil_write_mask_front_update_needed_ |= + dynamic_stencil_write_mask_front_ != + stencil_ref_mask_front.stencilwritemask; + dynamic_stencil_write_mask_front_ = + stencil_ref_mask_front.stencilwritemask; + dynamic_stencil_write_mask_back_update_needed_ |= + dynamic_stencil_write_mask_back_ != + stencil_ref_mask_back.stencilwritemask; + dynamic_stencil_write_mask_back_ = stencil_ref_mask_back.stencilwritemask; + // Reference. + dynamic_stencil_reference_front_update_needed_ |= + dynamic_stencil_reference_front_ != stencil_ref_mask_front.stencilref; + dynamic_stencil_reference_front_ = stencil_ref_mask_front.stencilref; + dynamic_stencil_reference_back_update_needed_ |= + dynamic_stencil_reference_back_ != stencil_ref_mask_back.stencilref; + dynamic_stencil_reference_back_ = stencil_ref_mask_back.stencilref; } - auto stencil_ref_mask_front = - regs.Get(stencil_ref_mask_front_reg); - auto stencil_ref_mask_back = - regs.Get(stencil_ref_mask_back_reg); - // Compare mask. - dynamic_stencil_compare_mask_front_update_needed_ |= - dynamic_stencil_compare_mask_front_ != - stencil_ref_mask_front.stencilmask; - dynamic_stencil_compare_mask_front_ = stencil_ref_mask_front.stencilmask; - dynamic_stencil_compare_mask_back_update_needed_ |= - dynamic_stencil_compare_mask_back_ != stencil_ref_mask_back.stencilmask; - dynamic_stencil_compare_mask_back_ = stencil_ref_mask_back.stencilmask; - // Write mask. - dynamic_stencil_write_mask_front_update_needed_ |= - dynamic_stencil_write_mask_front_ != - stencil_ref_mask_front.stencilwritemask; - dynamic_stencil_write_mask_front_ = stencil_ref_mask_front.stencilwritemask; - dynamic_stencil_write_mask_back_update_needed_ |= - dynamic_stencil_write_mask_back_ != - stencil_ref_mask_back.stencilwritemask; - dynamic_stencil_write_mask_back_ = stencil_ref_mask_back.stencilwritemask; - // Reference. 
- dynamic_stencil_reference_front_update_needed_ |= - dynamic_stencil_reference_front_ != stencil_ref_mask_front.stencilref; - dynamic_stencil_reference_front_ = stencil_ref_mask_front.stencilref; - dynamic_stencil_reference_back_update_needed_ |= - dynamic_stencil_reference_back_ != stencil_ref_mask_back.stencilref; - dynamic_stencil_reference_back_ = stencil_ref_mask_back.stencilref; - } - // Using VK_STENCIL_FACE_FRONT_AND_BACK for higher safety when running on the - // Vulkan portability subset without separateStencilMaskRef. - if (dynamic_stencil_compare_mask_front_update_needed_ || - dynamic_stencil_compare_mask_back_update_needed_) { - if (dynamic_stencil_compare_mask_front_ == - dynamic_stencil_compare_mask_back_) { - deferred_command_buffer_.CmdVkSetStencilCompareMask( - VK_STENCIL_FACE_FRONT_AND_BACK, dynamic_stencil_compare_mask_front_); - } else { - if (dynamic_stencil_compare_mask_front_update_needed_) { + // Using VK_STENCIL_FACE_FRONT_AND_BACK for higher safety when running on + // the Vulkan portability subset without separateStencilMaskRef. 
+ if (dynamic_stencil_compare_mask_front_update_needed_ || + dynamic_stencil_compare_mask_back_update_needed_) { + if (dynamic_stencil_compare_mask_front_ == + dynamic_stencil_compare_mask_back_) { deferred_command_buffer_.CmdVkSetStencilCompareMask( - VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_compare_mask_front_); - } - if (dynamic_stencil_compare_mask_back_update_needed_) { - deferred_command_buffer_.CmdVkSetStencilCompareMask( - VK_STENCIL_FACE_BACK_BIT, dynamic_stencil_compare_mask_back_); + VK_STENCIL_FACE_FRONT_AND_BACK, + dynamic_stencil_compare_mask_front_); + } else { + if (dynamic_stencil_compare_mask_front_update_needed_) { + deferred_command_buffer_.CmdVkSetStencilCompareMask( + VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_compare_mask_front_); + } + if (dynamic_stencil_compare_mask_back_update_needed_) { + deferred_command_buffer_.CmdVkSetStencilCompareMask( + VK_STENCIL_FACE_BACK_BIT, dynamic_stencil_compare_mask_back_); + } } + dynamic_stencil_compare_mask_front_update_needed_ = false; + dynamic_stencil_compare_mask_back_update_needed_ = false; } - dynamic_stencil_compare_mask_front_update_needed_ = false; - dynamic_stencil_compare_mask_back_update_needed_ = false; - } - if (dynamic_stencil_write_mask_front_update_needed_ || - dynamic_stencil_write_mask_back_update_needed_) { - if (dynamic_stencil_write_mask_front_ == dynamic_stencil_write_mask_back_) { - deferred_command_buffer_.CmdVkSetStencilWriteMask( - VK_STENCIL_FACE_FRONT_AND_BACK, dynamic_stencil_write_mask_front_); - } else { - if (dynamic_stencil_write_mask_front_update_needed_) { + if (dynamic_stencil_write_mask_front_update_needed_ || + dynamic_stencil_write_mask_back_update_needed_) { + if (dynamic_stencil_write_mask_front_ == + dynamic_stencil_write_mask_back_) { deferred_command_buffer_.CmdVkSetStencilWriteMask( - VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_write_mask_front_); - } - if (dynamic_stencil_write_mask_back_update_needed_) { - deferred_command_buffer_.CmdVkSetStencilWriteMask( 
- VK_STENCIL_FACE_BACK_BIT, dynamic_stencil_write_mask_back_); + VK_STENCIL_FACE_FRONT_AND_BACK, dynamic_stencil_write_mask_front_); + } else { + if (dynamic_stencil_write_mask_front_update_needed_) { + deferred_command_buffer_.CmdVkSetStencilWriteMask( + VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_write_mask_front_); + } + if (dynamic_stencil_write_mask_back_update_needed_) { + deferred_command_buffer_.CmdVkSetStencilWriteMask( + VK_STENCIL_FACE_BACK_BIT, dynamic_stencil_write_mask_back_); + } } + dynamic_stencil_write_mask_front_update_needed_ = false; + dynamic_stencil_write_mask_back_update_needed_ = false; } - dynamic_stencil_write_mask_front_update_needed_ = false; - dynamic_stencil_write_mask_back_update_needed_ = false; - } - if (dynamic_stencil_reference_front_update_needed_ || - dynamic_stencil_reference_back_update_needed_) { - if (dynamic_stencil_reference_front_ == dynamic_stencil_reference_back_) { - deferred_command_buffer_.CmdVkSetStencilReference( - VK_STENCIL_FACE_FRONT_AND_BACK, dynamic_stencil_reference_front_); - } else { - if (dynamic_stencil_reference_front_update_needed_) { + if (dynamic_stencil_reference_front_update_needed_ || + dynamic_stencil_reference_back_update_needed_) { + if (dynamic_stencil_reference_front_ == dynamic_stencil_reference_back_) { deferred_command_buffer_.CmdVkSetStencilReference( - VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_reference_front_); - } - if (dynamic_stencil_reference_back_update_needed_) { - deferred_command_buffer_.CmdVkSetStencilReference( - VK_STENCIL_FACE_BACK_BIT, dynamic_stencil_reference_back_); + VK_STENCIL_FACE_FRONT_AND_BACK, dynamic_stencil_reference_front_); + } else { + if (dynamic_stencil_reference_front_update_needed_) { + deferred_command_buffer_.CmdVkSetStencilReference( + VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_reference_front_); + } + if (dynamic_stencil_reference_back_update_needed_) { + deferred_command_buffer_.CmdVkSetStencilReference( + VK_STENCIL_FACE_BACK_BIT, 
dynamic_stencil_reference_back_); + } } + dynamic_stencil_reference_front_update_needed_ = false; + dynamic_stencil_reference_back_update_needed_ = false; } - dynamic_stencil_reference_front_update_needed_ = false; - dynamic_stencil_reference_back_update_needed_ = false; } // TODO(Triang3l): VK_EXT_extended_dynamic_state and @@ -3359,23 +3437,67 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( bool primitive_polygonal, const PrimitiveProcessor::ProcessingResult& primitive_processing_result, bool shader_32bit_index_dma, const draw_util::ViewportInfo& viewport_info, - uint32_t used_texture_mask) { + uint32_t used_texture_mask, reg::RB_DEPTHCONTROL normalized_depth_control, + uint32_t normalized_color_mask) { #if XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES const RegisterFile& regs = *register_file_; auto pa_cl_vte_cntl = regs.Get(); + auto pa_su_sc_mode_cntl = regs.Get(); float rb_alpha_ref = regs[XE_GPU_REG_RB_ALPHA_REF].f32; auto rb_colorcontrol = regs.Get(); + auto rb_depth_info = regs.Get(); + auto rb_stencilrefmask = regs.Get(); + auto rb_stencilrefmask_bf = + regs.Get(XE_GPU_REG_RB_STENCILREFMASK_BF); + auto rb_surface_info = regs.Get(); auto vgt_draw_initiator = regs.Get(); int32_t vgt_indx_offset = int32_t(regs[XE_GPU_REG_VGT_INDX_OFFSET].u32); - // Get the color info register values for each render target. + bool edram_fragment_shader_interlock = + render_target_cache_->GetPath() == + RenderTargetCache::Path::kPixelShaderInterlock; + uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x(); + uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y(); + + // Get the color info register values for each render target. Also, for FSI, + // exclude components that don't exist in the format from the write mask. 
+ // Don't exclude fully overlapping render targets, however - two render + // targets with the same base address are used in the lighting pass of + // 4D5307E6, for example, with the needed one picked with dynamic control + // flow. reg::RB_COLOR_INFO color_infos[xenos::kMaxColorRenderTargets]; + float rt_clamp[4][4]; + // Two UINT32_MAX if no components actually existing in the RT are written. + uint32_t rt_keep_masks[4][2]; for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) { - color_infos[i] = regs.Get( + auto color_info = regs.Get( reg::RB_COLOR_INFO::rt_register_indices[i]); + color_infos[i] = color_info; + if (edram_fragment_shader_interlock) { + RenderTargetCache::GetPSIColorFormatInfo( + color_info.color_format, (normalized_color_mask >> (i * 4)) & 0b1111, + rt_clamp[i][0], rt_clamp[i][1], rt_clamp[i][2], rt_clamp[i][3], + rt_keep_masks[i][0], rt_keep_masks[i][1]); + } + } + + // Disable depth and stencil if it aliases a color render target (for + // instance, during the XBLA logo in 58410954, though depth writing is already + // disabled there). + bool depth_stencil_enabled = normalized_depth_control.stencil_enable || + normalized_depth_control.z_enable; + if (edram_fragment_shader_interlock && depth_stencil_enabled) { + for (uint32_t i = 0; i < 4; ++i) { + if (rb_depth_info.depth_base == color_infos[i].color_base && + (rt_keep_masks[i][0] != UINT32_MAX || + rt_keep_masks[i][1] != UINT32_MAX)) { + depth_stencil_enabled = false; + break; + } + } } bool dirty = false; @@ -3419,6 +3541,13 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( if (draw_util::IsPrimitiveLine(regs)) { flags |= SpirvShaderTranslator::kSysFlag_PrimitiveLine; } + // MSAA sample count. + flags |= uint32_t(rb_surface_info.msaa_samples) + << SpirvShaderTranslator::kSysFlag_MsaaSamples_Shift; + // Depth format. + if (rb_depth_info.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) { + flags |= SpirvShaderTranslator::kSysFlag_DepthFloat24; + } // Alpha test. 
xenos::CompareFunction alpha_test_function = rb_colorcontrol.alpha_test_enable ? rb_colorcontrol.alpha_func @@ -3433,6 +3562,30 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( flags |= SpirvShaderTranslator::kSysFlag_ConvertColor0ToGamma << i; } } + if (edram_fragment_shader_interlock && depth_stencil_enabled) { + flags |= SpirvShaderTranslator::kSysFlag_FSIDepthStencil; + if (normalized_depth_control.z_enable) { + flags |= uint32_t(normalized_depth_control.zfunc) + << SpirvShaderTranslator::kSysFlag_FSIDepthPassIfLess_Shift; + if (normalized_depth_control.z_write_enable) { + flags |= SpirvShaderTranslator::kSysFlag_FSIDepthWrite; + } + } else { + // In case stencil is used without depth testing - always pass, and + // don't modify the stored depth. + flags |= SpirvShaderTranslator::kSysFlag_FSIDepthPassIfLess | + SpirvShaderTranslator::kSysFlag_FSIDepthPassIfEqual | + SpirvShaderTranslator::kSysFlag_FSIDepthPassIfGreater; + } + if (normalized_depth_control.stencil_enable) { + flags |= SpirvShaderTranslator::kSysFlag_FSIStencilTest; + } + // Hint - if not applicable to the shader, will not have effect. + if (alpha_test_function == xenos::CompareFunction::kAlways && + !rb_colorcontrol.alpha_to_mask_enable) { + flags |= SpirvShaderTranslator::kSysFlag_FSIDepthStencilEarlyWrite; + } + } dirty |= system_constants_.flags != flags; system_constants_.flags = flags; @@ -3492,10 +3645,10 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( // to radius conversion to avoid multiplying the per-vertex diameter by an // additional constant in the shader. 
float point_screen_diameter_to_ndc_radius_x = - (/* 0.5f * 2.0f * */ float(texture_cache_->draw_resolution_scale_x())) / + (/* 0.5f * 2.0f * */ float(draw_resolution_scale_x)) / std::max(viewport_info.xy_extent[0], uint32_t(1)); float point_screen_diameter_to_ndc_radius_y = - (/* 0.5f * 2.0f * */ float(texture_cache_->draw_resolution_scale_y())) / + (/* 0.5f * 2.0f * */ float(draw_resolution_scale_y)) / std::max(viewport_info.xy_extent[1], uint32_t(1)); dirty |= system_constants_.point_screen_diameter_to_ndc_radius[0] != point_screen_diameter_to_ndc_radius_x; @@ -3560,7 +3713,25 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( dirty |= system_constants_.alpha_test_reference != rb_alpha_ref; system_constants_.alpha_test_reference = rb_alpha_ref; - // Color exponent bias. + uint32_t edram_tile_dwords_scaled = + xenos::kEdramTileWidthSamples * xenos::kEdramTileHeightSamples * + (draw_resolution_scale_x * draw_resolution_scale_y); + + // EDRAM pitch for FSI render target writing. + if (edram_fragment_shader_interlock) { + // Align, then multiply by 32bpp tile size in dwords. + uint32_t edram_32bpp_tile_pitch_dwords_scaled = + ((rb_surface_info.surface_pitch * + (rb_surface_info.msaa_samples >= xenos::MsaaSamples::k4X ? 2 : 1)) + + (xenos::kEdramTileWidthSamples - 1)) / + xenos::kEdramTileWidthSamples * edram_tile_dwords_scaled; + dirty |= system_constants_.edram_32bpp_tile_pitch_dwords_scaled != + edram_32bpp_tile_pitch_dwords_scaled; + system_constants_.edram_32bpp_tile_pitch_dwords_scaled = + edram_32bpp_tile_pitch_dwords_scaled; + } + + // Color exponent bias and FSI render target writing. for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) { reg::RB_COLOR_INFO color_info = color_infos[i]; // Exponent bias is in bits 20:25 of RB_COLOR_INFO. 
@@ -3581,6 +3752,148 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( UINT32_C(0x3F800000) + (color_exp_bias << 23); dirty |= system_constants_.color_exp_bias[i] != color_exp_bias_scale; system_constants_.color_exp_bias[i] = color_exp_bias_scale; + if (edram_fragment_shader_interlock) { + dirty |= + system_constants_.edram_rt_keep_mask[i][0] != rt_keep_masks[i][0]; + system_constants_.edram_rt_keep_mask[i][0] = rt_keep_masks[i][0]; + dirty |= + system_constants_.edram_rt_keep_mask[i][1] != rt_keep_masks[i][1]; + system_constants_.edram_rt_keep_mask[i][1] = rt_keep_masks[i][1]; + if (rt_keep_masks[i][0] != UINT32_MAX || + rt_keep_masks[i][1] != UINT32_MAX) { + uint32_t rt_base_dwords_scaled = + color_info.color_base * edram_tile_dwords_scaled; + dirty |= system_constants_.edram_rt_base_dwords_scaled[i] != + rt_base_dwords_scaled; + system_constants_.edram_rt_base_dwords_scaled[i] = + rt_base_dwords_scaled; + uint32_t format_flags = + RenderTargetCache::AddPSIColorFormatFlags(color_info.color_format); + dirty |= system_constants_.edram_rt_format_flags[i] != format_flags; + system_constants_.edram_rt_format_flags[i] = format_flags; + uint32_t blend_factors_ops = + regs[reg::RB_BLENDCONTROL::rt_register_indices[i]].u32 & 0x1FFF1FFF; + dirty |= system_constants_.edram_rt_blend_factors_ops[i] != + blend_factors_ops; + system_constants_.edram_rt_blend_factors_ops[i] = blend_factors_ops; + // Can't do float comparisons here because NaNs would result in always + // setting the dirty flag. 
+ dirty |= std::memcmp(system_constants_.edram_rt_clamp[i], rt_clamp[i], + 4 * sizeof(float)) != 0; + std::memcpy(system_constants_.edram_rt_clamp[i], rt_clamp[i], + 4 * sizeof(float)); + } + } + } + + if (edram_fragment_shader_interlock) { + uint32_t depth_base_dwords_scaled = + rb_depth_info.depth_base * edram_tile_dwords_scaled; + dirty |= system_constants_.edram_depth_base_dwords_scaled != + depth_base_dwords_scaled; + system_constants_.edram_depth_base_dwords_scaled = depth_base_dwords_scaled; + + // For non-polygons, front polygon offset is used, and it's enabled if + // POLY_OFFSET_PARA_ENABLED is set, for polygons, separate front and back + // are used. + float poly_offset_front_scale = 0.0f, poly_offset_front_offset = 0.0f; + float poly_offset_back_scale = 0.0f, poly_offset_back_offset = 0.0f; + if (primitive_polygonal) { + if (pa_su_sc_mode_cntl.poly_offset_front_enable) { + poly_offset_front_scale = + regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE].f32; + poly_offset_front_offset = + regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET].f32; + } + if (pa_su_sc_mode_cntl.poly_offset_back_enable) { + poly_offset_back_scale = + regs[XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_SCALE].f32; + poly_offset_back_offset = + regs[XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_OFFSET].f32; + } + } else { + if (pa_su_sc_mode_cntl.poly_offset_para_enable) { + poly_offset_front_scale = + regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE].f32; + poly_offset_front_offset = + regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET].f32; + poly_offset_back_scale = poly_offset_front_scale; + poly_offset_back_offset = poly_offset_front_offset; + } + } + // With non-square resolution scaling, make sure the worst-case impact is + // reverted (slope only along the scaled axis), thus max. More bias is + // better than less bias, because less bias means Z fighting with the + // background is more likely. 
+ float poly_offset_scale_factor = + xenos::kPolygonOffsetScaleSubpixelUnit * + std::max(draw_resolution_scale_x, draw_resolution_scale_y); + poly_offset_front_scale *= poly_offset_scale_factor; + poly_offset_back_scale *= poly_offset_scale_factor; + dirty |= system_constants_.edram_poly_offset_front_scale != + poly_offset_front_scale; + system_constants_.edram_poly_offset_front_scale = poly_offset_front_scale; + dirty |= system_constants_.edram_poly_offset_front_offset != + poly_offset_front_offset; + system_constants_.edram_poly_offset_front_offset = poly_offset_front_offset; + dirty |= system_constants_.edram_poly_offset_back_scale != + poly_offset_back_scale; + system_constants_.edram_poly_offset_back_scale = poly_offset_back_scale; + dirty |= system_constants_.edram_poly_offset_back_offset != + poly_offset_back_offset; + system_constants_.edram_poly_offset_back_offset = poly_offset_back_offset; + + if (depth_stencil_enabled && normalized_depth_control.stencil_enable) { + uint32_t stencil_front_reference_masks = + rb_stencilrefmask.value & 0xFFFFFF; + dirty |= system_constants_.edram_stencil_front_reference_masks != + stencil_front_reference_masks; + system_constants_.edram_stencil_front_reference_masks = + stencil_front_reference_masks; + uint32_t stencil_func_ops = + (normalized_depth_control.value >> 8) & ((1 << 12) - 1); + dirty |= + system_constants_.edram_stencil_front_func_ops != stencil_func_ops; + system_constants_.edram_stencil_front_func_ops = stencil_func_ops; + + if (primitive_polygonal && normalized_depth_control.backface_enable) { + uint32_t stencil_back_reference_masks = + rb_stencilrefmask_bf.value & 0xFFFFFF; + dirty |= system_constants_.edram_stencil_back_reference_masks != + stencil_back_reference_masks; + system_constants_.edram_stencil_back_reference_masks = + stencil_back_reference_masks; + uint32_t stencil_func_ops_bf = + (normalized_depth_control.value >> 20) & ((1 << 12) - 1); + dirty |= system_constants_.edram_stencil_back_func_ops != 
+ stencil_func_ops_bf; + system_constants_.edram_stencil_back_func_ops = stencil_func_ops_bf; + } else { + dirty |= std::memcmp(system_constants_.edram_stencil_back, + system_constants_.edram_stencil_front, + 2 * sizeof(uint32_t)) != 0; + std::memcpy(system_constants_.edram_stencil_back, + system_constants_.edram_stencil_front, + 2 * sizeof(uint32_t)); + } + } + + dirty |= system_constants_.edram_blend_constant[0] != + regs[XE_GPU_REG_RB_BLEND_RED].f32; + system_constants_.edram_blend_constant[0] = + regs[XE_GPU_REG_RB_BLEND_RED].f32; + dirty |= system_constants_.edram_blend_constant[1] != + regs[XE_GPU_REG_RB_BLEND_GREEN].f32; + system_constants_.edram_blend_constant[1] = + regs[XE_GPU_REG_RB_BLEND_GREEN].f32; + dirty |= system_constants_.edram_blend_constant[2] != + regs[XE_GPU_REG_RB_BLEND_BLUE].f32; + system_constants_.edram_blend_constant[2] = + regs[XE_GPU_REG_RB_BLEND_BLUE].f32; + dirty |= system_constants_.edram_blend_constant[3] != + regs[XE_GPU_REG_RB_BLEND_ALPHA].f32; + system_constants_.edram_blend_constant[3] = + regs[XE_GPU_REG_RB_BLEND_ALPHA].f32; } if (dirty) { diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 7920981fb..8e1df02ef 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -256,6 +257,9 @@ class VulkanCommandProcessor : public CommandProcessor { void SetViewport(const VkViewport& viewport); void SetScissor(const VkRect2D& scissor); + // Returns the text to display in the GPU backend name in the window title. 
+ std::string GetWindowTitleText() const; + protected: bool SetupContext() override; void ShutdownContext() override; @@ -437,7 +441,8 @@ class VulkanCommandProcessor : public CommandProcessor { bool primitive_polygonal, const PrimitiveProcessor::ProcessingResult& primitive_processing_result, bool shader_32bit_index_dma, const draw_util::ViewportInfo& viewport_info, - uint32_t used_texture_mask); + uint32_t used_texture_mask, reg::RB_DEPTHCONTROL normalized_depth_control, + uint32_t normalized_color_mask); bool UpdateBindings(const VulkanShader* vertex_shader, const VulkanShader* pixel_shader); // Allocates a descriptor set and fills one or two VkWriteDescriptorSet @@ -514,12 +519,12 @@ class VulkanCommandProcessor : public CommandProcessor { // Descriptor set layouts used by different shaders. VkDescriptorSetLayout descriptor_set_layout_empty_ = VK_NULL_HANDLE; - VkDescriptorSetLayout descriptor_set_layout_shared_memory_and_edram_ = - VK_NULL_HANDLE; VkDescriptorSetLayout descriptor_set_layout_constants_ = VK_NULL_HANDLE; std::array descriptor_set_layouts_single_transient_{}; + VkDescriptorSetLayout descriptor_set_layout_shared_memory_and_edram_ = + VK_NULL_HANDLE; // Descriptor set layouts are referenced by pipeline_layouts_. 
std::unordered_map(command_processor()); + if (vulkan_command_processor != nullptr) { + return vulkan_command_processor->GetWindowTitleText(); + } + return "Vulkan - HEAVILY INCOMPLETE, early development"; +} + X_STATUS VulkanGraphicsSystem::Setup(cpu::Processor* processor, kernel::KernelState* kernel_state, ui::WindowedAppContext* app_context, diff --git a/src/xenia/gpu/vulkan/vulkan_graphics_system.h b/src/xenia/gpu/vulkan/vulkan_graphics_system.h index ae81e144c..e06892aa1 100644 --- a/src/xenia/gpu/vulkan/vulkan_graphics_system.h +++ b/src/xenia/gpu/vulkan/vulkan_graphics_system.h @@ -26,9 +26,7 @@ class VulkanGraphicsSystem : public GraphicsSystem { static bool IsAvailable() { return true; } - std::string name() const override { - return "Vulkan - HEAVILY INCOMPLETE, early development"; - } + std::string name() const override; X_STATUS Setup(cpu::Processor* processor, kernel::KernelState* kernel_state, ui::WindowedAppContext* app_context, diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc index aff800c1a..f1af57a23 100644 --- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include "third_party/fmt/include/fmt/format.h" #include "third_party/glslang/SPIRV/SpvBuilder.h" @@ -53,8 +54,32 @@ bool VulkanPipelineCache::Initialize() { const ui::vulkan::VulkanProvider& provider = command_processor_.GetVulkanProvider(); + bool edram_fragment_shader_interlock = + render_target_cache_.GetPath() == + RenderTargetCache::Path::kPixelShaderInterlock; + shader_translator_ = std::make_unique( - SpirvShaderTranslator::Features(provider)); + SpirvShaderTranslator::Features(provider), + render_target_cache_.msaa_2x_attachments_supported(), + render_target_cache_.msaa_2x_no_attachments_supported(), + edram_fragment_shader_interlock); + + if (edram_fragment_shader_interlock) { + std::vector depth_only_fragment_shader_code 
= + shader_translator_->CreateDepthOnlyFragmentShader(); + depth_only_fragment_shader_ = ui::vulkan::util::CreateShaderModule( + provider, + reinterpret_cast( + depth_only_fragment_shader_code.data()), + depth_only_fragment_shader_code.size()); + if (depth_only_fragment_shader_ == VK_NULL_HANDLE) { + XELOGE( + "VulkanPipelineCache: Failed to create the depth/stencil-only " + "fragment shader for the fragment shader interlock render backend " + "implementation"); + return false; + } + } return true; } @@ -75,6 +100,8 @@ void VulkanPipelineCache::Shutdown() { pipelines_.clear(); // Destroy all internal shaders. + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyShaderModule, device, + depth_only_fragment_shader_); for (const auto& geometry_shader_pair : geometry_shaders_) { if (geometry_shader_pair.second != VK_NULL_HANDLE) { dfn.vkDestroyShaderModule(device, geometry_shader_pair.second, nullptr); @@ -179,15 +206,18 @@ VulkanPipelineCache::GetCurrentPixelShaderModification( modification.pixel.param_gen_point = 0; } - using DepthStencilMode = - SpirvShaderTranslator::Modification::DepthStencilMode; - if (shader.implicit_early_z_write_allowed() && - (!shader.writes_color_target(0) || - !draw_util::DoesCoverageDependOnAlpha( - regs.Get()))) { - modification.pixel.depth_stencil_mode = DepthStencilMode::kEarlyHint; - } else { - modification.pixel.depth_stencil_mode = DepthStencilMode::kNoModifiers; + if (render_target_cache_.GetPath() == + RenderTargetCache::Path::kHostRenderTargets) { + using DepthStencilMode = + SpirvShaderTranslator::Modification::DepthStencilMode; + if (shader.implicit_early_z_write_allowed() && + (!shader.writes_color_target(0) || + !draw_util::DoesCoverageDependOnAlpha( + regs.Get()))) { + modification.pixel.depth_stencil_mode = DepthStencilMode::kEarlyHint; + } else { + modification.pixel.depth_stencil_mode = DepthStencilMode::kNoModifiers; + } } return modification; @@ -303,7 +333,11 @@ bool VulkanPipelineCache::ConfigurePipeline( } } 
VkRenderPass render_pass = - render_target_cache_.GetRenderPass(render_pass_key); + render_target_cache_.GetPath() == + RenderTargetCache::Path::kPixelShaderInterlock + ? render_target_cache_.GetFragmentShaderInterlockRenderPass() + : render_target_cache_.GetHostRenderTargetsRenderPass( + render_pass_key); if (render_pass == VK_NULL_HANDLE) { return false; } @@ -603,123 +637,127 @@ bool VulkanPipelineCache::GetCurrentStateDescription( description_out.polygon_mode = PipelinePolygonMode::kFill; } - // TODO(Triang3l): Skip depth / stencil and color state for the fragment - // shader interlock RB implementation. - - if (render_pass_key.depth_and_color_used & 1) { - if (normalized_depth_control.z_enable) { - description_out.depth_write_enable = - normalized_depth_control.z_write_enable; - description_out.depth_compare_op = normalized_depth_control.zfunc; - } else { - description_out.depth_compare_op = xenos::CompareFunction::kAlways; - } - if (normalized_depth_control.stencil_enable) { - description_out.stencil_test_enable = 1; - description_out.stencil_front_fail_op = - normalized_depth_control.stencilfail; - description_out.stencil_front_pass_op = - normalized_depth_control.stencilzpass; - description_out.stencil_front_depth_fail_op = - normalized_depth_control.stencilzfail; - description_out.stencil_front_compare_op = - normalized_depth_control.stencilfunc; - if (primitive_polygonal && normalized_depth_control.backface_enable) { - description_out.stencil_back_fail_op = - normalized_depth_control.stencilfail_bf; - description_out.stencil_back_pass_op = - normalized_depth_control.stencilzpass_bf; - description_out.stencil_back_depth_fail_op = - normalized_depth_control.stencilzfail_bf; - description_out.stencil_back_compare_op = - normalized_depth_control.stencilfunc_bf; + if (render_target_cache_.GetPath() == + RenderTargetCache::Path::kHostRenderTargets) { + if (render_pass_key.depth_and_color_used & 1) { + if (normalized_depth_control.z_enable) { + 
description_out.depth_write_enable = + normalized_depth_control.z_write_enable; + description_out.depth_compare_op = normalized_depth_control.zfunc; } else { - description_out.stencil_back_fail_op = - description_out.stencil_front_fail_op; - description_out.stencil_back_pass_op = - description_out.stencil_front_pass_op; - description_out.stencil_back_depth_fail_op = - description_out.stencil_front_depth_fail_op; - description_out.stencil_back_compare_op = - description_out.stencil_front_compare_op; + description_out.depth_compare_op = xenos::CompareFunction::kAlways; + } + if (normalized_depth_control.stencil_enable) { + description_out.stencil_test_enable = 1; + description_out.stencil_front_fail_op = + normalized_depth_control.stencilfail; + description_out.stencil_front_pass_op = + normalized_depth_control.stencilzpass; + description_out.stencil_front_depth_fail_op = + normalized_depth_control.stencilzfail; + description_out.stencil_front_compare_op = + normalized_depth_control.stencilfunc; + if (primitive_polygonal && normalized_depth_control.backface_enable) { + description_out.stencil_back_fail_op = + normalized_depth_control.stencilfail_bf; + description_out.stencil_back_pass_op = + normalized_depth_control.stencilzpass_bf; + description_out.stencil_back_depth_fail_op = + normalized_depth_control.stencilzfail_bf; + description_out.stencil_back_compare_op = + normalized_depth_control.stencilfunc_bf; + } else { + description_out.stencil_back_fail_op = + description_out.stencil_front_fail_op; + description_out.stencil_back_pass_op = + description_out.stencil_front_pass_op; + description_out.stencil_back_depth_fail_op = + description_out.stencil_front_depth_fail_op; + description_out.stencil_back_compare_op = + description_out.stencil_front_compare_op; + } } } - } - // Color blending and write masks (filled only for the attachments present in - // the render pass object). 
- uint32_t render_pass_color_rts = render_pass_key.depth_and_color_used >> 1; - if (device_features.independentBlend) { - uint32_t render_pass_color_rts_remaining = render_pass_color_rts; - uint32_t color_rt_index; - while (xe::bit_scan_forward(render_pass_color_rts_remaining, - &color_rt_index)) { - render_pass_color_rts_remaining &= ~(uint32_t(1) << color_rt_index); - WritePipelineRenderTargetDescription( - regs.Get( - reg::RB_BLENDCONTROL::rt_register_indices[color_rt_index]), - (normalized_color_mask >> (color_rt_index * 4)) & 0b1111, - description_out.render_targets[color_rt_index]); - } - } else { - // Take the blend control for the first render target that the guest wants - // to write to (consider it the most important) and use it for all render - // targets, if any. - // TODO(Triang3l): Implement an option for independent blending via multiple - // draw calls with different pipelines maybe? Though independent blending - // support is pretty wide, with a quite prominent exception of Adreno 4xx - // apparently. - uint32_t render_pass_color_rts_remaining = render_pass_color_rts; - uint32_t render_pass_first_color_rt_index; - if (xe::bit_scan_forward(render_pass_color_rts_remaining, - &render_pass_first_color_rt_index)) { - render_pass_color_rts_remaining &= - ~(uint32_t(1) << render_pass_first_color_rt_index); - PipelineRenderTarget& render_pass_first_color_rt = - description_out.render_targets[render_pass_first_color_rt_index]; - uint32_t common_blend_rt_index; - if (xe::bit_scan_forward(normalized_color_mask, &common_blend_rt_index)) { - common_blend_rt_index >>= 2; - // If a common write mask will be used for multiple render targets, use - // the original RB_COLOR_MASK instead of the normalized color mask as - // the normalized color mask has non-existent components forced to - // written (don't need reading to be preserved), while the number of - // components may vary between render targets. 
The attachments in the - // pass that must not be written to at all will be excluded via a shader - // modification. - WritePipelineRenderTargetDescription( - regs.Get( - reg::RB_BLENDCONTROL::rt_register_indices - [common_blend_rt_index]), - (((normalized_color_mask & - ~(uint32_t(0b1111) << (4 * common_blend_rt_index))) - ? regs[XE_GPU_REG_RB_COLOR_MASK].u32 - : normalized_color_mask) >> - (4 * common_blend_rt_index)) & - 0b1111, - render_pass_first_color_rt); - } else { - // No render targets are written to, though the render pass still may - // contain color attachments - set them to not written and not blending. - render_pass_first_color_rt.src_color_blend_factor = - PipelineBlendFactor::kOne; - render_pass_first_color_rt.dst_color_blend_factor = - PipelineBlendFactor::kZero; - render_pass_first_color_rt.color_blend_op = xenos::BlendOp::kAdd; - render_pass_first_color_rt.src_alpha_blend_factor = - PipelineBlendFactor::kOne; - render_pass_first_color_rt.dst_alpha_blend_factor = - PipelineBlendFactor::kZero; - render_pass_first_color_rt.alpha_blend_op = xenos::BlendOp::kAdd; - } - // Reuse the same blending settings for all render targets in the pass, - // for description consistency. + // Color blending and write masks (filled only for the attachments present + // in the render pass object). 
+ uint32_t render_pass_color_rts = render_pass_key.depth_and_color_used >> 1; + if (device_features.independentBlend) { + uint32_t render_pass_color_rts_remaining = render_pass_color_rts; uint32_t color_rt_index; while (xe::bit_scan_forward(render_pass_color_rts_remaining, &color_rt_index)) { render_pass_color_rts_remaining &= ~(uint32_t(1) << color_rt_index); - description_out.render_targets[color_rt_index] = - render_pass_first_color_rt; + WritePipelineRenderTargetDescription( + regs.Get( + reg::RB_BLENDCONTROL::rt_register_indices[color_rt_index]), + (normalized_color_mask >> (color_rt_index * 4)) & 0b1111, + description_out.render_targets[color_rt_index]); + } + } else { + // Take the blend control for the first render target that the guest wants + // to write to (consider it the most important) and use it for all render + // targets, if any. + // TODO(Triang3l): Implement an option for independent blending via + // replaying the render pass for each set of render targets with unique + // blending parameters, with depth / stencil saved before the first and + // restored before each of the rest maybe? Though independent blending + // support is pretty wide, with a quite prominent exception of Adreno 4xx + // apparently. 
+ uint32_t render_pass_color_rts_remaining = render_pass_color_rts; + uint32_t render_pass_first_color_rt_index; + if (xe::bit_scan_forward(render_pass_color_rts_remaining, + &render_pass_first_color_rt_index)) { + render_pass_color_rts_remaining &= + ~(uint32_t(1) << render_pass_first_color_rt_index); + PipelineRenderTarget& render_pass_first_color_rt = + description_out.render_targets[render_pass_first_color_rt_index]; + uint32_t common_blend_rt_index; + if (xe::bit_scan_forward(normalized_color_mask, + &common_blend_rt_index)) { + common_blend_rt_index >>= 2; + // If a common write mask will be used for multiple render targets, + // use the original RB_COLOR_MASK instead of the normalized color mask + // as the normalized color mask has non-existent components forced to + // written (don't need reading to be preserved), while the number of + // components may vary between render targets. The attachments in the + // pass that must not be written to at all will be excluded via a + // shader modification. + WritePipelineRenderTargetDescription( + regs.Get( + reg::RB_BLENDCONTROL::rt_register_indices + [common_blend_rt_index]), + (((normalized_color_mask & + ~(uint32_t(0b1111) << (4 * common_blend_rt_index))) + ? regs[XE_GPU_REG_RB_COLOR_MASK].u32 + : normalized_color_mask) >> + (4 * common_blend_rt_index)) & + 0b1111, + render_pass_first_color_rt); + } else { + // No render targets are written to, though the render pass still may + // contain color attachments - set them to not written and not + // blending. 
+ render_pass_first_color_rt.src_color_blend_factor = + PipelineBlendFactor::kOne; + render_pass_first_color_rt.dst_color_blend_factor = + PipelineBlendFactor::kZero; + render_pass_first_color_rt.color_blend_op = xenos::BlendOp::kAdd; + render_pass_first_color_rt.src_alpha_blend_factor = + PipelineBlendFactor::kOne; + render_pass_first_color_rt.dst_alpha_blend_factor = + PipelineBlendFactor::kZero; + render_pass_first_color_rt.alpha_blend_op = xenos::BlendOp::kAdd; + } + // Reuse the same blending settings for all render targets in the pass, + // for description consistency. + uint32_t color_rt_index; + while (xe::bit_scan_forward(render_pass_color_rts_remaining, + &color_rt_index)) { + render_pass_color_rts_remaining &= ~(uint32_t(1) << color_rt_index); + description_out.render_targets[color_rt_index] = + render_pass_first_color_rt; + } } } } @@ -1929,6 +1967,10 @@ bool VulkanPipelineCache::EnsurePipelineCreated( command_processor_.GetVulkanProvider(); const VkPhysicalDeviceFeatures& device_features = provider.device_features(); + bool edram_fragment_shader_interlock = + render_target_cache_.GetPath() == + RenderTargetCache::Path::kPixelShaderInterlock; + std::array shader_stages; uint32_t shader_stage_count = 0; @@ -1962,24 +2004,32 @@ bool VulkanPipelineCache::EnsurePipelineCreated( shader_stage_geometry.pName = "main"; shader_stage_geometry.pSpecializationInfo = nullptr; } - // Pixel shader. + // Fragment shader. 
+ VkPipelineShaderStageCreateInfo& shader_stage_fragment = + shader_stages[shader_stage_count++]; + shader_stage_fragment.sType = + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shader_stage_fragment.pNext = nullptr; + shader_stage_fragment.flags = 0; + shader_stage_fragment.stage = VK_SHADER_STAGE_FRAGMENT_BIT; + shader_stage_fragment.module = VK_NULL_HANDLE; + shader_stage_fragment.pName = "main"; + shader_stage_fragment.pSpecializationInfo = nullptr; if (creation_arguments.pixel_shader) { assert_true(creation_arguments.pixel_shader->is_translated()); if (!creation_arguments.pixel_shader->is_valid()) { return false; } - VkPipelineShaderStageCreateInfo& shader_stage_fragment = - shader_stages[shader_stage_count++]; - shader_stage_fragment.sType = - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shader_stage_fragment.pNext = nullptr; - shader_stage_fragment.flags = 0; - shader_stage_fragment.stage = VK_SHADER_STAGE_FRAGMENT_BIT; shader_stage_fragment.module = creation_arguments.pixel_shader->shader_module(); assert_true(shader_stage_fragment.module != VK_NULL_HANDLE); - shader_stage_fragment.pName = "main"; - shader_stage_fragment.pSpecializationInfo = nullptr; + } else { + if (edram_fragment_shader_interlock) { + shader_stage_fragment.module = depth_only_fragment_shader_; + } + } + if (shader_stage_fragment.module == VK_NULL_HANDLE) { + --shader_stage_count; } VkPipelineVertexInputStateCreateInfo vertex_input_state = {}; @@ -2087,11 +2137,11 @@ bool VulkanPipelineCache::EnsurePipelineCreated( // formula, though Z has no effect on anything if a depth attachment is not // used (the guest shader can't access Z), enabling only when there's a // depth / stencil attachment for correctness. - // TODO(Triang3l): Disable the depth bias for the fragment shader interlock RB - // implementation. rasterization_state.depthBiasEnable = - (description.render_pass_key.depth_and_color_used & 0b1) ? 
VK_TRUE - : VK_FALSE; + (!edram_fragment_shader_interlock && + (description.render_pass_key.depth_and_color_used & 0b1)) + ? VK_TRUE + : VK_FALSE; // TODO(Triang3l): Wide lines. rasterization_state.lineWidth = 1.0f; @@ -2101,6 +2151,7 @@ bool VulkanPipelineCache::EnsurePipelineCreated( VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; if (description.render_pass_key.msaa_samples == xenos::MsaaSamples::k2X && !render_target_cache_.IsMsaa2xSupported( + !edram_fragment_shader_interlock && description.render_pass_key.depth_and_color_used != 0)) { // Using sample 0 as 0 and 3 as 1 for 2x instead (not exactly the same // sample locations, but still top-left and bottom-right - however, this can @@ -2119,126 +2170,131 @@ bool VulkanPipelineCache::EnsurePipelineCreated( depth_stencil_state.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; depth_stencil_state.pNext = nullptr; - if (description.depth_write_enable || - description.depth_compare_op != xenos::CompareFunction::kAlways) { - depth_stencil_state.depthTestEnable = VK_TRUE; - depth_stencil_state.depthWriteEnable = - description.depth_write_enable ? 
VK_TRUE : VK_FALSE; - depth_stencil_state.depthCompareOp = VkCompareOp( - uint32_t(VK_COMPARE_OP_NEVER) + uint32_t(description.depth_compare_op)); - } - if (description.stencil_test_enable) { - depth_stencil_state.stencilTestEnable = VK_TRUE; - depth_stencil_state.front.failOp = - VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + - uint32_t(description.stencil_front_fail_op)); - depth_stencil_state.front.passOp = - VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + - uint32_t(description.stencil_front_pass_op)); - depth_stencil_state.front.depthFailOp = - VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + - uint32_t(description.stencil_front_depth_fail_op)); - depth_stencil_state.front.compareOp = - VkCompareOp(uint32_t(VK_COMPARE_OP_NEVER) + - uint32_t(description.stencil_front_compare_op)); - depth_stencil_state.back.failOp = - VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + - uint32_t(description.stencil_back_fail_op)); - depth_stencil_state.back.passOp = - VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + - uint32_t(description.stencil_back_pass_op)); - depth_stencil_state.back.depthFailOp = - VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + - uint32_t(description.stencil_back_depth_fail_op)); - depth_stencil_state.back.compareOp = - VkCompareOp(uint32_t(VK_COMPARE_OP_NEVER) + - uint32_t(description.stencil_back_compare_op)); - } - - VkPipelineColorBlendAttachmentState - color_blend_attachments[xenos::kMaxColorRenderTargets] = {}; - uint32_t color_rts_used = - description.render_pass_key.depth_and_color_used >> 1; - { - static const VkBlendFactor kBlendFactorMap[] = { - VK_BLEND_FACTOR_ZERO, - VK_BLEND_FACTOR_ONE, - VK_BLEND_FACTOR_SRC_COLOR, - VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR, - VK_BLEND_FACTOR_DST_COLOR, - VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR, - VK_BLEND_FACTOR_SRC_ALPHA, - VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, - VK_BLEND_FACTOR_DST_ALPHA, - VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA, - VK_BLEND_FACTOR_CONSTANT_COLOR, - VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR, - VK_BLEND_FACTOR_CONSTANT_ALPHA, - 
VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA, - VK_BLEND_FACTOR_SRC_ALPHA_SATURATE, - }; - // 8 entries for safety since 3 bits from the guest are passed directly. - static const VkBlendOp kBlendOpMap[] = {VK_BLEND_OP_ADD, - VK_BLEND_OP_SUBTRACT, - VK_BLEND_OP_MIN, - VK_BLEND_OP_MAX, - VK_BLEND_OP_REVERSE_SUBTRACT, - VK_BLEND_OP_ADD, - VK_BLEND_OP_ADD, - VK_BLEND_OP_ADD}; - uint32_t color_rts_remaining = color_rts_used; - uint32_t color_rt_index; - while (xe::bit_scan_forward(color_rts_remaining, &color_rt_index)) { - color_rts_remaining &= ~(uint32_t(1) << color_rt_index); - VkPipelineColorBlendAttachmentState& color_blend_attachment = - color_blend_attachments[color_rt_index]; - const PipelineRenderTarget& color_rt = - description.render_targets[color_rt_index]; - if (color_rt.src_color_blend_factor != PipelineBlendFactor::kOne || - color_rt.dst_color_blend_factor != PipelineBlendFactor::kZero || - color_rt.color_blend_op != xenos::BlendOp::kAdd || - color_rt.src_alpha_blend_factor != PipelineBlendFactor::kOne || - color_rt.dst_alpha_blend_factor != PipelineBlendFactor::kZero || - color_rt.alpha_blend_op != xenos::BlendOp::kAdd) { - color_blend_attachment.blendEnable = VK_TRUE; - color_blend_attachment.srcColorBlendFactor = - kBlendFactorMap[uint32_t(color_rt.src_color_blend_factor)]; - color_blend_attachment.dstColorBlendFactor = - kBlendFactorMap[uint32_t(color_rt.dst_color_blend_factor)]; - color_blend_attachment.colorBlendOp = - kBlendOpMap[uint32_t(color_rt.color_blend_op)]; - color_blend_attachment.srcAlphaBlendFactor = - kBlendFactorMap[uint32_t(color_rt.src_alpha_blend_factor)]; - color_blend_attachment.dstAlphaBlendFactor = - kBlendFactorMap[uint32_t(color_rt.dst_alpha_blend_factor)]; - color_blend_attachment.alphaBlendOp = - kBlendOpMap[uint32_t(color_rt.alpha_blend_op)]; - } - color_blend_attachment.colorWriteMask = - VkColorComponentFlags(color_rt.color_write_mask); - if (!device_features.independentBlend) { - // For non-independent blend, the 
pAttachments element for the first - // actually used color will be replicated into all. - break; - } + if (!edram_fragment_shader_interlock) { + if (description.depth_write_enable || + description.depth_compare_op != xenos::CompareFunction::kAlways) { + depth_stencil_state.depthTestEnable = VK_TRUE; + depth_stencil_state.depthWriteEnable = + description.depth_write_enable ? VK_TRUE : VK_FALSE; + depth_stencil_state.depthCompareOp = + VkCompareOp(uint32_t(VK_COMPARE_OP_NEVER) + + uint32_t(description.depth_compare_op)); + } + if (description.stencil_test_enable) { + depth_stencil_state.stencilTestEnable = VK_TRUE; + depth_stencil_state.front.failOp = + VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + + uint32_t(description.stencil_front_fail_op)); + depth_stencil_state.front.passOp = + VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + + uint32_t(description.stencil_front_pass_op)); + depth_stencil_state.front.depthFailOp = + VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + + uint32_t(description.stencil_front_depth_fail_op)); + depth_stencil_state.front.compareOp = + VkCompareOp(uint32_t(VK_COMPARE_OP_NEVER) + + uint32_t(description.stencil_front_compare_op)); + depth_stencil_state.back.failOp = + VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + + uint32_t(description.stencil_back_fail_op)); + depth_stencil_state.back.passOp = + VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + + uint32_t(description.stencil_back_pass_op)); + depth_stencil_state.back.depthFailOp = + VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) + + uint32_t(description.stencil_back_depth_fail_op)); + depth_stencil_state.back.compareOp = + VkCompareOp(uint32_t(VK_COMPARE_OP_NEVER) + + uint32_t(description.stencil_back_compare_op)); } } + VkPipelineColorBlendStateCreateInfo color_blend_state = {}; color_blend_state.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - color_blend_state.attachmentCount = 32 - xe::lzcnt(color_rts_used); - color_blend_state.pAttachments = color_blend_attachments; - if (color_rts_used && 
!device_features.independentBlend) { - // "If the independent blending feature is not enabled, all elements of - // pAttachments must be identical." - uint32_t first_color_rt_index; - xe::bit_scan_forward(color_rts_used, &first_color_rt_index); - for (uint32_t i = 0; i < color_blend_state.attachmentCount; ++i) { - if (i == first_color_rt_index) { - continue; + VkPipelineColorBlendAttachmentState + color_blend_attachments[xenos::kMaxColorRenderTargets] = {}; + if (!edram_fragment_shader_interlock) { + uint32_t color_rts_used = + description.render_pass_key.depth_and_color_used >> 1; + { + static const VkBlendFactor kBlendFactorMap[] = { + VK_BLEND_FACTOR_ZERO, + VK_BLEND_FACTOR_ONE, + VK_BLEND_FACTOR_SRC_COLOR, + VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR, + VK_BLEND_FACTOR_DST_COLOR, + VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR, + VK_BLEND_FACTOR_SRC_ALPHA, + VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + VK_BLEND_FACTOR_DST_ALPHA, + VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA, + VK_BLEND_FACTOR_CONSTANT_COLOR, + VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR, + VK_BLEND_FACTOR_CONSTANT_ALPHA, + VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA, + VK_BLEND_FACTOR_SRC_ALPHA_SATURATE, + }; + // 8 entries for safety since 3 bits from the guest are passed directly. 
+ static const VkBlendOp kBlendOpMap[] = {VK_BLEND_OP_ADD, + VK_BLEND_OP_SUBTRACT, + VK_BLEND_OP_MIN, + VK_BLEND_OP_MAX, + VK_BLEND_OP_REVERSE_SUBTRACT, + VK_BLEND_OP_ADD, + VK_BLEND_OP_ADD, + VK_BLEND_OP_ADD}; + uint32_t color_rts_remaining = color_rts_used; + uint32_t color_rt_index; + while (xe::bit_scan_forward(color_rts_remaining, &color_rt_index)) { + color_rts_remaining &= ~(uint32_t(1) << color_rt_index); + VkPipelineColorBlendAttachmentState& color_blend_attachment = + color_blend_attachments[color_rt_index]; + const PipelineRenderTarget& color_rt = + description.render_targets[color_rt_index]; + if (color_rt.src_color_blend_factor != PipelineBlendFactor::kOne || + color_rt.dst_color_blend_factor != PipelineBlendFactor::kZero || + color_rt.color_blend_op != xenos::BlendOp::kAdd || + color_rt.src_alpha_blend_factor != PipelineBlendFactor::kOne || + color_rt.dst_alpha_blend_factor != PipelineBlendFactor::kZero || + color_rt.alpha_blend_op != xenos::BlendOp::kAdd) { + color_blend_attachment.blendEnable = VK_TRUE; + color_blend_attachment.srcColorBlendFactor = + kBlendFactorMap[uint32_t(color_rt.src_color_blend_factor)]; + color_blend_attachment.dstColorBlendFactor = + kBlendFactorMap[uint32_t(color_rt.dst_color_blend_factor)]; + color_blend_attachment.colorBlendOp = + kBlendOpMap[uint32_t(color_rt.color_blend_op)]; + color_blend_attachment.srcAlphaBlendFactor = + kBlendFactorMap[uint32_t(color_rt.src_alpha_blend_factor)]; + color_blend_attachment.dstAlphaBlendFactor = + kBlendFactorMap[uint32_t(color_rt.dst_alpha_blend_factor)]; + color_blend_attachment.alphaBlendOp = + kBlendOpMap[uint32_t(color_rt.alpha_blend_op)]; + } + color_blend_attachment.colorWriteMask = + VkColorComponentFlags(color_rt.color_write_mask); + if (!device_features.independentBlend) { + // For non-independent blend, the pAttachments element for the first + // actually used color will be replicated into all. 
+ break; + } + } + } + color_blend_state.attachmentCount = 32 - xe::lzcnt(color_rts_used); + color_blend_state.pAttachments = color_blend_attachments; + if (color_rts_used && !device_features.independentBlend) { + // "If the independent blending feature is not enabled, all elements of + // pAttachments must be identical." + uint32_t first_color_rt_index; + xe::bit_scan_forward(color_rts_used, &first_color_rt_index); + for (uint32_t i = 0; i < color_blend_state.attachmentCount; ++i) { + if (i == first_color_rt_index) { + continue; + } + color_blend_attachments[i] = + color_blend_attachments[first_color_rt_index]; } - color_blend_attachments[i] = - color_blend_attachments[first_color_rt_index]; } } @@ -2255,16 +2311,18 @@ bool VulkanPipelineCache::EnsurePipelineCreated( // invalidated (again, even if it has no effect). dynamic_states[dynamic_state.dynamicStateCount++] = VK_DYNAMIC_STATE_VIEWPORT; dynamic_states[dynamic_state.dynamicStateCount++] = VK_DYNAMIC_STATE_SCISSOR; - dynamic_states[dynamic_state.dynamicStateCount++] = - VK_DYNAMIC_STATE_DEPTH_BIAS; - dynamic_states[dynamic_state.dynamicStateCount++] = - VK_DYNAMIC_STATE_BLEND_CONSTANTS; - dynamic_states[dynamic_state.dynamicStateCount++] = - VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK; - dynamic_states[dynamic_state.dynamicStateCount++] = - VK_DYNAMIC_STATE_STENCIL_WRITE_MASK; - dynamic_states[dynamic_state.dynamicStateCount++] = - VK_DYNAMIC_STATE_STENCIL_REFERENCE; + if (!edram_fragment_shader_interlock) { + dynamic_states[dynamic_state.dynamicStateCount++] = + VK_DYNAMIC_STATE_DEPTH_BIAS; + dynamic_states[dynamic_state.dynamicStateCount++] = + VK_DYNAMIC_STATE_BLEND_CONSTANTS; + dynamic_states[dynamic_state.dynamicStateCount++] = + VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK; + dynamic_states[dynamic_state.dynamicStateCount++] = + VK_DYNAMIC_STATE_STENCIL_WRITE_MASK; + dynamic_states[dynamic_state.dynamicStateCount++] = + VK_DYNAMIC_STATE_STENCIL_REFERENCE; + } VkGraphicsPipelineCreateInfo pipeline_create_info; 
pipeline_create_info.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h index 56346d1bc..09a26caa4 100644 --- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h +++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h @@ -314,6 +314,10 @@ class VulkanPipelineCache { GeometryShaderKey::Hasher> geometry_shaders_; + // Empty depth-only pixel shader for writing to depth buffer using fragment + // shader interlock when no Xenos pixel shader provided. + VkShaderModule depth_only_fragment_shader_ = VK_NULL_HANDLE; + std::unordered_map pipelines_; diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc index 92efc4189..8113827e5 100644 --- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc @@ -22,6 +22,7 @@ #include "third_party/glslang/SPIRV/GLSL.std.450.h" #include "third_party/glslang/SPIRV/SpvBuilder.h" #include "xenia/base/assert.h" +#include "xenia/base/cvar.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/gpu/draw_util.h" @@ -33,6 +34,27 @@ #include "xenia/gpu/xenos.h" #include "xenia/ui/vulkan/vulkan_util.h" +DEFINE_string( + render_target_path_vulkan, "", + "Render target emulation path to use on Vulkan.\n" + "Use: [any, fbo, fsi]\n" + " fbo:\n" + " Host framebuffers and fixed-function blending and depth / stencil " + "testing, copying between render targets when needed.\n" + " Lower accuracy (limited pixel format support).\n" + " Performance limited primarily by render target layout changes requiring " + "copying, but generally higher.\n" + " fsi:\n" + " Manual pixel packing, blending and depth / stencil testing, with free " + "render target layout changes.\n" + " Requires a GPU supporting fragment shader interlock.\n" + " Highest accuracy (all pixel formats handled in software).\n" + " Performance limited primarily 
by overdraw.\n" + " Any other value:\n" + " Choose what is considered the most optimal for the system (currently " + "always FB because the FSI path is much slower now).", + "GPU"); + namespace xe { namespace gpu { namespace vulkan { @@ -43,6 +65,10 @@ namespace shaders { #include "xenia/gpu/shaders/bytecode/vulkan_spirv/host_depth_store_2xmsaa_cs.h" #include "xenia/gpu/shaders/bytecode/vulkan_spirv/host_depth_store_4xmsaa_cs.h" #include "xenia/gpu/shaders/bytecode/vulkan_spirv/passthrough_position_xy_vs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/resolve_clear_32bpp_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/resolve_clear_32bpp_scaled_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/resolve_clear_64bpp_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/resolve_clear_64bpp_scaled_cs.h" #include "xenia/gpu/shaders/bytecode/vulkan_spirv/resolve_fast_32bpp_1x2xmsaa_cs.h" #include "xenia/gpu/shaders/bytecode/vulkan_spirv/resolve_fast_32bpp_1x2xmsaa_scaled_cs.h" #include "xenia/gpu/shaders/bytecode/vulkan_spirv/resolve_fast_32bpp_4xmsaa_cs.h" @@ -180,13 +206,61 @@ VulkanRenderTargetCache::VulkanRenderTargetCache( VulkanRenderTargetCache::~VulkanRenderTargetCache() { Shutdown(true); } -bool VulkanRenderTargetCache::Initialize() { +bool VulkanRenderTargetCache::Initialize(uint32_t shared_memory_binding_count) { const ui::vulkan::VulkanProvider& provider = command_processor_.GetVulkanProvider(); const ui::vulkan::VulkanProvider::InstanceFunctions& ifn = provider.ifn(); VkPhysicalDevice physical_device = provider.physical_device(); const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); VkDevice device = provider.device(); + const VkPhysicalDeviceLimits& device_limits = + provider.device_properties().limits; + + if (cvars::render_target_path_vulkan == "fsi") { + path_ = Path::kPixelShaderInterlock; + } else { + path_ = Path::kHostRenderTargets; + } + // Fragment shader interlock is a feature implemented by pretty 
advanced GPUs, + // closer to Direct3D 11 / OpenGL ES 3.2 level mainly, not Direct3D 10 / + // OpenGL ES 3.1. Thus, it's fine to demand a wide range of other optional + // features for the fragment shader interlock backend to work. + if (path_ == Path::kPixelShaderInterlock) { + const VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT& + device_fragment_shader_interlock_features = + provider.device_fragment_shader_interlock_features(); + const VkPhysicalDeviceFeatures& device_features = + provider.device_features(); + // Interlocking between fragments with common sample coverage is enough, but + // interlocking more is acceptable too (fragmentShaderShadingRateInterlock + // would be okay too, but it's unlikely that an implementation would + // advertise only it and not any other ones, as it's a very specific feature + // interacting with another optional feature that is variable shading rate, + // so there's no need to overcomplicate the checks and the shader execution + // mode setting). + // Sample-rate shading is required by certain SPIR-V revisions to access the + // sample mask fragment shader input. + // Stanard sample locations are needed for calculating the depth at the + // samples. + // It's unlikely that a device exposing fragment shader interlock won't have + // a large enough storage buffer range and a sufficient SSBO slot count for + // all the shared memory buffers and the EDRAM buffer - an in a conflict + // between, for instance, the ability to vfetch and memexport in fragment + // shaders, and the usage of fragment shader interlock, prefer the former + // for simplicity. 
+ if (!provider.device_extensions().ext_fragment_shader_interlock || + !(device_fragment_shader_interlock_features + .fragmentShaderSampleInterlock || + device_fragment_shader_interlock_features + .fragmentShaderPixelInterlock) || + !device_features.fragmentStoresAndAtomics || + !device_features.sampleRateShading || + !device_limits.standardSampleLocations || + shared_memory_binding_count >= + device_limits.maxDescriptorSetStorageBuffers) { + path_ = Path::kHostRenderTargets; + } + } // Format support. constexpr VkFormatFeatureFlags kUsedDepthFormatFeatures = @@ -199,6 +273,30 @@ bool VulkanRenderTargetCache::Initialize() { (depth_unorm24_properties.optimalTilingFeatures & kUsedDepthFormatFeatures) == kUsedDepthFormatFeatures; + // 2x MSAA support. + // TODO(Triang3l): Handle sampledImageIntegerSampleCounts 4 not supported in + // transfers. + if (cvars::native_2x_msaa) { + // Multisampled integer sampled images are optional in Vulkan and in Xenia. + msaa_2x_attachments_supported_ = + (device_limits.framebufferColorSampleCounts & + device_limits.framebufferDepthSampleCounts & + device_limits.framebufferStencilSampleCounts & + device_limits.sampledImageColorSampleCounts & + device_limits.sampledImageDepthSampleCounts & + device_limits.sampledImageStencilSampleCounts & + VK_SAMPLE_COUNT_2_BIT) && + (device_limits.sampledImageIntegerSampleCounts & + (VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT)) != + VK_SAMPLE_COUNT_4_BIT; + msaa_2x_no_attachments_supported_ = + (device_limits.framebufferNoAttachmentsSampleCounts & + VK_SAMPLE_COUNT_2_BIT) != 0; + } else { + msaa_2x_attachments_supported_ = false; + msaa_2x_no_attachments_supported_ = false; + } + // Descriptor set layouts. VkDescriptorSetLayoutBinding descriptor_set_layout_bindings[2]; descriptor_set_layout_bindings[0].binding = 0; @@ -429,227 +527,355 @@ bool VulkanRenderTargetCache::Initialize() { // TODO(Triang3l): All paths (FSI). 
- depth_float24_round_ = cvars::depth_float24_round; + if (path_ == Path::kHostRenderTargets) { + // Host render targets. - // TODO(Triang3l): Handle sampledImageIntegerSampleCounts 4 not supported in - // transfers. - if (cvars::native_2x_msaa) { - const VkPhysicalDeviceLimits& device_limits = - provider.device_properties().limits; - // Multisampled integer sampled images are optional in Vulkan and in Xenia. - msaa_2x_attachments_supported_ = - (device_limits.framebufferColorSampleCounts & - device_limits.framebufferDepthSampleCounts & - device_limits.framebufferStencilSampleCounts & - device_limits.sampledImageColorSampleCounts & - device_limits.sampledImageDepthSampleCounts & - device_limits.sampledImageStencilSampleCounts & - VK_SAMPLE_COUNT_2_BIT) && - (device_limits.sampledImageIntegerSampleCounts & - (VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT)) != - VK_SAMPLE_COUNT_4_BIT; - msaa_2x_no_attachments_supported_ = - (device_limits.framebufferNoAttachmentsSampleCounts & - VK_SAMPLE_COUNT_2_BIT) != 0; - } else { - msaa_2x_attachments_supported_ = false; - msaa_2x_no_attachments_supported_ = false; - } + depth_float24_round_ = cvars::depth_float24_round; - // Host depth storing pipeline layout. - VkDescriptorSetLayout host_depth_store_descriptor_set_layouts[] = { - // Destination EDRAM storage buffer. - descriptor_set_layout_storage_buffer_, - // Source depth / stencil texture (only depth is used). 
- descriptor_set_layout_sampled_image_x2_, - }; - VkPushConstantRange host_depth_store_push_constant_range; - host_depth_store_push_constant_range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - host_depth_store_push_constant_range.offset = 0; - host_depth_store_push_constant_range.size = sizeof(HostDepthStoreConstants); - VkPipelineLayoutCreateInfo host_depth_store_pipeline_layout_create_info; - host_depth_store_pipeline_layout_create_info.sType = - VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - host_depth_store_pipeline_layout_create_info.pNext = nullptr; - host_depth_store_pipeline_layout_create_info.flags = 0; - host_depth_store_pipeline_layout_create_info.setLayoutCount = - uint32_t(xe::countof(host_depth_store_descriptor_set_layouts)); - host_depth_store_pipeline_layout_create_info.pSetLayouts = - host_depth_store_descriptor_set_layouts; - host_depth_store_pipeline_layout_create_info.pushConstantRangeCount = 1; - host_depth_store_pipeline_layout_create_info.pPushConstantRanges = - &host_depth_store_push_constant_range; - if (dfn.vkCreatePipelineLayout( - device, &host_depth_store_pipeline_layout_create_info, nullptr, - &host_depth_store_pipeline_layout_) != VK_SUCCESS) { - XELOGE( - "VulkanRenderTargetCache: Failed to create the host depth storing " - "pipeline layout"); - Shutdown(); - return false; - } - const std::pair host_depth_store_shaders[] = { - {shaders::host_depth_store_1xmsaa_cs, - sizeof(shaders::host_depth_store_1xmsaa_cs)}, - {shaders::host_depth_store_2xmsaa_cs, - sizeof(shaders::host_depth_store_2xmsaa_cs)}, - {shaders::host_depth_store_4xmsaa_cs, - sizeof(shaders::host_depth_store_4xmsaa_cs)}, - }; - for (size_t i = 0; i < xe::countof(host_depth_store_shaders); ++i) { - const std::pair host_depth_store_shader = - host_depth_store_shaders[i]; - VkPipeline host_depth_store_pipeline = - ui::vulkan::util::CreateComputePipeline( - provider, host_depth_store_pipeline_layout_, - host_depth_store_shader.first, host_depth_store_shader.second); - if 
(host_depth_store_pipeline == VK_NULL_HANDLE) { + // Host depth storing pipeline layout. + VkDescriptorSetLayout host_depth_store_descriptor_set_layouts[] = { + // Destination EDRAM storage buffer. + descriptor_set_layout_storage_buffer_, + // Source depth / stencil texture (only depth is used). + descriptor_set_layout_sampled_image_x2_, + }; + VkPushConstantRange host_depth_store_push_constant_range; + host_depth_store_push_constant_range.stageFlags = + VK_SHADER_STAGE_COMPUTE_BIT; + host_depth_store_push_constant_range.offset = 0; + host_depth_store_push_constant_range.size = sizeof(HostDepthStoreConstants); + VkPipelineLayoutCreateInfo host_depth_store_pipeline_layout_create_info; + host_depth_store_pipeline_layout_create_info.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + host_depth_store_pipeline_layout_create_info.pNext = nullptr; + host_depth_store_pipeline_layout_create_info.flags = 0; + host_depth_store_pipeline_layout_create_info.setLayoutCount = + uint32_t(xe::countof(host_depth_store_descriptor_set_layouts)); + host_depth_store_pipeline_layout_create_info.pSetLayouts = + host_depth_store_descriptor_set_layouts; + host_depth_store_pipeline_layout_create_info.pushConstantRangeCount = 1; + host_depth_store_pipeline_layout_create_info.pPushConstantRanges = + &host_depth_store_push_constant_range; + if (dfn.vkCreatePipelineLayout( + device, &host_depth_store_pipeline_layout_create_info, nullptr, + &host_depth_store_pipeline_layout_) != VK_SUCCESS) { XELOGE( - "VulkanRenderTargetCache: Failed to create the {}-sample host depth " - "storing pipeline", - uint32_t(1) << i); + "VulkanRenderTargetCache: Failed to create the host depth storing " + "pipeline layout"); Shutdown(); return false; } - host_depth_store_pipelines_[i] = host_depth_store_pipeline; - } - - // Transfer and clear vertex buffer, for quads of up to tile granularity. 
- transfer_vertex_buffer_pool_ = - std::make_unique( - provider, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - std::max(ui::vulkan::VulkanUploadBufferPool::kDefaultPageSize, - sizeof(float) * 2 * 6 * - Transfer::kMaxCutoutBorderRectangles * - xenos::kEdramTileCount)); - - // Transfer vertex shader. - transfer_passthrough_vertex_shader_ = ui::vulkan::util::CreateShaderModule( - provider, shaders::passthrough_position_xy_vs, - sizeof(shaders::passthrough_position_xy_vs)); - if (transfer_passthrough_vertex_shader_ == VK_NULL_HANDLE) { - XELOGE( - "VulkanRenderTargetCache: Failed to create the render target ownership " - "transfer vertex shader"); - Shutdown(); - return false; - } - - // Transfer pipeline layouts. - VkDescriptorSetLayout transfer_pipeline_layout_descriptor_set_layouts - [kTransferUsedDescriptorSetCount]; - VkPushConstantRange transfer_pipeline_layout_push_constant_range; - transfer_pipeline_layout_push_constant_range.stageFlags = - VK_SHADER_STAGE_FRAGMENT_BIT; - transfer_pipeline_layout_push_constant_range.offset = 0; - VkPipelineLayoutCreateInfo transfer_pipeline_layout_create_info; - transfer_pipeline_layout_create_info.sType = - VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - transfer_pipeline_layout_create_info.pNext = nullptr; - transfer_pipeline_layout_create_info.flags = 0; - transfer_pipeline_layout_create_info.pSetLayouts = - transfer_pipeline_layout_descriptor_set_layouts; - transfer_pipeline_layout_create_info.pPushConstantRanges = - &transfer_pipeline_layout_push_constant_range; - for (size_t i = 0; i < size_t(TransferPipelineLayoutIndex::kCount); ++i) { - const TransferPipelineLayoutInfo& transfer_pipeline_layout_info = - kTransferPipelineLayoutInfos[i]; - transfer_pipeline_layout_create_info.setLayoutCount = 0; - uint32_t transfer_pipeline_layout_descriptor_sets_remaining = - transfer_pipeline_layout_info.used_descriptor_sets; - uint32_t transfer_pipeline_layout_descriptor_set_index; - while ( - 
xe::bit_scan_forward(transfer_pipeline_layout_descriptor_sets_remaining, - &transfer_pipeline_layout_descriptor_set_index)) { - transfer_pipeline_layout_descriptor_sets_remaining &= - ~(uint32_t(1) << transfer_pipeline_layout_descriptor_set_index); - VkDescriptorSetLayout transfer_pipeline_layout_descriptor_set_layout = - VK_NULL_HANDLE; - switch (TransferUsedDescriptorSet( - transfer_pipeline_layout_descriptor_set_index)) { - case kTransferUsedDescriptorSetHostDepthBuffer: - transfer_pipeline_layout_descriptor_set_layout = - descriptor_set_layout_storage_buffer_; - break; - case kTransferUsedDescriptorSetHostDepthStencilTextures: - case kTransferUsedDescriptorSetDepthStencilTextures: - transfer_pipeline_layout_descriptor_set_layout = - descriptor_set_layout_sampled_image_x2_; - break; - case kTransferUsedDescriptorSetColorTexture: - transfer_pipeline_layout_descriptor_set_layout = - descriptor_set_layout_sampled_image_; - break; - default: - assert_unhandled_case(TransferUsedDescriptorSet( - transfer_pipeline_layout_descriptor_set_index)); + const std::pair host_depth_store_shaders[] = { + {shaders::host_depth_store_1xmsaa_cs, + sizeof(shaders::host_depth_store_1xmsaa_cs)}, + {shaders::host_depth_store_2xmsaa_cs, + sizeof(shaders::host_depth_store_2xmsaa_cs)}, + {shaders::host_depth_store_4xmsaa_cs, + sizeof(shaders::host_depth_store_4xmsaa_cs)}, + }; + for (size_t i = 0; i < xe::countof(host_depth_store_shaders); ++i) { + const std::pair host_depth_store_shader = + host_depth_store_shaders[i]; + VkPipeline host_depth_store_pipeline = + ui::vulkan::util::CreateComputePipeline( + provider, host_depth_store_pipeline_layout_, + host_depth_store_shader.first, host_depth_store_shader.second); + if (host_depth_store_pipeline == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the {}-sample host " + "depth storing pipeline", + uint32_t(1) << i); + Shutdown(); + return false; } - transfer_pipeline_layout_descriptor_set_layouts - 
[transfer_pipeline_layout_create_info.setLayoutCount++] = - transfer_pipeline_layout_descriptor_set_layout; + host_depth_store_pipelines_[i] = host_depth_store_pipeline; } - transfer_pipeline_layout_push_constant_range.size = uint32_t( - sizeof(uint32_t) * - xe::bit_count(transfer_pipeline_layout_info.used_push_constant_dwords)); - transfer_pipeline_layout_create_info.pushConstantRangeCount = - transfer_pipeline_layout_info.used_push_constant_dwords ? 1 : 0; - if (dfn.vkCreatePipelineLayout( - device, &transfer_pipeline_layout_create_info, nullptr, - &transfer_pipeline_layouts_[i]) != VK_SUCCESS) { + + // Transfer and clear vertex buffer, for quads of up to tile granularity. + transfer_vertex_buffer_pool_ = + std::make_unique( + provider, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + std::max(ui::vulkan::VulkanUploadBufferPool::kDefaultPageSize, + sizeof(float) * 2 * 6 * + Transfer::kMaxCutoutBorderRectangles * + xenos::kEdramTileCount)); + + // Transfer vertex shader. + transfer_passthrough_vertex_shader_ = ui::vulkan::util::CreateShaderModule( + provider, shaders::passthrough_position_xy_vs, + sizeof(shaders::passthrough_position_xy_vs)); + if (transfer_passthrough_vertex_shader_ == VK_NULL_HANDLE) { XELOGE( "VulkanRenderTargetCache: Failed to create the render target " - "ownership transfer pipeline layout {}", - i); + "ownership transfer vertex shader"); Shutdown(); return false; } + + // Transfer pipeline layouts. 
+ VkDescriptorSetLayout transfer_pipeline_layout_descriptor_set_layouts + [kTransferUsedDescriptorSetCount]; + VkPushConstantRange transfer_pipeline_layout_push_constant_range; + transfer_pipeline_layout_push_constant_range.stageFlags = + VK_SHADER_STAGE_FRAGMENT_BIT; + transfer_pipeline_layout_push_constant_range.offset = 0; + VkPipelineLayoutCreateInfo transfer_pipeline_layout_create_info; + transfer_pipeline_layout_create_info.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + transfer_pipeline_layout_create_info.pNext = nullptr; + transfer_pipeline_layout_create_info.flags = 0; + transfer_pipeline_layout_create_info.pSetLayouts = + transfer_pipeline_layout_descriptor_set_layouts; + transfer_pipeline_layout_create_info.pPushConstantRanges = + &transfer_pipeline_layout_push_constant_range; + for (size_t i = 0; i < size_t(TransferPipelineLayoutIndex::kCount); ++i) { + const TransferPipelineLayoutInfo& transfer_pipeline_layout_info = + kTransferPipelineLayoutInfos[i]; + transfer_pipeline_layout_create_info.setLayoutCount = 0; + uint32_t transfer_pipeline_layout_descriptor_sets_remaining = + transfer_pipeline_layout_info.used_descriptor_sets; + uint32_t transfer_pipeline_layout_descriptor_set_index; + while (xe::bit_scan_forward( + transfer_pipeline_layout_descriptor_sets_remaining, + &transfer_pipeline_layout_descriptor_set_index)) { + transfer_pipeline_layout_descriptor_sets_remaining &= + ~(uint32_t(1) << transfer_pipeline_layout_descriptor_set_index); + VkDescriptorSetLayout transfer_pipeline_layout_descriptor_set_layout = + VK_NULL_HANDLE; + switch (TransferUsedDescriptorSet( + transfer_pipeline_layout_descriptor_set_index)) { + case kTransferUsedDescriptorSetHostDepthBuffer: + transfer_pipeline_layout_descriptor_set_layout = + descriptor_set_layout_storage_buffer_; + break; + case kTransferUsedDescriptorSetHostDepthStencilTextures: + case kTransferUsedDescriptorSetDepthStencilTextures: + transfer_pipeline_layout_descriptor_set_layout = + 
descriptor_set_layout_sampled_image_x2_; + break; + case kTransferUsedDescriptorSetColorTexture: + transfer_pipeline_layout_descriptor_set_layout = + descriptor_set_layout_sampled_image_; + break; + default: + assert_unhandled_case(TransferUsedDescriptorSet( + transfer_pipeline_layout_descriptor_set_index)); + } + transfer_pipeline_layout_descriptor_set_layouts + [transfer_pipeline_layout_create_info.setLayoutCount++] = + transfer_pipeline_layout_descriptor_set_layout; + } + transfer_pipeline_layout_push_constant_range.size = uint32_t( + sizeof(uint32_t) * + xe::bit_count( + transfer_pipeline_layout_info.used_push_constant_dwords)); + transfer_pipeline_layout_create_info.pushConstantRangeCount = + transfer_pipeline_layout_info.used_push_constant_dwords ? 1 : 0; + if (dfn.vkCreatePipelineLayout( + device, &transfer_pipeline_layout_create_info, nullptr, + &transfer_pipeline_layouts_[i]) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target " + "ownership transfer pipeline layout {}", + i); + Shutdown(); + return false; + } + } + + // Dump pipeline layouts. 
+ VkDescriptorSetLayout + dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetCount]; + dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetEdram] = + descriptor_set_layout_storage_buffer_; + dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetSource] = + descriptor_set_layout_sampled_image_; + VkPushConstantRange dump_pipeline_layout_push_constant_range; + dump_pipeline_layout_push_constant_range.stageFlags = + VK_SHADER_STAGE_COMPUTE_BIT; + dump_pipeline_layout_push_constant_range.offset = 0; + dump_pipeline_layout_push_constant_range.size = + sizeof(uint32_t) * kDumpPushConstantCount; + VkPipelineLayoutCreateInfo dump_pipeline_layout_create_info; + dump_pipeline_layout_create_info.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + dump_pipeline_layout_create_info.pNext = nullptr; + dump_pipeline_layout_create_info.flags = 0; + dump_pipeline_layout_create_info.setLayoutCount = + uint32_t(xe::countof(dump_pipeline_layout_descriptor_set_layouts)); + dump_pipeline_layout_create_info.pSetLayouts = + dump_pipeline_layout_descriptor_set_layouts; + dump_pipeline_layout_create_info.pushConstantRangeCount = 1; + dump_pipeline_layout_create_info.pPushConstantRanges = + &dump_pipeline_layout_push_constant_range; + if (dfn.vkCreatePipelineLayout(device, &dump_pipeline_layout_create_info, + nullptr, &dump_pipeline_layout_color_) != + VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the color render target " + "dumping pipeline layout"); + Shutdown(); + return false; + } + dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetSource] = + descriptor_set_layout_sampled_image_x2_; + if (dfn.vkCreatePipelineLayout(device, &dump_pipeline_layout_create_info, + nullptr, &dump_pipeline_layout_depth_) != + VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the depth render target " + "dumping pipeline layout"); + Shutdown(); + return false; + } + } else if (path_ == Path::kPixelShaderInterlock) { + // 
Pixel (fragment) shader interlock. + + // Blending is done in linear space directly in shaders. + gamma_render_target_as_srgb_ = false; + + // Always true float24 depth rounded to the nearest even. + depth_float24_round_ = true; + + // The pipeline layout and the pipelines for clearing the EDRAM buffer in + // resolves. + VkPushConstantRange resolve_fsi_clear_push_constant_range; + resolve_fsi_clear_push_constant_range.stageFlags = + VK_SHADER_STAGE_COMPUTE_BIT; + resolve_fsi_clear_push_constant_range.offset = 0; + resolve_fsi_clear_push_constant_range.size = + sizeof(draw_util::ResolveClearShaderConstants); + VkPipelineLayoutCreateInfo resolve_fsi_clear_pipeline_layout_create_info; + resolve_fsi_clear_pipeline_layout_create_info.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + resolve_fsi_clear_pipeline_layout_create_info.pNext = nullptr; + resolve_fsi_clear_pipeline_layout_create_info.flags = 0; + resolve_fsi_clear_pipeline_layout_create_info.setLayoutCount = 1; + resolve_fsi_clear_pipeline_layout_create_info.pSetLayouts = + &descriptor_set_layout_storage_buffer_; + resolve_fsi_clear_pipeline_layout_create_info.pushConstantRangeCount = 1; + resolve_fsi_clear_pipeline_layout_create_info.pPushConstantRanges = + &resolve_fsi_clear_push_constant_range; + if (dfn.vkCreatePipelineLayout( + device, &resolve_fsi_clear_pipeline_layout_create_info, nullptr, + &resolve_fsi_clear_pipeline_layout_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the resolve EDRAM buffer " + "clear pipeline layout"); + Shutdown(); + return false; + } + resolve_fsi_clear_32bpp_pipeline_ = ui::vulkan::util::CreateComputePipeline( + provider, resolve_fsi_clear_pipeline_layout_, + draw_resolution_scaled ? shaders::resolve_clear_32bpp_scaled_cs + : shaders::resolve_clear_32bpp_cs, + draw_resolution_scaled ? 
sizeof(shaders::resolve_clear_32bpp_scaled_cs) + : sizeof(shaders::resolve_clear_32bpp_cs)); + if (resolve_fsi_clear_32bpp_pipeline_ == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the 32bpp resolve EDRAM " + "buffer clear pipeline"); + Shutdown(); + return false; + } + resolve_fsi_clear_64bpp_pipeline_ = ui::vulkan::util::CreateComputePipeline( + provider, resolve_fsi_clear_pipeline_layout_, + draw_resolution_scaled ? shaders::resolve_clear_64bpp_scaled_cs + : shaders::resolve_clear_64bpp_cs, + draw_resolution_scaled ? sizeof(shaders::resolve_clear_64bpp_scaled_cs) + : sizeof(shaders::resolve_clear_64bpp_cs)); + if (resolve_fsi_clear_64bpp_pipeline_ == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the 64bpp resolve EDRAM " + "buffer clear pipeline"); + Shutdown(); + return false; + } + + // Common render pass. + VkSubpassDescription fsi_subpass = {}; + fsi_subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + // Fragment shader interlock provides synchronization and ordering within a + // subpass, create an external by-region dependency to maintain interlocking + // between passes. Framebuffer-global dependencies will be made with + // explicit barriers when the addressing of the EDRAM buffer relatively to + // the fragment coordinates is changed. 
+ VkSubpassDependency fsi_subpass_dependencies[2]; + fsi_subpass_dependencies[0].srcSubpass = VK_SUBPASS_EXTERNAL; + fsi_subpass_dependencies[0].dstSubpass = 0; + fsi_subpass_dependencies[0].srcStageMask = + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + fsi_subpass_dependencies[0].dstStageMask = + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + fsi_subpass_dependencies[0].srcAccessMask = + VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + fsi_subpass_dependencies[0].dstAccessMask = + VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + fsi_subpass_dependencies[0].dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT; + fsi_subpass_dependencies[1] = fsi_subpass_dependencies[0]; + std::swap(fsi_subpass_dependencies[1].srcSubpass, + fsi_subpass_dependencies[1].dstSubpass); + VkRenderPassCreateInfo fsi_render_pass_create_info; + fsi_render_pass_create_info.sType = + VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; + fsi_render_pass_create_info.pNext = nullptr; + fsi_render_pass_create_info.flags = 0; + fsi_render_pass_create_info.attachmentCount = 0; + fsi_render_pass_create_info.pAttachments = nullptr; + fsi_render_pass_create_info.subpassCount = 1; + fsi_render_pass_create_info.pSubpasses = &fsi_subpass; + fsi_render_pass_create_info.dependencyCount = + uint32_t(xe::countof(fsi_subpass_dependencies)); + fsi_render_pass_create_info.pDependencies = fsi_subpass_dependencies; + if (dfn.vkCreateRenderPass(device, &fsi_render_pass_create_info, nullptr, + &fsi_render_pass_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the fragment shader " + "interlock render backend render pass"); + Shutdown(); + return false; + } + + // Common framebuffer. 
+ VkFramebufferCreateInfo fsi_framebuffer_create_info; + fsi_framebuffer_create_info.sType = + VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + fsi_framebuffer_create_info.pNext = nullptr; + fsi_framebuffer_create_info.flags = 0; + fsi_framebuffer_create_info.renderPass = fsi_render_pass_; + fsi_framebuffer_create_info.attachmentCount = 0; + fsi_framebuffer_create_info.pAttachments = nullptr; + fsi_framebuffer_create_info.width = std::min( + xenos::kTexture2DCubeMaxWidthHeight * draw_resolution_scale_x(), + device_limits.maxFramebufferWidth); + fsi_framebuffer_create_info.height = std::min( + xenos::kTexture2DCubeMaxWidthHeight * draw_resolution_scale_y(), + device_limits.maxFramebufferHeight); + fsi_framebuffer_create_info.layers = 1; + if (dfn.vkCreateFramebuffer(device, &fsi_framebuffer_create_info, nullptr, + &fsi_framebuffer_.framebuffer) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the fragment shader " + "interlock render backend framebuffer"); + Shutdown(); + return false; + } + fsi_framebuffer_.host_extent.width = fsi_framebuffer_create_info.width; + fsi_framebuffer_.host_extent.height = fsi_framebuffer_create_info.height; + } else { + assert_unhandled_case(path_); + Shutdown(); + return false; } - // Dump pipeline layouts. 
- VkDescriptorSetLayout - dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetCount]; - dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetEdram] = - descriptor_set_layout_storage_buffer_; - dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetSource] = - descriptor_set_layout_sampled_image_; - VkPushConstantRange dump_pipeline_layout_push_constant_range; - dump_pipeline_layout_push_constant_range.stageFlags = - VK_SHADER_STAGE_COMPUTE_BIT; - dump_pipeline_layout_push_constant_range.offset = 0; - dump_pipeline_layout_push_constant_range.size = - sizeof(uint32_t) * kDumpPushConstantCount; - VkPipelineLayoutCreateInfo dump_pipeline_layout_create_info; - dump_pipeline_layout_create_info.sType = - VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - dump_pipeline_layout_create_info.pNext = nullptr; - dump_pipeline_layout_create_info.flags = 0; - dump_pipeline_layout_create_info.setLayoutCount = - uint32_t(xe::countof(dump_pipeline_layout_descriptor_set_layouts)); - dump_pipeline_layout_create_info.pSetLayouts = - dump_pipeline_layout_descriptor_set_layouts; - dump_pipeline_layout_create_info.pushConstantRangeCount = 1; - dump_pipeline_layout_create_info.pPushConstantRanges = - &dump_pipeline_layout_push_constant_range; - if (dfn.vkCreatePipelineLayout(device, &dump_pipeline_layout_create_info, - nullptr, - &dump_pipeline_layout_color_) != VK_SUCCESS) { - XELOGE( - "VulkanRenderTargetCache: Failed to create the color render target " - "dumping pipeline layout"); - Shutdown(); - return false; - } - dump_pipeline_layout_descriptor_set_layouts[kDumpDescriptorSetSource] = - descriptor_set_layout_sampled_image_x2_; - if (dfn.vkCreatePipelineLayout(device, &dump_pipeline_layout_create_info, - nullptr, - &dump_pipeline_layout_depth_) != VK_SUCCESS) { - XELOGE( - "VulkanRenderTargetCache: Failed to create the depth render target " - "dumping pipeline layout"); - Shutdown(); - return false; - } + // Reset the last update structures, to keep the defaults 
consistent between + // paths regardless of whether the update for the path actually modifies them. + last_update_render_pass_key_ = RenderPassKey(); + last_update_render_pass_ = VK_NULL_HANDLE; + last_update_framebuffer_pitch_tiles_at_32bpp_ = 0; + std::memset(last_update_framebuffer_attachments_, 0, + sizeof(last_update_framebuffer_attachments_)); + last_update_framebuffer_ = VK_NULL_HANDLE; InitializeCommon(); return true; @@ -667,6 +893,18 @@ void VulkanRenderTargetCache::Shutdown(bool from_destructor) { // already too late. DestroyAllRenderTargets(true); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipeline, device, + resolve_fsi_clear_64bpp_pipeline_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipeline, device, + resolve_fsi_clear_32bpp_pipeline_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipelineLayout, device, + resolve_fsi_clear_pipeline_layout_); + + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyFramebuffer, device, + fsi_framebuffer_.framebuffer); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyRenderPass, device, + fsi_render_pass_); + for (const auto& dump_pipeline_pair : dump_pipelines_) { // May be null to prevent recreation attempts. if (dump_pipeline_pair.second != VK_NULL_HANDLE) { @@ -951,25 +1189,81 @@ bool VulkanRenderTargetCache::Resolve(const Memory& memory, bool clear_depth = resolve_info.IsClearingDepth(); bool clear_color = resolve_info.IsClearingColor(); if (clear_depth || clear_color) { - // TODO(Triang3l): Fragment shader interlock path EDRAM buffer clearing. - if (GetPath() == Path::kHostRenderTargets) { - Transfer::Rectangle clear_rectangle; - RenderTarget* clear_render_targets[2]; - // If PrepareHostRenderTargetsResolveClear returns false, may be just an - // empty region (success) or an error - don't care. 
- if (PrepareHostRenderTargetsResolveClear( - resolve_info, clear_rectangle, clear_render_targets[0], - clear_transfers_[0], clear_render_targets[1], - clear_transfers_[1])) { - uint64_t clear_values[2]; - clear_values[0] = resolve_info.rb_depth_clear; - clear_values[1] = resolve_info.rb_color_clear | - (uint64_t(resolve_info.rb_color_clear_lo) << 32); - PerformTransfersAndResolveClears(2, clear_render_targets, - clear_transfers_, clear_values, - &clear_rectangle); - } - cleared = true; + switch (GetPath()) { + case Path::kHostRenderTargets: { + Transfer::Rectangle clear_rectangle; + RenderTarget* clear_render_targets[2]; + // If PrepareHostRenderTargetsResolveClear returns false, may be just an + // empty region (success) or an error - don't care. + if (PrepareHostRenderTargetsResolveClear( + resolve_info, clear_rectangle, clear_render_targets[0], + clear_transfers_[0], clear_render_targets[1], + clear_transfers_[1])) { + uint64_t clear_values[2]; + clear_values[0] = resolve_info.rb_depth_clear; + clear_values[1] = resolve_info.rb_color_clear | + (uint64_t(resolve_info.rb_color_clear_lo) << 32); + PerformTransfersAndResolveClears(2, clear_render_targets, + clear_transfers_, clear_values, + &clear_rectangle); + } + cleared = true; + } break; + case Path::kPixelShaderInterlock: { + UseEdramBuffer(EdramBufferUsage::kComputeWrite); + // Should be safe to only commit once (if was accessed as unordered or + // with fragment shader interlock previously - if there was nothing to + // copy, only to clear, for some reason, for instance), overlap of the + // depth and the color ranges is highly unlikely. 
+ CommitEdramBufferShaderWrites(); + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_COMPUTE, resolve_fsi_clear_pipeline_layout_, + 0, 1, &edram_storage_buffer_descriptor_set_, 0, nullptr); + std::pair clear_group_count = + resolve_info.GetClearShaderGroupCount(draw_resolution_scale_x(), + draw_resolution_scale_y()); + assert_true(clear_group_count.first && clear_group_count.second); + if (clear_depth) { + command_processor_.BindExternalComputePipeline( + resolve_fsi_clear_32bpp_pipeline_); + draw_util::ResolveClearShaderConstants depth_clear_constants; + resolve_info.GetDepthClearShaderConstants(depth_clear_constants); + command_buffer.CmdVkPushConstants( + resolve_fsi_clear_pipeline_layout_, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(depth_clear_constants), &depth_clear_constants); + command_processor_.SubmitBarriers(true); + command_buffer.CmdVkDispatch(clear_group_count.first, + clear_group_count.second, 1); + } + if (clear_color) { + command_processor_.BindExternalComputePipeline( + resolve_info.color_edram_info.format_is_64bpp + ? resolve_fsi_clear_64bpp_pipeline_ + : resolve_fsi_clear_32bpp_pipeline_); + draw_util::ResolveClearShaderConstants color_clear_constants; + resolve_info.GetColorClearShaderConstants(color_clear_constants); + if (clear_depth) { + // Non-RT-specific constants have already been set. 
+ command_buffer.CmdVkPushConstants( + resolve_fsi_clear_pipeline_layout_, VK_SHADER_STAGE_COMPUTE_BIT, + uint32_t(offsetof(draw_util::ResolveClearShaderConstants, + rt_specific)), + sizeof(color_clear_constants.rt_specific), + &color_clear_constants.rt_specific); + } else { + command_buffer.CmdVkPushConstants( + resolve_fsi_clear_pipeline_layout_, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(color_clear_constants), &color_clear_constants); + } + command_processor_.SubmitBarriers(true); + command_buffer.CmdVkDispatch(clear_group_count.first, + clear_group_count.second, 1); + } + MarkEdramBufferModified(); + cleared = true; + } break; + default: + assert_unhandled_case(GetPath()); } } else { cleared = true; @@ -987,128 +1281,161 @@ bool VulkanRenderTargetCache::Update( return false; } - // TODO(Triang3l): All paths (FSI). - - RenderTarget* const* depth_and_color_render_targets = - last_update_accumulated_render_targets(); - - PerformTransfersAndResolveClears(1 + xenos::kMaxColorRenderTargets, - depth_and_color_render_targets, - last_update_transfers()); - auto rb_surface_info = register_file().Get(); - uint32_t render_targets_are_srgb = - gamma_render_target_as_srgb_ - ? last_update_accumulated_color_targets_are_gamma() - : 0; RenderPassKey render_pass_key; + // Needed even with the fragment shader interlock render backend for passing + // the sample count to the pipeline cache. render_pass_key.msaa_samples = rb_surface_info.msaa_samples; - if (depth_and_color_render_targets[0]) { - render_pass_key.depth_and_color_used |= 1 << 0; - render_pass_key.depth_format = - depth_and_color_render_targets[0]->key().GetDepthFormat(); - } - if (depth_and_color_render_targets[1]) { - render_pass_key.depth_and_color_used |= 1 << 1; - render_pass_key.color_0_view_format = - (render_targets_are_srgb & (1 << 0)) - ? 
xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA - : depth_and_color_render_targets[1]->key().GetColorFormat(); - } - if (depth_and_color_render_targets[2]) { - render_pass_key.depth_and_color_used |= 1 << 2; - render_pass_key.color_1_view_format = - (render_targets_are_srgb & (1 << 1)) - ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA - : depth_and_color_render_targets[2]->key().GetColorFormat(); - } - if (depth_and_color_render_targets[3]) { - render_pass_key.depth_and_color_used |= 1 << 3; - render_pass_key.color_2_view_format = - (render_targets_are_srgb & (1 << 2)) - ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA - : depth_and_color_render_targets[3]->key().GetColorFormat(); - } - if (depth_and_color_render_targets[4]) { - render_pass_key.depth_and_color_used |= 1 << 4; - render_pass_key.color_3_view_format = - (render_targets_are_srgb & (1 << 3)) - ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA - : depth_and_color_render_targets[4]->key().GetColorFormat(); - } - const Framebuffer* framebuffer = last_update_framebuffer_; - VkRenderPass render_pass = last_update_render_pass_key_ == render_pass_key - ? last_update_render_pass_ - : VK_NULL_HANDLE; - if (render_pass == VK_NULL_HANDLE) { - render_pass = GetRenderPass(render_pass_key); - if (render_pass == VK_NULL_HANDLE) { + switch (GetPath()) { + case Path::kHostRenderTargets: { + RenderTarget* const* depth_and_color_render_targets = + last_update_accumulated_render_targets(); + + PerformTransfersAndResolveClears(1 + xenos::kMaxColorRenderTargets, + depth_and_color_render_targets, + last_update_transfers()); + + uint32_t render_targets_are_srgb = + gamma_render_target_as_srgb_ + ? 
last_update_accumulated_color_targets_are_gamma() + : 0; + + if (depth_and_color_render_targets[0]) { + render_pass_key.depth_and_color_used |= 1 << 0; + render_pass_key.depth_format = + depth_and_color_render_targets[0]->key().GetDepthFormat(); + } + if (depth_and_color_render_targets[1]) { + render_pass_key.depth_and_color_used |= 1 << 1; + render_pass_key.color_0_view_format = + (render_targets_are_srgb & (1 << 0)) + ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA + : depth_and_color_render_targets[1]->key().GetColorFormat(); + } + if (depth_and_color_render_targets[2]) { + render_pass_key.depth_and_color_used |= 1 << 2; + render_pass_key.color_1_view_format = + (render_targets_are_srgb & (1 << 1)) + ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA + : depth_and_color_render_targets[2]->key().GetColorFormat(); + } + if (depth_and_color_render_targets[3]) { + render_pass_key.depth_and_color_used |= 1 << 3; + render_pass_key.color_2_view_format = + (render_targets_are_srgb & (1 << 2)) + ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA + : depth_and_color_render_targets[3]->key().GetColorFormat(); + } + if (depth_and_color_render_targets[4]) { + render_pass_key.depth_and_color_used |= 1 << 4; + render_pass_key.color_3_view_format = + (render_targets_are_srgb & (1 << 3)) + ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA + : depth_and_color_render_targets[4]->key().GetColorFormat(); + } + + const Framebuffer* framebuffer = last_update_framebuffer_; + VkRenderPass render_pass = last_update_render_pass_key_ == render_pass_key + ? last_update_render_pass_ + : VK_NULL_HANDLE; + if (render_pass == VK_NULL_HANDLE) { + render_pass = GetHostRenderTargetsRenderPass(render_pass_key); + if (render_pass == VK_NULL_HANDLE) { + return false; + } + // Framebuffer for a different render pass needed now. 
+ framebuffer = nullptr; + } + + uint32_t pitch_tiles_at_32bpp = + ((rb_surface_info.surface_pitch << uint32_t( + rb_surface_info.msaa_samples >= xenos::MsaaSamples::k4X)) + + (xenos::kEdramTileWidthSamples - 1)) / + xenos::kEdramTileWidthSamples; + if (framebuffer) { + if (last_update_framebuffer_pitch_tiles_at_32bpp_ != + pitch_tiles_at_32bpp || + std::memcmp(last_update_framebuffer_attachments_, + depth_and_color_render_targets, + sizeof(last_update_framebuffer_attachments_))) { + framebuffer = nullptr; + } + } + if (!framebuffer) { + framebuffer = GetHostRenderTargetsFramebuffer( + render_pass_key, pitch_tiles_at_32bpp, + depth_and_color_render_targets); + if (!framebuffer) { + return false; + } + } + + // Successful update - write the new configuration. + last_update_render_pass_key_ = render_pass_key; + last_update_render_pass_ = render_pass; + last_update_framebuffer_pitch_tiles_at_32bpp_ = pitch_tiles_at_32bpp; + std::memcpy(last_update_framebuffer_attachments_, + depth_and_color_render_targets, + sizeof(last_update_framebuffer_attachments_)); + last_update_framebuffer_ = framebuffer; + + // Transition the used render targets. + for (uint32_t i = 0; i < 1 + xenos::kMaxColorRenderTargets; ++i) { + RenderTarget* rt = depth_and_color_render_targets[i]; + if (!rt) { + continue; + } + auto& vulkan_rt = *static_cast(rt); + VkPipelineStageFlags rt_dst_stage_mask; + VkAccessFlags rt_dst_access_mask; + VkImageLayout rt_new_layout; + VulkanRenderTarget::GetDrawUsage(i == 0, &rt_dst_stage_mask, + &rt_dst_access_mask, &rt_new_layout); + command_processor_.PushImageMemoryBarrier( + vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + i ? 
VK_IMAGE_ASPECT_COLOR_BIT + : (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)), + vulkan_rt.current_stage_mask(), rt_dst_stage_mask, + vulkan_rt.current_access_mask(), rt_dst_access_mask, + vulkan_rt.current_layout(), rt_new_layout); + vulkan_rt.SetUsage(rt_dst_stage_mask, rt_dst_access_mask, + rt_new_layout); + } + } break; + + case Path::kPixelShaderInterlock: { + // For FSI, only the barrier is needed - already scheduled if required. + // But the buffer will be used for FSI drawing now. + UseEdramBuffer(EdramBufferUsage::kFragmentReadWrite); + // Commit preceding unordered (but not FSI) writes like clears as they + // aren't synchronized with FSI accesses. + CommitEdramBufferShaderWrites( + EdramBufferModificationStatus::kViaUnordered); + // TODO(Triang3l): Check if this draw call modifies color or depth / + // stencil, at least coarsely, to prevent useless barriers. + MarkEdramBufferModified( + EdramBufferModificationStatus::kViaFragmentShaderInterlock); + last_update_render_pass_key_ = render_pass_key; + last_update_render_pass_ = fsi_render_pass_; + last_update_framebuffer_ = &fsi_framebuffer_; + } break; + + default: + assert_unhandled_case(GetPath()); return false; - } - // Framebuffer for a different render pass needed now. 
- framebuffer = nullptr; - } - - uint32_t pitch_tiles_at_32bpp = - ((rb_surface_info.surface_pitch - << uint32_t(rb_surface_info.msaa_samples >= xenos::MsaaSamples::k4X)) + - (xenos::kEdramTileWidthSamples - 1)) / - xenos::kEdramTileWidthSamples; - if (framebuffer) { - if (last_update_framebuffer_pitch_tiles_at_32bpp_ != pitch_tiles_at_32bpp || - std::memcmp(last_update_framebuffer_attachments_, - depth_and_color_render_targets, - sizeof(last_update_framebuffer_attachments_))) { - framebuffer = nullptr; - } - } - if (!framebuffer) { - framebuffer = GetFramebuffer(render_pass_key, pitch_tiles_at_32bpp, - depth_and_color_render_targets); - if (!framebuffer) { - return false; - } - } - - // Successful update - write the new configuration. - last_update_render_pass_key_ = render_pass_key; - last_update_render_pass_ = render_pass; - last_update_framebuffer_pitch_tiles_at_32bpp_ = pitch_tiles_at_32bpp; - std::memcpy(last_update_framebuffer_attachments_, - depth_and_color_render_targets, - sizeof(last_update_framebuffer_attachments_)); - last_update_framebuffer_ = framebuffer; - - // Transition the used render targets. - for (uint32_t i = 0; i < 1 + xenos::kMaxColorRenderTargets; ++i) { - RenderTarget* rt = depth_and_color_render_targets[i]; - if (!rt) { - continue; - } - auto& vulkan_rt = *static_cast(rt); - VkPipelineStageFlags rt_dst_stage_mask; - VkAccessFlags rt_dst_access_mask; - VkImageLayout rt_new_layout; - VulkanRenderTarget::GetDrawUsage(i == 0, &rt_dst_stage_mask, - &rt_dst_access_mask, &rt_new_layout); - command_processor_.PushImageMemoryBarrier( - vulkan_rt.image(), - ui::vulkan::util::InitializeSubresourceRange( - i ? 
VK_IMAGE_ASPECT_COLOR_BIT - : (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)), - vulkan_rt.current_stage_mask(), rt_dst_stage_mask, - vulkan_rt.current_access_mask(), rt_dst_access_mask, - vulkan_rt.current_layout(), rt_new_layout); - vulkan_rt.SetUsage(rt_dst_stage_mask, rt_dst_access_mask, rt_new_layout); } return true; } -VkRenderPass VulkanRenderTargetCache::GetRenderPass(RenderPassKey key) { - auto it = render_passes_.find(key.key); +VkRenderPass VulkanRenderTargetCache::GetHostRenderTargetsRenderPass( + RenderPassKey key) { + assert_true(GetPath() == Path::kHostRenderTargets); + + auto it = render_passes_.find(key); if (it != render_passes_.end()) { return it->second; } @@ -1244,10 +1571,10 @@ VkRenderPass VulkanRenderTargetCache::GetRenderPass(RenderPassKey key) { if (dfn.vkCreateRenderPass(device, &render_pass_create_info, nullptr, &render_pass) != VK_SUCCESS) { XELOGE("VulkanRenderTargetCache: Failed to create a render pass"); - render_passes_.emplace(key.key, VK_NULL_HANDLE); + render_passes_.emplace(key, VK_NULL_HANDLE); return VK_NULL_HANDLE; } - render_passes_.emplace(key.key, render_pass); + render_passes_.emplace(key, render_pass); return render_pass; } @@ -1353,15 +1680,17 @@ VulkanRenderTargetCache::VulkanRenderTarget::~VulkanRenderTarget() { } uint32_t VulkanRenderTargetCache::GetMaxRenderTargetWidth() const { - const ui::vulkan::VulkanProvider& provider = - command_processor_.GetVulkanProvider(); - return provider.device_properties().limits.maxFramebufferWidth; + const VkPhysicalDeviceLimits& device_limits = + command_processor_.GetVulkanProvider().device_properties().limits; + return std::min(device_limits.maxFramebufferWidth, + device_limits.maxImageDimension2D); } uint32_t VulkanRenderTargetCache::GetMaxRenderTargetHeight() const { - const ui::vulkan::VulkanProvider& provider = - command_processor_.GetVulkanProvider(); - return provider.device_properties().limits.maxFramebufferHeight; + const VkPhysicalDeviceLimits& device_limits = 
+ command_processor_.GetVulkanProvider().device_properties().limits; + return std::min(device_limits.maxFramebufferHeight, + device_limits.maxImageDimension2D); } RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( @@ -1615,6 +1944,12 @@ bool VulkanRenderTargetCache::IsHostDepthEncodingDifferent( return false; } +void VulkanRenderTargetCache::RequestPixelShaderInterlockBarrier() { + if (edram_buffer_usage_ == EdramBufferUsage::kFragmentReadWrite) { + CommitEdramBufferShaderWrites(); + } +} + void VulkanRenderTargetCache::GetEdramBufferUsageMasks( EdramBufferUsage usage, VkPipelineStageFlags& stage_mask_out, VkAccessFlags& access_mask_out) { @@ -1715,7 +2050,7 @@ void VulkanRenderTargetCache::CommitEdramBufferShaderWrites( } const VulkanRenderTargetCache::Framebuffer* -VulkanRenderTargetCache::GetFramebuffer( +VulkanRenderTargetCache::GetHostRenderTargetsFramebuffer( RenderPassKey render_pass_key, uint32_t pitch_tiles_at_32bpp, const RenderTarget* const* depth_and_color_render_targets) { FramebufferKey key; @@ -1749,8 +2084,10 @@ VulkanRenderTargetCache::GetFramebuffer( command_processor_.GetVulkanProvider(); const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); VkDevice device = provider.device(); + const VkPhysicalDeviceLimits& device_limits = + provider.device_properties().limits; - VkRenderPass render_pass = GetRenderPass(render_pass_key); + VkRenderPass render_pass = GetHostRenderTargetsRenderPass(render_pass_key); if (render_pass == VK_NULL_HANDLE) { return nullptr; } @@ -1789,12 +2126,19 @@ VulkanRenderTargetCache::GetFramebuffer( render_pass_key.msaa_samples); } else { assert_zero(render_pass_key.depth_and_color_used); - host_extent.width = 0; - host_extent.height = 0; + // Still needed for occlusion queries. + host_extent.width = xenos::kTexture2DCubeMaxWidthHeight; + host_extent.height = xenos::kTexture2DCubeMaxWidthHeight; } - // Vulkan requires width and height greater than 0. 
- framebuffer_create_info.width = std::max(host_extent.width, uint32_t(1)); - framebuffer_create_info.height = std::max(host_extent.height, uint32_t(1)); + // Limiting to the device limit for the case of no attachments, for which + // there's no limit imposed by the sizes of the attachments that have been + // created successfully. + host_extent.width = std::min(host_extent.width * draw_resolution_scale_x(), + device_limits.maxFramebufferWidth); + host_extent.height = std::min(host_extent.height * draw_resolution_scale_y(), + device_limits.maxFramebufferHeight); + framebuffer_create_info.width = host_extent.width; + framebuffer_create_info.height = host_extent.height; framebuffer_create_info.layers = 1; VkFramebuffer framebuffer; if (dfn.vkCreateFramebuffer(device, &framebuffer_create_info, nullptr, @@ -4070,7 +4414,8 @@ VkPipeline const* VulkanRenderTargetCache::GetTransferPipelines( : nullptr; } - VkRenderPass render_pass = GetRenderPass(key.render_pass_key); + VkRenderPass render_pass = + GetHostRenderTargetsRenderPass(key.render_pass_key); VkShaderModule fragment_shader_module = GetTransferShader(key.shader_key); if (render_pass == VK_NULL_HANDLE || fragment_shader_module == VK_NULL_HANDLE) { @@ -4643,7 +4988,8 @@ void VulkanRenderTargetCache::PerformTransfersAndResolveClears( dest_rt_key.GetColorFormat(); transfer_render_pass_key.color_rts_use_transfer_formats = 1; } - VkRenderPass transfer_render_pass = GetRenderPass(transfer_render_pass_key); + VkRenderPass transfer_render_pass = + GetHostRenderTargetsRenderPass(transfer_render_pass_key); if (transfer_render_pass == VK_NULL_HANDLE) { continue; } @@ -4651,7 +4997,7 @@ void VulkanRenderTargetCache::PerformTransfersAndResolveClears( transfer_framebuffer_render_targets[1 + xenos::kMaxColorRenderTargets] = {}; transfer_framebuffer_render_targets[dest_rt_key.is_depth ? 
0 : 1] = dest_rt; - const Framebuffer* transfer_framebuffer = GetFramebuffer( + const Framebuffer* transfer_framebuffer = GetHostRenderTargetsFramebuffer( transfer_render_pass_key, dest_rt_key.pitch_tiles_at_32bpp, transfer_framebuffer_render_targets); if (!transfer_framebuffer) { diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.h b/src/xenia/gpu/vulkan/vulkan_render_target_cache.h index 6fa9c6ab0..c5032f82d 100644 --- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.h +++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.h @@ -43,6 +43,10 @@ class VulkanRenderTargetCache final : public RenderTargetCache { // true 4x MSAA passes (framebuffers because render target cache render // targets are different for 2x and 4x guest MSAA, pipelines because the // sample mask will have 2 samples excluded for 2x-as-4x). + // This has effect only on the attachments, but even in cases when there + // are no attachments, it can be used to the sample count between + // subsystems, for instance, to specify the desired number of samples to + // use when there are no attachments in pipelines. xenos::MsaaSamples msaa_samples : xenos::kMsaaSamplesBits; // 2 // << 0 is depth, << 1...4 is color. uint32_t depth_and_color_used : 1 + xenos::kMaxColorRenderTargets; // 7 @@ -81,8 +85,9 @@ class VulkanRenderTargetCache final : public RenderTargetCache { static_assert_size(RenderPassKey, sizeof(uint32_t)); struct Framebuffer { - VkFramebuffer framebuffer; - VkExtent2D host_extent; + VkFramebuffer framebuffer = VK_NULL_HANDLE; + VkExtent2D host_extent{}; + Framebuffer() = default; Framebuffer(VkFramebuffer framebuffer, const VkExtent2D& host_extent) : framebuffer(framebuffer), host_extent(host_extent) {} }; @@ -96,15 +101,16 @@ class VulkanRenderTargetCache final : public RenderTargetCache { // Transient descriptor set layouts must be initialized in the command // processor. 
- bool Initialize(); + bool Initialize(uint32_t shared_memory_binding_count); void Shutdown(bool from_destructor = false); void ClearCache() override; void CompletedSubmissionUpdated(); void EndSubmission(); - // TODO(Triang3l): Fragment shader interlock. - Path GetPath() const override { return Path::kHostRenderTargets; } + Path GetPath() const override { return path_; } + + VkBuffer edram_buffer() const { return edram_buffer_; } // Performs the resolve to a shared memory area according to the current // register values, and also clears the render targets if needed. Must be in a @@ -161,7 +167,11 @@ class VulkanRenderTargetCache final : public RenderTargetCache { // Returns the render pass object, or VK_NULL_HANDLE if failed to create. // A render pass managed by the render target cache may be ended and resumed // at any time (to allow for things like copying and texture loading). - VkRenderPass GetRenderPass(RenderPassKey key); + VkRenderPass GetHostRenderTargetsRenderPass(RenderPassKey key); + VkRenderPass GetFragmentShaderInterlockRenderPass() const { + assert_true(GetPath() == Path::kPixelShaderInterlock); + return fsi_render_pass_; + } VkFormat GetDepthVulkanFormat(xenos::DepthRenderTargetFormat format) const; VkFormat GetColorVulkanFormat(xenos::ColorRenderTargetFormat format) const; @@ -178,6 +188,8 @@ class VulkanRenderTargetCache final : public RenderTargetCache { bool IsHostDepthEncodingDifferent( xenos::DepthRenderTargetFormat format) const override; + void RequestPixelShaderInterlockBarrier() override; + private: enum class EdramBufferUsage { // There's no need for combined fragment and compute usages. @@ -251,6 +263,8 @@ class VulkanRenderTargetCache final : public RenderTargetCache { VulkanCommandProcessor& command_processor_; TraceWriter& trace_writer_; + Path path_ = Path::kHostRenderTargets; + // Accessible in fragment and compute shaders. 
VkDescriptorSetLayout descriptor_set_layout_storage_buffer_ = VK_NULL_HANDLE; VkDescriptorSetLayout descriptor_set_layout_sampled_image_ = VK_NULL_HANDLE; @@ -276,9 +290,18 @@ class VulkanRenderTargetCache final : public RenderTargetCache { std::array resolve_copy_pipelines_{}; - // RenderPassKey::key -> VkRenderPass. - // VK_NULL_HANDLE if failed to create. - std::unordered_map render_passes_; + // On the fragment shader interlock path, the render pass key is used purely + // for passing parameters to pipeline setup - there's always only one render + // pass. + RenderPassKey last_update_render_pass_key_; + VkRenderPass last_update_render_pass_ = VK_NULL_HANDLE; + // The pitch is not used on the fragment shader interlock path. + uint32_t last_update_framebuffer_pitch_tiles_at_32bpp_ = 0; + // The attachments are not used on the fragment shader interlock path. + const RenderTarget* const* + last_update_framebuffer_attachments_[1 + xenos::kMaxColorRenderTargets] = + {}; + const Framebuffer* last_update_framebuffer_ = VK_NULL_HANDLE; // For host render targets. @@ -809,7 +832,7 @@ class VulkanRenderTargetCache final : public RenderTargetCache { }; // Returns the framebuffer object, or VK_NULL_HANDLE if failed to create. - const Framebuffer* GetFramebuffer( + const Framebuffer* GetHostRenderTargetsFramebuffer( RenderPassKey render_pass_key, uint32_t pitch_tiles_at_32bpp, const RenderTarget* const* depth_and_color_render_targets); @@ -845,17 +868,13 @@ class VulkanRenderTargetCache final : public RenderTargetCache { bool msaa_2x_attachments_supported_ = false; bool msaa_2x_no_attachments_supported_ = false; + // VK_NULL_HANDLE if failed to create. 
+ std::unordered_map + render_passes_; + std::unordered_map framebuffers_; - RenderPassKey last_update_render_pass_key_; - VkRenderPass last_update_render_pass_ = VK_NULL_HANDLE; - uint32_t last_update_framebuffer_pitch_tiles_at_32bpp_ = 0; - const RenderTarget* const* - last_update_framebuffer_attachments_[1 + xenos::kMaxColorRenderTargets] = - {}; - const Framebuffer* last_update_framebuffer_ = VK_NULL_HANDLE; - // Set 0 - EDRAM storage buffer, set 1 - source depth sampled image (and // unused stencil from the transfer descriptor set), HostDepthStoreConstants // passed via push constants. @@ -895,6 +914,15 @@ class VulkanRenderTargetCache final : public RenderTargetCache { // Temporary storage for DumpRenderTargets. std::vector dump_rectangles_; std::vector dump_invocations_; + + // For pixel (fragment) shader interlock. + + VkRenderPass fsi_render_pass_ = VK_NULL_HANDLE; + Framebuffer fsi_framebuffer_; + + VkPipelineLayout resolve_fsi_clear_pipeline_layout_ = VK_NULL_HANDLE; + VkPipeline resolve_fsi_clear_32bpp_pipeline_ = VK_NULL_HANDLE; + VkPipeline resolve_fsi_clear_64bpp_pipeline_ = VK_NULL_HANDLE; }; } // namespace vulkan diff --git a/src/xenia/ui/vulkan/functions/instance_khr_get_physical_device_properties2.inc b/src/xenia/ui/vulkan/functions/instance_khr_get_physical_device_properties2.inc index 45153db06..bdc483c43 100644 --- a/src/xenia/ui/vulkan/functions/instance_khr_get_physical_device_properties2.inc +++ b/src/xenia/ui/vulkan/functions/instance_khr_get_physical_device_properties2.inc @@ -1,5 +1,7 @@ // VK_KHR_get_physical_device_properties2 functions used in Xenia. // Promoted to Vulkan 1.1 core. 
+XE_UI_VULKAN_FUNCTION_PROMOTED(vkGetPhysicalDeviceFeatures2KHR, + vkGetPhysicalDeviceFeatures2) XE_UI_VULKAN_FUNCTION_PROMOTED(vkGetPhysicalDeviceMemoryProperties2KHR, vkGetPhysicalDeviceMemoryProperties2) XE_UI_VULKAN_FUNCTION_PROMOTED(vkGetPhysicalDeviceProperties2KHR, diff --git a/src/xenia/ui/vulkan/vulkan_provider.cc b/src/xenia/ui/vulkan/vulkan_provider.cc index 3a30220fb..a1ffd3e61 100644 --- a/src/xenia/ui/vulkan/vulkan_provider.cc +++ b/src/xenia/ui/vulkan/vulkan_provider.cc @@ -696,6 +696,7 @@ bool VulkanProvider::Initialize() { device_extensions_.khr_shader_float_controls = true; device_extensions_.khr_spirv_1_4 = true; if (device_properties_.apiVersion >= VK_MAKE_API_VERSION(0, 1, 3, 0)) { + device_extensions_.ext_shader_demote_to_helper_invocation = true; device_extensions_.khr_maintenance4 = true; } } @@ -709,6 +710,8 @@ bool VulkanProvider::Initialize() { {"VK_EXT_fragment_shader_interlock", offsetof(DeviceExtensions, ext_fragment_shader_interlock)}, {"VK_EXT_memory_budget", offsetof(DeviceExtensions, ext_memory_budget)}, + {"VK_EXT_shader_demote_to_helper_invocation", + offsetof(DeviceExtensions, ext_shader_demote_to_helper_invocation)}, {"VK_EXT_shader_stencil_export", offsetof(DeviceExtensions, ext_shader_stencil_export)}, {"VK_KHR_bind_memory2", offsetof(DeviceExtensions, khr_bind_memory2)}, @@ -816,6 +819,16 @@ bool VulkanProvider::Initialize() { // Get additional device properties. 
std::memset(&device_float_controls_properties_, 0, sizeof(device_float_controls_properties_)); + device_float_controls_properties_.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR; + std::memset(&device_fragment_shader_interlock_features_, 0, + sizeof(device_fragment_shader_interlock_features_)); + device_fragment_shader_interlock_features_.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADER_INTERLOCK_FEATURES_EXT; + std::memset(&device_shader_demote_to_helper_invocation_features_, 0, + sizeof(device_shader_demote_to_helper_invocation_features_)); + device_shader_demote_to_helper_invocation_features_.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DEMOTE_TO_HELPER_INVOCATION_FEATURES_EXT; if (instance_extensions_.khr_get_physical_device_properties2) { VkPhysicalDeviceProperties2KHR device_properties_2; device_properties_2.sType = @@ -824,8 +837,6 @@ bool VulkanProvider::Initialize() { VkPhysicalDeviceProperties2KHR* device_properties_2_last = &device_properties_2; if (device_extensions_.khr_shader_float_controls) { - device_float_controls_properties_.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR; device_float_controls_properties_.pNext = nullptr; device_properties_2_last->pNext = &device_float_controls_properties_; device_properties_2_last = @@ -836,6 +847,28 @@ bool VulkanProvider::Initialize() { ifn_.vkGetPhysicalDeviceProperties2KHR(physical_device_, &device_properties_2); } + VkPhysicalDeviceFeatures2KHR device_features_2; + device_features_2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR; + device_features_2.pNext = nullptr; + VkPhysicalDeviceFeatures2KHR* device_features_2_last = &device_features_2; + if (device_extensions_.ext_fragment_shader_interlock) { + device_fragment_shader_interlock_features_.pNext = nullptr; + device_features_2_last->pNext = + &device_fragment_shader_interlock_features_; + device_features_2_last = reinterpret_cast( + 
&device_fragment_shader_interlock_features_); + } + if (device_extensions_.ext_shader_demote_to_helper_invocation) { + device_shader_demote_to_helper_invocation_features_.pNext = nullptr; + device_features_2_last->pNext = + &device_shader_demote_to_helper_invocation_features_; + device_features_2_last = reinterpret_cast( + &device_shader_demote_to_helper_invocation_features_); + } + if (device_features_2_last != &device_features_2) { + ifn_.vkGetPhysicalDeviceFeatures2KHR(physical_device_, + &device_features_2); + } } // Create the device. @@ -888,6 +921,21 @@ bool VulkanProvider::Initialize() { device_create_info_last = reinterpret_cast( &device_portability_subset_features_); } + if (device_extensions_.ext_fragment_shader_interlock) { + // TODO(Triang3l): Enable only needed fragment shader interlock features. + device_fragment_shader_interlock_features_.pNext = nullptr; + device_create_info_last->pNext = + &device_fragment_shader_interlock_features_; + device_create_info_last = reinterpret_cast( + &device_fragment_shader_interlock_features_); + } + if (device_extensions_.ext_shader_demote_to_helper_invocation) { + device_shader_demote_to_helper_invocation_features_.pNext = nullptr; + device_create_info_last->pNext = + &device_shader_demote_to_helper_invocation_features_; + device_create_info_last = reinterpret_cast( + &device_shader_demote_to_helper_invocation_features_); + } if (ifn_.vkCreateDevice(physical_device_, &device_create_info, nullptr, &device_) != VK_SUCCESS) { XELOGE("Failed to create a Vulkan device"); @@ -995,8 +1043,30 @@ bool VulkanProvider::Initialize() { XELOGVK("Vulkan device extensions:"); XELOGVK("* VK_EXT_fragment_shader_interlock: {}", device_extensions_.ext_fragment_shader_interlock ? "yes" : "no"); + if (device_extensions_.ext_fragment_shader_interlock) { + XELOGVK( + " * Sample interlock: {}", + device_fragment_shader_interlock_features_.fragmentShaderSampleInterlock + ? 
"yes" + : "no"); + XELOGVK( + " * Pixel interlock: {}", + device_fragment_shader_interlock_features_.fragmentShaderPixelInterlock + ? "yes" + : "no"); + } XELOGVK("* VK_EXT_memory_budget: {}", device_extensions_.ext_memory_budget ? "yes" : "no"); + XELOGVK( + "* VK_EXT_shader_demote_to_helper_invocation: {}", + device_extensions_.ext_shader_demote_to_helper_invocation ? "yes" : "no"); + if (device_extensions_.ext_shader_demote_to_helper_invocation) { + XELOGVK(" * Demote to helper invocation: {}", + device_shader_demote_to_helper_invocation_features_ + .shaderDemoteToHelperInvocation + ? "yes" + : "no"); + } XELOGVK("* VK_EXT_shader_stencil_export: {}", device_extensions_.ext_shader_stencil_export ? "yes" : "no"); XELOGVK("* VK_KHR_bind_memory2: {}", diff --git a/src/xenia/ui/vulkan/vulkan_provider.h b/src/xenia/ui/vulkan/vulkan_provider.h index 8dc83283c..2d499a614 100644 --- a/src/xenia/ui/vulkan/vulkan_provider.h +++ b/src/xenia/ui/vulkan/vulkan_provider.h @@ -133,6 +133,8 @@ class VulkanProvider : public GraphicsProvider { struct DeviceExtensions { bool ext_fragment_shader_interlock; bool ext_memory_budget; + // Core since 1.3.0. + bool ext_shader_demote_to_helper_invocation; bool ext_shader_stencil_export; // Core since 1.1.0. 
bool khr_bind_memory2; @@ -198,6 +200,14 @@ class VulkanProvider : public GraphicsProvider { device_float_controls_properties() const { return device_float_controls_properties_; } + const VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT& + device_fragment_shader_interlock_features() const { + return device_fragment_shader_interlock_features_; + } + const VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT& + device_shader_demote_to_helper_invocation_features() const { + return device_shader_demote_to_helper_invocation_features_; + } struct Queue { VkQueue queue = VK_NULL_HANDLE; @@ -320,6 +330,10 @@ class VulkanProvider : public GraphicsProvider { uint32_t queue_family_graphics_compute_; uint32_t queue_family_sparse_binding_; VkPhysicalDeviceFloatControlsPropertiesKHR device_float_controls_properties_; + VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT + device_fragment_shader_interlock_features_; + VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT + device_shader_demote_to_helper_invocation_features_; VkDevice device_ = VK_NULL_HANDLE; DeviceFunctions dfn_ = {}; diff --git a/tools/shader-playground/Editor.Designer.cs b/tools/shader-playground/Editor.Designer.cs index f57e550e4..dfb971e91 100644 --- a/tools/shader-playground/Editor.Designer.cs +++ b/tools/shader-playground/Editor.Designer.cs @@ -191,9 +191,10 @@ this.translationComboBox.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList; this.translationComboBox.FormattingEnabled = true; this.translationComboBox.Items.AddRange(new object[] { - "DXBC (RTV/DSV RB)", - "DXBC (ROV RB)", - "SPIR-V"}); + "DXBC (render target RB)", + "DXBC (rasterizer-ordered view RB)", + "SPIR-V (framebuffer RB)", + "SPIR-V (fragment shader interlock RB)"}); this.translationComboBox.Location = new System.Drawing.Point(1224, 0); this.translationComboBox.Margin = new System.Windows.Forms.Padding(3, 0, 3, 0); this.translationComboBox.Name = "translationComboBox"; diff --git a/tools/shader-playground/Editor.cs 
b/tools/shader-playground/Editor.cs index 52d1f6a6e..cb0aa7145 100644 --- a/tools/shader-playground/Editor.cs +++ b/tools/shader-playground/Editor.cs @@ -235,6 +235,7 @@ namespace shader_playground { outputType = "dxbctext"; break; case 2: + case 3: outputType = "spirvtext"; break; } @@ -269,8 +270,9 @@ namespace shader_playground { "--vertex_shader_output_type=" + vertexShaderType, "--dxbc_source_map=true", }; - if (translationComboBox.SelectedIndex == 1) { - startArguments.Add("--shader_output_dxbc_rov=true"); + if (translationComboBox.SelectedIndex == 1 || + translationComboBox.SelectedIndex == 3) { + startArguments.Add("--shader_output_pixel_shader_interlock=true"); } startInfo = new ProcessStartInfo(compilerPath_);