[GPU] Vulkan fragment shader interlock RB and related fixes/cleanup

Also fixes addressing of MSAA samples 2 and 3 for 64bpp color render targets in the ROV RB implementation on Direct3D 12.
Additionally, with FSI/ROV, alpha test and alpha to coverage are now performed only if render target 0 was dynamically written to (following the Direct3D 9 rules for writing to color render targets; it's not clear whether those rules actually apply to alpha testing on Direct3D 9, but this is the safer behavior).
There is also some code cleanup for things spotted during the development of the feature.
Triang3l 2022-10-09 22:06:41 +03:00
parent 9ab4db285c
commit 45050b2380
24 changed files with 6168 additions and 1530 deletions
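To illustrate the Direct3D 12 addressing fix: within an EDRAM tile, one sample row is 80 * resolution_scale_x dwords, and each sample of a 64bpp render target spans two consecutive dwords rather than one. The helper below is a hypothetical host-side sketch of the corrected layout, not code from this commit; previously, the ROV path effectively stepped by a single dword to reach samples 2 and 3 even for 64bpp formats.

#include <array>
#include <cstdint>

// Dword offsets of the four 4x MSAA samples of one pixel within an EDRAM
// tile. On the Xbox 360, 2x MSAA doubles the storage height (sample 1 is
// one sample row below sample 0), and 4x MSAA also doubles the storage
// width (samples 2 and 3 are the horizontal neighbors of 0 and 1).
std::array<uint32_t, 4> SampleDwordOffsets(uint32_t resolution_scale_x,
                                           bool is_64bpp) {
  uint32_t row_pitch_dwords = 80 * resolution_scale_x;
  uint32_t dwords_per_sample = is_64bpp ? 2 : 1;  // The fix: 2 for 64bpp.
  return {0, row_pitch_dwords, dwords_per_sample,
          row_pitch_dwords + dwords_per_sample};
}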


@@ -3189,15 +3189,14 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
// flow.
reg::RB_COLOR_INFO color_infos[4];
float rt_clamp[4][4];
// Two UINT32_MAX if no components actually existing in the RT are written.
uint32_t rt_keep_masks[4][2];
for (uint32_t i = 0; i < 4; ++i) {
auto color_info = regs.Get<reg::RB_COLOR_INFO>(
reg::RB_COLOR_INFO::rt_register_indices[i]);
color_infos[i] = color_info;
if (edram_rov_used) {
// Get the mask for keeping previous color's components unmodified,
// or two UINT32_MAX if no colors actually existing in the RT are written.
DxbcShaderTranslator::ROV_GetColorFormatSystemConstants(
RenderTargetCache::GetPSIColorFormatInfo(
color_info.color_format, (normalized_color_mask >> (i * 4)) & 0b1111,
rt_clamp[i][0], rt_clamp[i][1], rt_clamp[i][2], rt_clamp[i][3],
rt_keep_masks[i][0], rt_keep_masks[i][1]);
@@ -3506,8 +3505,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
rt_base_dwords_scaled;
system_constants_.edram_rt_base_dwords_scaled[i] =
rt_base_dwords_scaled;
uint32_t format_flags = DxbcShaderTranslator::ROV_AddColorFormatFlags(
color_info.color_format);
uint32_t format_flags =
RenderTargetCache::AddPSIColorFormatFlags(color_info.color_format);
dirty |= system_constants_.edram_rt_format_flags[i] != format_flags;
system_constants_.edram_rt_format_flags[i] = format_flags;
// Can't do float comparisons here because NaNs would result in always


@@ -267,19 +267,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
};
static_assert(kSysFlag_Count <= 32, "Too many flags in the system constants");
// Appended to the format in the format constant.
enum : uint32_t {
// Starting from bit 4 because the format itself needs 4 bits.
kRTFormatFlag_64bpp_Shift = 4,
// Requires clamping of blending sources and factors.
kRTFormatFlag_FixedPointColor_Shift,
kRTFormatFlag_FixedPointAlpha_Shift,
kRTFormatFlag_64bpp = 1u << kRTFormatFlag_64bpp_Shift,
kRTFormatFlag_FixedPointColor = 1u << kRTFormatFlag_FixedPointColor_Shift,
kRTFormatFlag_FixedPointAlpha = 1u << kRTFormatFlag_FixedPointAlpha_Shift,
};
// IF SYSTEM CONSTANTS ARE CHANGED OR ADDED, THE FOLLOWING MUST BE UPDATED:
// - SystemConstants::Index enum.
// - system_constant_rdef_.
@@ -383,7 +370,8 @@ class DxbcShaderTranslator : public ShaderTranslator {
uint32_t edram_rt_base_dwords_scaled[4];
// RT format combined with kRTFormatFlags.
// RT format combined with RenderTargetCache::kPSIColorFormatFlag values
// (pass via RenderTargetCache::AddPSIColorFormatFlags).
uint32_t edram_rt_format_flags[4];
// Format info - values to clamp the color to before blending or storing.
@@ -524,40 +512,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
kEdram,
};
// Returns the format with internal flags for passing via the
// edram_rt_format_flags system constant.
static constexpr uint32_t ROV_AddColorFormatFlags(
xenos::ColorRenderTargetFormat format) {
uint32_t format_flags = uint32_t(format);
if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16 ||
format == xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT ||
format == xenos::ColorRenderTargetFormat::k_32_32_FLOAT) {
format_flags |= kRTFormatFlag_64bpp;
}
if (format == xenos::ColorRenderTargetFormat::k_8_8_8_8 ||
format == xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA ||
format == xenos::ColorRenderTargetFormat::k_2_10_10_10 ||
format == xenos::ColorRenderTargetFormat::k_16_16 ||
format == xenos::ColorRenderTargetFormat::k_16_16_16_16 ||
format == xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10) {
format_flags |=
kRTFormatFlag_FixedPointColor | kRTFormatFlag_FixedPointAlpha;
} else if (format == xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT ||
format == xenos::ColorRenderTargetFormat::
k_2_10_10_10_FLOAT_AS_16_16_16_16) {
format_flags |= kRTFormatFlag_FixedPointAlpha;
}
return format_flags;
}
// Returns the bits that need to be added to the RT flags constant - needs to
// be done externally, not in SetColorFormatConstants, because the flags
// contain other state.
static void ROV_GetColorFormatSystemConstants(
xenos::ColorRenderTargetFormat format, uint32_t write_mask,
float& clamp_rgb_low, float& clamp_alpha_low, float& clamp_rgb_high,
float& clamp_alpha_high, uint32_t& keep_mask_low,
uint32_t& keep_mask_high);
uint64_t GetDefaultVertexShaderModification(
uint32_t dynamic_addressable_register_count,
Shader::HostVertexShaderType host_vertex_shader_type =
@@ -772,6 +726,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
// Whether it's possible and worth skipping running the translated shader for
// 2x2 quads.
bool ROV_IsDepthStencilEarly() const {
assert_true(edram_rov_used_);
return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() &&
!current_shader().is_valid_memexport_used();
}


@@ -14,139 +14,13 @@
#include "xenia/base/assert.h"
#include "xenia/base/math.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/render_target_cache.h"
#include "xenia/gpu/texture_cache.h"
namespace xe {
namespace gpu {
using namespace ucode;
void DxbcShaderTranslator::ROV_GetColorFormatSystemConstants(
xenos::ColorRenderTargetFormat format, uint32_t write_mask,
float& clamp_rgb_low, float& clamp_alpha_low, float& clamp_rgb_high,
float& clamp_alpha_high, uint32_t& keep_mask_low,
uint32_t& keep_mask_high) {
keep_mask_low = keep_mask_high = 0;
switch (format) {
case xenos::ColorRenderTargetFormat::k_8_8_8_8:
case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: {
clamp_rgb_low = clamp_alpha_low = 0.0f;
clamp_rgb_high = clamp_alpha_high = 1.0f;
for (uint32_t i = 0; i < 4; ++i) {
if (!(write_mask & (1 << i))) {
keep_mask_low |= uint32_t(0xFF) << (i * 8);
}
}
} break;
case xenos::ColorRenderTargetFormat::k_2_10_10_10:
case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: {
clamp_rgb_low = clamp_alpha_low = 0.0f;
clamp_rgb_high = clamp_alpha_high = 1.0f;
for (uint32_t i = 0; i < 3; ++i) {
if (!(write_mask & (1 << i))) {
keep_mask_low |= uint32_t(0x3FF) << (i * 10);
}
}
if (!(write_mask & 0b1000)) {
keep_mask_low |= uint32_t(3) << 30;
}
} break;
case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: {
clamp_rgb_low = clamp_alpha_low = 0.0f;
clamp_rgb_high = 31.875f;
clamp_alpha_high = 1.0f;
for (uint32_t i = 0; i < 3; ++i) {
if (!(write_mask & (1 << i))) {
keep_mask_low |= uint32_t(0x3FF) << (i * 10);
}
}
if (!(write_mask & 0b1000)) {
keep_mask_low |= uint32_t(3) << 30;
}
} break;
case xenos::ColorRenderTargetFormat::k_16_16:
case xenos::ColorRenderTargetFormat::k_16_16_16_16:
// Alpha clamping affects blending source, so it's non-zero for alpha for
// k_16_16 (the render target is fixed-point). There's one deviation from
// how Direct3D 11.3 functional specification defines SNorm conversion
// (NaN should be 0, not the lowest negative number), but NaN handling in
// output shouldn't be very important.
clamp_rgb_low = clamp_alpha_low = -32.0f;
clamp_rgb_high = clamp_alpha_high = 32.0f;
if (!(write_mask & 0b0001)) {
keep_mask_low |= 0xFFFFu;
}
if (!(write_mask & 0b0010)) {
keep_mask_low |= 0xFFFF0000u;
}
if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16) {
if (!(write_mask & 0b0100)) {
keep_mask_high |= 0xFFFFu;
}
if (!(write_mask & 0b1000)) {
keep_mask_high |= 0xFFFF0000u;
}
} else {
write_mask &= 0b0011;
}
break;
case xenos::ColorRenderTargetFormat::k_16_16_FLOAT:
case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
// No NaNs on the Xbox 360 GPU, though can't use the extended range with
// f32tof16.
clamp_rgb_low = clamp_alpha_low = -65504.0f;
clamp_rgb_high = clamp_alpha_high = 65504.0f;
if (!(write_mask & 0b0001)) {
keep_mask_low |= 0xFFFFu;
}
if (!(write_mask & 0b0010)) {
keep_mask_low |= 0xFFFF0000u;
}
if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT) {
if (!(write_mask & 0b0100)) {
keep_mask_high |= 0xFFFFu;
}
if (!(write_mask & 0b1000)) {
keep_mask_high |= 0xFFFF0000u;
}
} else {
write_mask &= 0b0011;
}
break;
case xenos::ColorRenderTargetFormat::k_32_FLOAT:
// No clamping - let min/max always pick the original value.
clamp_rgb_low = clamp_alpha_low = clamp_rgb_high = clamp_alpha_high =
std::nanf("");
write_mask &= 0b0001;
if (!(write_mask & 0b0001)) {
keep_mask_low = ~uint32_t(0);
}
break;
case xenos::ColorRenderTargetFormat::k_32_32_FLOAT:
// No clamping - let min/max always pick the original value.
clamp_rgb_low = clamp_alpha_low = clamp_rgb_high = clamp_alpha_high =
std::nanf("");
write_mask &= 0b0011;
if (!(write_mask & 0b0001)) {
keep_mask_low = ~uint32_t(0);
}
if (!(write_mask & 0b0010)) {
keep_mask_high = ~uint32_t(0);
}
break;
default:
assert_unhandled_case(format);
// Disable invalid render targets.
write_mask = 0;
break;
}
// Special case handled in the shaders for empty write mask to completely skip
// a disabled render target: all keep bits are set.
if (!write_mask) {
keep_mask_low = keep_mask_high = ~uint32_t(0);
}
}
void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
bool any_color_targets_written = current_shader().writes_color_targets() != 0;
@@ -484,8 +358,8 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
{
// Copy the 4x AA coverage to system_temp_rov_params_.x, making top-right
// the sample [2] and bottom-left the sample [1] (the opposite of Direct3D
// 12), because on the Xbox 360, 2x MSAA doubles the storage width, 4x MSAA
// doubles the storage height.
// 12), because on the Xbox 360, 2x MSAA doubles the storage height, 4x MSAA
// doubles the storage width.
// Flip samples in bits 0:1 to bits 29:30.
a_.OpBFRev(dxbc::Dest::R(system_temp_rov_params_, 0b0001),
dxbc::Src::VCoverage());
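As a scalar illustration of the net effect (a hypothetical equivalent; the translator actually works on the bit-reversed word as the comment above describes):

#include <cstdint>

// Swap bits 1 and 2 of the 4x MSAA coverage mask so that sample 1 means
// bottom-left and sample 2 means top-right, matching the Xbox 360 EDRAM
// layout instead of the Direct3D 12 sample order.
uint32_t RemapCoverageToXenos(uint32_t coverage) {
  uint32_t sample_1 = (coverage >> 1) & 1u;
  uint32_t sample_2 = (coverage >> 2) & 1u;
  return (coverage & 0b1001u) | (sample_2 << 1) | (sample_1 << 2);
}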
@@ -1304,7 +1178,7 @@ void DxbcShaderTranslator::ROV_UnpackColor(
// k_8_8_8_8_GAMMA
// ***************************************************************************
for (uint32_t i = 0; i < 2; ++i) {
a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags(
a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags(
i ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA
: xenos::ColorRenderTargetFormat::k_8_8_8_8)));
// Unpack the components.
@@ -1328,9 +1202,9 @@ void DxbcShaderTranslator::ROV_UnpackColor(
// k_2_10_10_10
// k_2_10_10_10_AS_10_10_10_10
// ***************************************************************************
a_.OpCase(dxbc::Src::LU(
ROV_AddColorFormatFlags(xenos::ColorRenderTargetFormat::k_2_10_10_10)));
a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags(
a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags(
xenos::ColorRenderTargetFormat::k_2_10_10_10)));
a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags(
xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10)));
{
// Unpack the components.
@@ -1350,9 +1224,9 @@ void DxbcShaderTranslator::ROV_UnpackColor(
// k_2_10_10_10_FLOAT_AS_16_16_16_16
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
// ***************************************************************************
a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags(
a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags(
xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT)));
a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags(
a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags(
xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16)));
{
// Unpack the alpha.
@@ -1381,7 +1255,7 @@ void DxbcShaderTranslator::ROV_UnpackColor(
// k_16_16_16_16 (64bpp)
// ***************************************************************************
for (uint32_t i = 0; i < 2; ++i) {
a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags(
a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags(
i ? xenos::ColorRenderTargetFormat::k_16_16_16_16
: xenos::ColorRenderTargetFormat::k_16_16)));
dxbc::Dest color_components_dest(
@@ -1404,7 +1278,7 @@ void DxbcShaderTranslator::ROV_UnpackColor(
// k_16_16_16_16_FLOAT (64bpp)
// ***************************************************************************
for (uint32_t i = 0; i < 2; ++i) {
a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags(
a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags(
i ? xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT
: xenos::ColorRenderTargetFormat::k_16_16_FLOAT)));
dxbc::Dest color_components_dest(
@@ -1465,7 +1339,7 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor(
// k_8_8_8_8_GAMMA
// ***************************************************************************
for (uint32_t i = 0; i < 2; ++i) {
a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags(
a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags(
i ? xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA
: xenos::ColorRenderTargetFormat::k_8_8_8_8)));
for (uint32_t j = 0; j < 4; ++j) {
@@ -1496,9 +1370,9 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor(
// k_2_10_10_10
// k_2_10_10_10_AS_10_10_10_10
// ***************************************************************************
a_.OpCase(dxbc::Src::LU(
ROV_AddColorFormatFlags(xenos::ColorRenderTargetFormat::k_2_10_10_10)));
a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags(
a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags(
xenos::ColorRenderTargetFormat::k_2_10_10_10)));
a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags(
xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10)));
for (uint32_t i = 0; i < 4; ++i) {
// Denormalize and convert to fixed-point.
@@ -1518,9 +1392,9 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor(
// k_2_10_10_10_FLOAT_AS_16_16_16_16
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
// ***************************************************************************
a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags(
a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags(
xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT)));
a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags(
a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags(
xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16)));
{
// Convert red directly to the destination, which may be the same as the
@@ -1550,7 +1424,7 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor(
// k_16_16_16_16 (64bpp)
// ***************************************************************************
for (uint32_t i = 0; i < 2; ++i) {
a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags(
a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags(
i ? xenos::ColorRenderTargetFormat::k_16_16_16_16
: xenos::ColorRenderTargetFormat::k_16_16)));
for (uint32_t j = 0; j < (uint32_t(2) << i); ++j) {
@@ -1582,7 +1456,7 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor(
// k_16_16_16_16_FLOAT (64bpp)
// ***************************************************************************
for (uint32_t i = 0; i < 2; ++i) {
a_.OpCase(dxbc::Src::LU(ROV_AddColorFormatFlags(
a_.OpCase(dxbc::Src::LU(RenderTargetCache::AddPSIColorFormatFlags(
i ? xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT
: xenos::ColorRenderTargetFormat::k_16_16_FLOAT)));
for (uint32_t j = 0; j < (uint32_t(2) << i); ++j) {
@@ -2230,7 +2104,8 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// Load whether the render target is 64bpp to system_temp_rov_params_.y to
// get the needed relative sample address.
a_.OpAnd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
rt_format_flags_src, dxbc::Src::LU(kRTFormatFlag_64bpp));
rt_format_flags_src,
dxbc::Src::LU(RenderTargetCache::kPSIColorFormatFlag_64bpp));
// Choose the relative sample address for the render target to
// system_temp_rov_params_.y.
a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
@@ -2287,7 +2162,8 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// Get if the blending source color is fixed-point for clamping if it is.
// temp.x = whether color is fixed-point.
a_.OpAnd(temp_x_dest, rt_format_flags_src,
dxbc::Src::LU(kRTFormatFlag_FixedPointColor));
dxbc::Src::LU(
RenderTargetCache::kPSIColorFormatFlag_FixedPointColor));
// Check if the blending source color is fixed-point and needs clamping.
// temp.x = free.
a_.OpIf(true, temp_x_src);
@@ -2306,7 +2182,8 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// Get if the blending source alpha is fixed-point for clamping if it is.
// temp.x = whether alpha is fixed-point.
a_.OpAnd(temp_x_dest, rt_format_flags_src,
dxbc::Src::LU(kRTFormatFlag_FixedPointAlpha));
dxbc::Src::LU(
RenderTargetCache::kPSIColorFormatFlag_FixedPointAlpha));
// Check if the blending source alpha is fixed-point and needs clamping.
// temp.x = free.
a_.OpIf(true, temp_x_src);
@@ -2387,7 +2264,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// Get if the format is 64bpp to temp.w.
// temp.w = whether the render target is 64bpp.
a_.OpAnd(temp_w_dest, rt_format_flags_src,
dxbc::Src::LU(kRTFormatFlag_64bpp));
dxbc::Src::LU(RenderTargetCache::kPSIColorFormatFlag_64bpp));
// Check if the format is 64bpp.
// temp.w = free.
a_.OpIf(true, temp_w_src);
@@ -2478,8 +2355,10 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// Get if the render target color is fixed-point and the source
// color factor needs clamping to temp.x.
// temp.x = whether color is fixed-point.
a_.OpAnd(temp_x_dest, rt_format_flags_src,
dxbc::Src::LU(kRTFormatFlag_FixedPointColor));
a_.OpAnd(
temp_x_dest, rt_format_flags_src,
dxbc::Src::LU(
RenderTargetCache::kPSIColorFormatFlag_FixedPointColor));
// Check if the source color factor needs clamping.
a_.OpIf(true, temp_x_src);
{
@@ -2558,8 +2437,10 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// Get if the render target color is fixed-point and the
// destination color factor needs clamping to temp.x.
// temp.x = whether color is fixed-point.
a_.OpAnd(temp_x_dest, rt_format_flags_src,
dxbc::Src::LU(kRTFormatFlag_FixedPointColor));
a_.OpAnd(
temp_x_dest, rt_format_flags_src,
dxbc::Src::LU(
RenderTargetCache::kPSIColorFormatFlag_FixedPointColor));
// Check if the destination color factor needs clamping.
a_.OpIf(true, temp_x_src);
{
@@ -2701,8 +2582,10 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// Get if the render target alpha is fixed-point and the source
// alpha factor needs clamping to temp.y.
// temp.y = whether alpha is fixed-point.
a_.OpAnd(temp_y_dest, rt_format_flags_src,
dxbc::Src::LU(kRTFormatFlag_FixedPointAlpha));
a_.OpAnd(
temp_y_dest, rt_format_flags_src,
dxbc::Src::LU(
RenderTargetCache::kPSIColorFormatFlag_FixedPointAlpha));
// Check if the source alpha factor needs clamping.
a_.OpIf(true, temp_y_src);
{
@@ -2769,9 +2652,11 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// destination alpha factor needs clamping.
// alpha_is_fixed_temp.x = whether alpha is fixed-point.
uint32_t alpha_is_fixed_temp = PushSystemTemp();
a_.OpAnd(dxbc::Dest::R(alpha_is_fixed_temp, 0b0001),
rt_format_flags_src,
dxbc::Src::LU(kRTFormatFlag_FixedPointAlpha));
a_.OpAnd(
dxbc::Dest::R(alpha_is_fixed_temp, 0b0001),
rt_format_flags_src,
dxbc::Src::LU(
RenderTargetCache::kPSIColorFormatFlag_FixedPointAlpha));
// Check if the destination alpha factor needs clamping.
a_.OpIf(true,
dxbc::Src::R(alpha_is_fixed_temp, dxbc::Src::kXXXX));
@@ -2925,7 +2810,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// Get if the format is 64bpp to temp.z.
// temp.z = whether the render target is 64bpp.
a_.OpAnd(temp_z_dest, rt_format_flags_src,
dxbc::Src::LU(kRTFormatFlag_64bpp));
dxbc::Src::LU(RenderTargetCache::kPSIColorFormatFlag_64bpp));
// Check if the format is 64bpp.
// temp.z = free.
a_.OpIf(true, temp_z_src);
@@ -2954,16 +2839,29 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// Close the sample covered check.
a_.OpEndIf();
// Go to the next sample (samples are at +0, +(80*scale_x), +1,
// +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1),
// +(80*scale_x) and -(80*scale_x+1) after each sample).
// Go to the next sample (samples are at +0, +(80*scale_x), +dwpp,
// +(80*scale_x+dwpp), so need to do +(80*scale_x), -(80*scale_x-dwpp),
// +(80*scale_x) and -(80*scale_x+dwpp) after each sample).
// Though there's no need to do this for the last sample, as the address
// will be recalculated for the next render target.
if (j < 3) {
a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
dxbc::Src::LI((j & 1) ? -int32_t(tile_width) + 2 - j
: int32_t(tile_width)));
if (j & 1) {
// temp.z = whether the render target is 64bpp.
a_.OpAnd(temp_z_dest, rt_format_flags_src,
dxbc::Src::LU(RenderTargetCache::kPSIColorFormatFlag_64bpp));
// temp.z = offset from the current sample to the next.
a_.OpMovC(temp_z_dest, temp_z_src,
dxbc::Src::LI(-int32_t(tile_width) + 2 * (2 - int32_t(j))),
dxbc::Src::LI(-int32_t(tile_width) + (2 - int32_t(j))));
// temp.z = free.
a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
temp_z_src);
} else {
a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
dxbc::Src::LU(tile_width));
}
}
}
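The stepping above condenses to one expression; this is a sketch under the assumptions in the comment (row = 80 * scale_x dwords, dwpp = 2 for 64bpp render targets and 1 otherwise, absolute sample offsets 0, row, dwpp, row + dwpp):

#include <cstdint>

// Delta added to the EDRAM dword address after processing sample j of a
// pixel (j = 0, 1 or 2 - no step is needed after the last sample).
int32_t NextSampleDelta(uint32_t j, int32_t row, int32_t dwpp) {
  // Even j: down one sample row. Odd j: up one row and right one sample.
  return (j & 1) ? dwpp - row : row;
}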
@@ -2987,6 +2885,17 @@ void DxbcShaderTranslator::CompletePixelShader() {
if (current_shader().writes_color_target(0) &&
!IsForceEarlyDepthStencilGlobalFlagEnabled()) {
if (edram_rov_used_) {
// Check if the render target 0 was written to on the execution path.
uint32_t rt_0_written_temp = PushSystemTemp();
a_.OpAnd(dxbc::Dest::R(rt_0_written_temp, 0b0001),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX),
dxbc::Src::LU(1 << 8));
a_.OpIf(true, dxbc::Src::R(rt_0_written_temp, dxbc::Src::kXXXX));
// Release rt_0_written_temp.
PopSystemTemp();
}
// Alpha test.
// X - mask, then masked result (SGPR for loading, VGPR for masking).
// Y - operation result (SGPR for mask operations, VGPR for alpha
@@ -3057,10 +2966,15 @@ void DxbcShaderTranslator::CompletePixelShader() {
a_.OpEndIf();
// Release alpha_test_temp.
PopSystemTemp();
}
// Discard samples with alpha to coverage.
CompletePixelShader_AlphaToMask();
// Discard samples with alpha to coverage.
CompletePixelShader_AlphaToMask();
if (edram_rov_used_) {
// Close the render target 0 written check.
a_.OpEndIf();
}
}
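In scalar form, the new gating amounts to the following (a hypothetical equivalent of the generated DXBC, with bit 8 of system_temp_rov_params_.x assumed to hold the accumulated render target 0 write flag, as in the check above):

#include <cstdint>

// With FSI/ROV, alpha test and alpha to coverage apply only when the shader
// dynamically wrote to render target 0 on the taken execution path.
bool ShouldApplyAlphaTestAndAlphaToCoverage(uint32_t rov_params_x) {
  return (rov_params_x & (uint32_t(1) << 8)) != 0;
}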
// Write the values to the render targets. Not applying the exponent bias yet
// because the original 0 to 1 alpha value is needed for alpha to coverage,


@@ -207,6 +207,134 @@ DEFINE_bool(
namespace xe {
namespace gpu {
void RenderTargetCache::GetPSIColorFormatInfo(
xenos::ColorRenderTargetFormat format, uint32_t write_mask,
float& clamp_rgb_low, float& clamp_alpha_low, float& clamp_rgb_high,
float& clamp_alpha_high, uint32_t& keep_mask_low,
uint32_t& keep_mask_high) {
keep_mask_low = keep_mask_high = 0;
switch (format) {
case xenos::ColorRenderTargetFormat::k_8_8_8_8:
case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: {
clamp_rgb_low = clamp_alpha_low = 0.0f;
clamp_rgb_high = clamp_alpha_high = 1.0f;
for (uint32_t i = 0; i < 4; ++i) {
if (!(write_mask & (1 << i))) {
keep_mask_low |= uint32_t(0xFF) << (i * 8);
}
}
} break;
case xenos::ColorRenderTargetFormat::k_2_10_10_10:
case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: {
clamp_rgb_low = clamp_alpha_low = 0.0f;
clamp_rgb_high = clamp_alpha_high = 1.0f;
for (uint32_t i = 0; i < 3; ++i) {
if (!(write_mask & (1 << i))) {
keep_mask_low |= uint32_t(0x3FF) << (i * 10);
}
}
if (!(write_mask & 0b1000)) {
keep_mask_low |= uint32_t(3) << 30;
}
} break;
case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: {
clamp_rgb_low = clamp_alpha_low = 0.0f;
clamp_rgb_high = 31.875f;
clamp_alpha_high = 1.0f;
for (uint32_t i = 0; i < 3; ++i) {
if (!(write_mask & (1 << i))) {
keep_mask_low |= uint32_t(0x3FF) << (i * 10);
}
}
if (!(write_mask & 0b1000)) {
keep_mask_low |= uint32_t(3) << 30;
}
} break;
case xenos::ColorRenderTargetFormat::k_16_16:
case xenos::ColorRenderTargetFormat::k_16_16_16_16:
// Alpha clamping affects blending source, so it's non-zero for alpha for
// k_16_16 (the render target is fixed-point). There's one deviation from
// how Direct3D 11.3 functional specification defines SNorm conversion
// (NaN should be 0, not the lowest negative number), and that needs to be
// handled separately.
clamp_rgb_low = clamp_alpha_low = -32.0f;
clamp_rgb_high = clamp_alpha_high = 32.0f;
if (!(write_mask & 0b0001)) {
keep_mask_low |= 0xFFFFu;
}
if (!(write_mask & 0b0010)) {
keep_mask_low |= 0xFFFF0000u;
}
if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16) {
if (!(write_mask & 0b0100)) {
keep_mask_high |= 0xFFFFu;
}
if (!(write_mask & 0b1000)) {
keep_mask_high |= 0xFFFF0000u;
}
} else {
write_mask &= 0b0011;
}
break;
case xenos::ColorRenderTargetFormat::k_16_16_FLOAT:
case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
// No NaNs on the Xbox 360 GPU, though can't use the extended range with
// Direct3D and Vulkan conversions.
// TODO(Triang3l): Use the extended-range encoding in all implementations.
clamp_rgb_low = clamp_alpha_low = -65504.0f;
clamp_rgb_high = clamp_alpha_high = 65504.0f;
if (!(write_mask & 0b0001)) {
keep_mask_low |= 0xFFFFu;
}
if (!(write_mask & 0b0010)) {
keep_mask_low |= 0xFFFF0000u;
}
if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT) {
if (!(write_mask & 0b0100)) {
keep_mask_high |= 0xFFFFu;
}
if (!(write_mask & 0b1000)) {
keep_mask_high |= 0xFFFF0000u;
}
} else {
write_mask &= 0b0011;
}
break;
case xenos::ColorRenderTargetFormat::k_32_FLOAT:
// No clamping - let min/max always pick the original value.
clamp_rgb_low = clamp_alpha_low = clamp_rgb_high = clamp_alpha_high =
std::nanf("");
write_mask &= 0b0001;
if (!(write_mask & 0b0001)) {
keep_mask_low = ~uint32_t(0);
}
break;
case xenos::ColorRenderTargetFormat::k_32_32_FLOAT:
// No clamping - let min/max always pick the original value.
clamp_rgb_low = clamp_alpha_low = clamp_rgb_high = clamp_alpha_high =
std::nanf("");
write_mask &= 0b0011;
if (!(write_mask & 0b0001)) {
keep_mask_low = ~uint32_t(0);
}
if (!(write_mask & 0b0010)) {
keep_mask_high = ~uint32_t(0);
}
break;
default:
assert_unhandled_case(format);
// Disable invalid render targets.
write_mask = 0;
break;
}
// Special case handled in the shaders for empty write mask to completely skip
// a disabled render target: all keep bits are set.
if (!write_mask) {
keep_mask_low = keep_mask_high = ~uint32_t(0);
}
}
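An illustrative call, not from the commit, using the declaration this change adds to render_target_cache.h: for a k_16_16 target with only red written, the green halfword must be preserved, and both color and alpha clamp to the [-32, 32] fixed-point range.

float clamp_rgb_low, clamp_alpha_low, clamp_rgb_high, clamp_alpha_high;
uint32_t keep_mask_low, keep_mask_high;
RenderTargetCache::GetPSIColorFormatInfo(
    xenos::ColorRenderTargetFormat::k_16_16, /* write_mask */ 0b0001,
    clamp_rgb_low, clamp_alpha_low, clamp_rgb_high, clamp_alpha_high,
    keep_mask_low, keep_mask_high);
// keep_mask_low is now 0xFFFF0000 (preserve green), keep_mask_high is 0.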
uint32_t RenderTargetCache::Transfer::GetRangeRectangles(
uint32_t start_tiles, uint32_t end_tiles, uint32_t base_tiles,
uint32_t pitch_tiles, xenos::MsaaSamples msaa_samples, bool is_64bpp,


@@ -113,6 +113,54 @@ class RenderTargetCache {
kSrgbToLinearExponent);
}
// Pixel shader interlock implementation helpers.
// Appended to the format in the format constant via bitwise OR.
enum : uint32_t {
kPSIColorFormatFlag_64bpp_Shift = xenos::kColorRenderTargetFormatBits,
// Requires clamping of blending sources and factors.
kPSIColorFormatFlag_FixedPointColor_Shift,
kPSIColorFormatFlag_FixedPointAlpha_Shift,
kPSIColorFormatFlag_64bpp = uint32_t(1) << kPSIColorFormatFlag_64bpp_Shift,
kPSIColorFormatFlag_FixedPointColor =
uint32_t(1) << kPSIColorFormatFlag_FixedPointColor_Shift,
kPSIColorFormatFlag_FixedPointAlpha =
uint32_t(1) << kPSIColorFormatFlag_FixedPointAlpha_Shift,
};
static constexpr uint32_t AddPSIColorFormatFlags(
xenos::ColorRenderTargetFormat format) {
uint32_t format_flags = uint32_t(format);
if (format == xenos::ColorRenderTargetFormat::k_16_16_16_16 ||
format == xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT ||
format == xenos::ColorRenderTargetFormat::k_32_32_FLOAT) {
format_flags |= kPSIColorFormatFlag_64bpp;
}
if (format == xenos::ColorRenderTargetFormat::k_8_8_8_8 ||
format == xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA ||
format == xenos::ColorRenderTargetFormat::k_2_10_10_10 ||
format == xenos::ColorRenderTargetFormat::k_16_16 ||
format == xenos::ColorRenderTargetFormat::k_16_16_16_16 ||
format == xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10) {
format_flags |= kPSIColorFormatFlag_FixedPointColor |
kPSIColorFormatFlag_FixedPointAlpha;
} else if (format == xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT ||
format == xenos::ColorRenderTargetFormat::
k_2_10_10_10_FLOAT_AS_16_16_16_16) {
format_flags |= kPSIColorFormatFlag_FixedPointAlpha;
}
return format_flags;
}
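The packed value keeps the format enum in the low xenos::kColorRenderTargetFormatBits bits with the flags ORed in above them, so shader code can switch on the whole constant and still test individual flags. A small usage sketch (illustrative only):

uint32_t packed = RenderTargetCache::AddPSIColorFormatFlags(
    xenos::ColorRenderTargetFormat::k_16_16_16_16);
// k_16_16_16_16 is 64bpp and fixed-point in both color and alpha, so all
// three flags are set alongside the format bits.
bool is_64bpp =
    (packed & RenderTargetCache::kPSIColorFormatFlag_64bpp) != 0;
bool needs_color_clamp =
    (packed & RenderTargetCache::kPSIColorFormatFlag_FixedPointColor) != 0;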
static void GetPSIColorFormatInfo(xenos::ColorRenderTargetFormat format,
uint32_t write_mask, float& clamp_rgb_low,
float& clamp_alpha_low,
float& clamp_rgb_high,
float& clamp_alpha_high,
uint32_t& keep_mask_low,
uint32_t& keep_mask_high);
virtual ~RenderTargetCache();
virtual Path GetPath() const = 0;


@@ -54,9 +54,11 @@ DEFINE_string(
"GPU");
DEFINE_bool(shader_output_bindless_resources, false,
"Output host shader with bindless resources used.", "GPU");
DEFINE_bool(shader_output_dxbc_rov, false,
"Output ROV-based output-merger code in DXBC pixel shaders.",
"GPU");
DEFINE_bool(
shader_output_pixel_shader_interlock, false,
"Output host shader with a render backend implementation based on pixel "
"shader interlock.",
"GPU");
namespace xe {
namespace gpu {
@@ -124,12 +126,15 @@ int shader_compiler_main(const std::vector<std::string>& args) {
SpirvShaderTranslator::Features spirv_features(true);
if (cvars::shader_output_type == "spirv" ||
cvars::shader_output_type == "spirvtext") {
translator = std::make_unique<SpirvShaderTranslator>(spirv_features);
translator = std::make_unique<SpirvShaderTranslator>(
spirv_features, true, true,
cvars::shader_output_pixel_shader_interlock);
} else if (cvars::shader_output_type == "dxbc" ||
cvars::shader_output_type == "dxbctext") {
translator = std::make_unique<DxbcShaderTranslator>(
ui::GraphicsProvider::GpuVendorID(0),
cvars::shader_output_bindless_resources, cvars::shader_output_dxbc_rov);
cvars::shader_output_bindless_resources,
cvars::shader_output_pixel_shader_interlock);
} else {
// Just output microcode disassembly generated during microcode information
// gathering.


@@ -21,6 +21,7 @@
#include "third_party/glslang/SPIRV/GLSL.std.450.h"
#include "xenia/base/assert.h"
#include "xenia/base/math.h"
#include "xenia/base/string_buffer.h"
#include "xenia/gpu/spirv_shader.h"
namespace xe {
@@ -31,6 +32,8 @@ SpirvShaderTranslator::Features::Features(bool all)
max_storage_buffer_range(all ? UINT32_MAX : (128 * 1024 * 1024)),
clip_distance(all),
cull_distance(all),
demote_to_helper_invocation(all),
fragment_shader_sample_interlock(all),
full_draw_index_uint32(all),
image_view_format_swizzle(all),
signed_zero_inf_nan_preserve_float32(all),
@@ -42,6 +45,14 @@ SpirvShaderTranslator::Features::Features(
provider.device_properties().limits.maxStorageBufferRange),
clip_distance(provider.device_features().shaderClipDistance),
cull_distance(provider.device_features().shaderCullDistance),
demote_to_helper_invocation(
provider.device_extensions().ext_shader_demote_to_helper_invocation &&
provider.device_shader_demote_to_helper_invocation_features()
.shaderDemoteToHelperInvocation),
fragment_shader_sample_interlock(
provider.device_extensions().ext_fragment_shader_interlock &&
provider.device_fragment_shader_interlock_features()
.fragmentShaderSampleInterlock),
full_draw_index_uint32(provider.device_features().fullDrawIndexUint32) {
uint32_t device_version = provider.device_properties().apiVersion;
const ui::vulkan::VulkanProvider::DeviceExtensions& device_extensions =
@@ -78,9 +89,6 @@ SpirvShaderTranslator::Features::Features(
}
}
SpirvShaderTranslator::SpirvShaderTranslator(const Features& features)
: features_(features) {}
uint64_t SpirvShaderTranslator::GetDefaultVertexShaderModification(
uint32_t dynamic_addressable_register_count,
Shader::HostVertexShaderType host_vertex_shader_type) const {
@@ -99,6 +107,19 @@ uint64_t SpirvShaderTranslator::GetDefaultPixelShaderModification(
return shader_modification.value;
}
std::vector<uint8_t> SpirvShaderTranslator::CreateDepthOnlyFragmentShader() {
is_depth_only_fragment_shader_ = true;
// TODO(Triang3l): Handle in a nicer way (is_depth_only_fragment_shader_ is a
// leftover from when a Shader object wasn't used during translation).
Shader shader(xenos::ShaderType::kPixel, 0, nullptr, 0);
StringBuffer instruction_disassembly_buffer;
shader.AnalyzeUcode(instruction_disassembly_buffer);
Shader::Translation& translation = *shader.GetOrCreateTranslation(0);
TranslateAnalyzedShader(translation);
is_depth_only_fragment_shader_ = false;
return translation.translated_binary();
}
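A hypothetical usage sketch, mirroring the four-argument constructor call in shader_compiler_main elsewhere in this commit (provider stands for an available ui::vulkan::VulkanProvider):

SpirvShaderTranslator translator(
    SpirvShaderTranslator::Features(provider), true, true,
    /* edram_fragment_shader_interlock */ true);
std::vector<uint8_t> depth_only_spirv =
    translator.CreateDepthOnlyFragmentShader();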
void SpirvShaderTranslator::Reset() {
ShaderTranslator::Reset();
@@ -109,6 +130,7 @@ void SpirvShaderTranslator::Reset() {
input_point_coordinates_ = spv::NoResult;
input_fragment_coordinates_ = spv::NoResult;
input_front_facing_ = spv::NoResult;
input_sample_mask_ = spv::NoResult;
std::fill(input_output_interpolators_.begin(),
input_output_interpolators_.end(), spv::NoResult);
output_point_coordinates_ = spv::NoResult;
@ -120,6 +142,8 @@ void SpirvShaderTranslator::Reset() {
main_interface_.clear();
var_main_registers_ = spv::NoResult;
var_main_point_size_edge_flag_kill_vertex_ = spv::NoResult;
var_main_kill_pixel_ = spv::NoResult;
var_main_fsi_color_written_ = spv::NoResult;
main_switch_op_.reset();
main_switch_next_pc_phi_operands_.clear();
@ -217,6 +241,10 @@ void SpirvShaderTranslator::StartTranslation() {
size_t offset;
spv::Id type;
};
spv::Id type_float4_array_4 = builder_->makeArrayType(
type_float4_, builder_->makeUintConstant(4), sizeof(float) * 4);
builder_->addDecoration(type_float4_array_4, spv::DecorationArrayStride,
sizeof(float) * 4);
spv::Id type_uint4_array_2 = builder_->makeArrayType(
type_uint4_, builder_->makeUintConstant(2), sizeof(uint32_t) * 4);
builder_->addDecoration(type_uint4_array_2, spv::DecorationArrayStride,
@@ -250,8 +278,37 @@ void SpirvShaderTranslator::StartTranslation() {
type_uint4_array_4},
{"alpha_test_reference", offsetof(SystemConstants, alpha_test_reference),
type_float_},
{"edram_32bpp_tile_pitch_dwords_scaled",
offsetof(SystemConstants, edram_32bpp_tile_pitch_dwords_scaled),
type_uint_},
{"edram_depth_base_dwords_scaled",
offsetof(SystemConstants, edram_depth_base_dwords_scaled), type_uint_},
{"color_exp_bias", offsetof(SystemConstants, color_exp_bias),
type_float4_},
{"edram_poly_offset_front_scale",
offsetof(SystemConstants, edram_poly_offset_front_scale), type_float_},
{"edram_poly_offset_back_scale",
offsetof(SystemConstants, edram_poly_offset_back_scale), type_float_},
{"edram_poly_offset_front_offset",
offsetof(SystemConstants, edram_poly_offset_front_offset), type_float_},
{"edram_poly_offset_back_offset",
offsetof(SystemConstants, edram_poly_offset_back_offset), type_float_},
{"edram_stencil_front", offsetof(SystemConstants, edram_stencil_front),
type_uint2_},
{"edram_stencil_back", offsetof(SystemConstants, edram_stencil_back),
type_uint2_},
{"edram_rt_base_dwords_scaled",
offsetof(SystemConstants, edram_rt_base_dwords_scaled), type_uint4_},
{"edram_rt_format_flags",
offsetof(SystemConstants, edram_rt_format_flags), type_uint4_},
{"edram_rt_blend_factors_ops",
offsetof(SystemConstants, edram_rt_blend_factors_ops), type_uint4_},
{"edram_rt_keep_mask", offsetof(SystemConstants, edram_rt_keep_mask),
type_uint4_array_2},
{"edram_rt_clamp", offsetof(SystemConstants, edram_rt_clamp),
type_float4_array_4},
{"edram_blend_constant", offsetof(SystemConstants, edram_blend_constant),
type_float4_},
};
id_vector_temp_.clear();
id_vector_temp_.reserve(xe::countof(system_constants));
@@ -281,139 +338,145 @@ void SpirvShaderTranslator::StartTranslation() {
main_interface_.push_back(uniform_system_constants_);
}
// Common uniform buffer - float constants.
uint32_t float_constant_count =
current_shader().constant_register_map().float_count;
if (float_constant_count) {
if (!is_depth_only_fragment_shader_) {
// Common uniform buffer - float constants.
uint32_t float_constant_count =
current_shader().constant_register_map().float_count;
if (float_constant_count) {
id_vector_temp_.clear();
id_vector_temp_.push_back(builder_->makeArrayType(
type_float4_, builder_->makeUintConstant(float_constant_count),
sizeof(float) * 4));
// Currently (as of October 24, 2020) makeArrayType only uses the stride
// to check if deduplication can be done - the array stride decoration
// needs to be applied explicitly.
builder_->addDecoration(id_vector_temp_.back(),
spv::DecorationArrayStride, sizeof(float) * 4);
spv::Id type_float_constants =
builder_->makeStructType(id_vector_temp_, "XeFloatConstants");
builder_->addMemberName(type_float_constants, 0, "float_constants");
builder_->addMemberDecoration(type_float_constants, 0,
spv::DecorationOffset, 0);
builder_->addDecoration(type_float_constants, spv::DecorationBlock);
uniform_float_constants_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassUniform, type_float_constants,
"xe_uniform_float_constants");
builder_->addDecoration(uniform_float_constants_,
spv::DecorationDescriptorSet,
int(kDescriptorSetConstants));
builder_->addDecoration(
uniform_float_constants_, spv::DecorationBinding,
int(is_pixel_shader() ? kConstantBufferFloatPixel
: kConstantBufferFloatVertex));
if (features_.spirv_version >= spv::Spv_1_4) {
main_interface_.push_back(uniform_float_constants_);
}
}
// Common uniform buffer - bool and loop constants.
// Uniform buffers must have std140 packing, so using arrays of 4-component
// vectors instead of scalar arrays because the latter would have padding to
// 16 bytes in each element.
id_vector_temp_.clear();
id_vector_temp_.reserve(2);
// 256 bool constants.
id_vector_temp_.push_back(builder_->makeArrayType(
type_float4_, builder_->makeUintConstant(float_constant_count),
sizeof(float) * 4));
// Currently (as of October 24, 2020) makeArrayType only uses the stride to
// check if deduplication can be done - the array stride decoration needs to
// be applied explicitly.
type_uint4_, builder_->makeUintConstant(2), sizeof(uint32_t) * 4));
builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride,
sizeof(float) * 4);
spv::Id type_float_constants =
builder_->makeStructType(id_vector_temp_, "XeFloatConstants");
builder_->addMemberName(type_float_constants, 0, "float_constants");
builder_->addMemberDecoration(type_float_constants, 0,
sizeof(uint32_t) * 4);
// 32 loop constants.
id_vector_temp_.push_back(builder_->makeArrayType(
type_uint4_, builder_->makeUintConstant(8), sizeof(uint32_t) * 4));
builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride,
sizeof(uint32_t) * 4);
spv::Id type_bool_loop_constants =
builder_->makeStructType(id_vector_temp_, "XeBoolLoopConstants");
builder_->addMemberName(type_bool_loop_constants, 0, "bool_constants");
builder_->addMemberDecoration(type_bool_loop_constants, 0,
spv::DecorationOffset, 0);
builder_->addDecoration(type_float_constants, spv::DecorationBlock);
uniform_float_constants_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassUniform, type_float_constants,
"xe_uniform_float_constants");
builder_->addDecoration(uniform_float_constants_,
builder_->addMemberName(type_bool_loop_constants, 1, "loop_constants");
builder_->addMemberDecoration(type_bool_loop_constants, 1,
spv::DecorationOffset, sizeof(uint32_t) * 8);
builder_->addDecoration(type_bool_loop_constants, spv::DecorationBlock);
uniform_bool_loop_constants_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassUniform, type_bool_loop_constants,
"xe_uniform_bool_loop_constants");
builder_->addDecoration(uniform_bool_loop_constants_,
spv::DecorationDescriptorSet,
int(kDescriptorSetConstants));
builder_->addDecoration(
uniform_float_constants_, spv::DecorationBinding,
int(is_pixel_shader() ? kConstantBufferFloatPixel
: kConstantBufferFloatVertex));
builder_->addDecoration(uniform_bool_loop_constants_,
spv::DecorationBinding,
int(kConstantBufferBoolLoop));
if (features_.spirv_version >= spv::Spv_1_4) {
main_interface_.push_back(uniform_float_constants_);
main_interface_.push_back(uniform_bool_loop_constants_);
}
}
// Common uniform buffer - bool and loop constants.
// Uniform buffers must have std140 packing, so using arrays of 4-component
// vectors instead of scalar arrays because the latter would have padding to
// 16 bytes in each element.
id_vector_temp_.clear();
id_vector_temp_.reserve(2);
// 256 bool constants.
id_vector_temp_.push_back(builder_->makeArrayType(
type_uint4_, builder_->makeUintConstant(2), sizeof(uint32_t) * 4));
builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride,
sizeof(uint32_t) * 4);
// 32 loop constants.
id_vector_temp_.push_back(builder_->makeArrayType(
type_uint4_, builder_->makeUintConstant(8), sizeof(uint32_t) * 4));
builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride,
sizeof(uint32_t) * 4);
spv::Id type_bool_loop_constants =
builder_->makeStructType(id_vector_temp_, "XeBoolLoopConstants");
builder_->addMemberName(type_bool_loop_constants, 0, "bool_constants");
builder_->addMemberDecoration(type_bool_loop_constants, 0,
spv::DecorationOffset, 0);
builder_->addMemberName(type_bool_loop_constants, 1, "loop_constants");
builder_->addMemberDecoration(type_bool_loop_constants, 1,
spv::DecorationOffset, sizeof(uint32_t) * 8);
builder_->addDecoration(type_bool_loop_constants, spv::DecorationBlock);
uniform_bool_loop_constants_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassUniform, type_bool_loop_constants,
"xe_uniform_bool_loop_constants");
builder_->addDecoration(uniform_bool_loop_constants_,
spv::DecorationDescriptorSet,
int(kDescriptorSetConstants));
builder_->addDecoration(uniform_bool_loop_constants_, spv::DecorationBinding,
int(kConstantBufferBoolLoop));
if (features_.spirv_version >= spv::Spv_1_4) {
main_interface_.push_back(uniform_bool_loop_constants_);
}
// Common uniform buffer - fetch constants (32 x 6 uints packed in std140 as
// 4-component vectors).
id_vector_temp_.clear();
id_vector_temp_.push_back(builder_->makeArrayType(
type_uint4_, builder_->makeUintConstant(32 * 6 / 4),
sizeof(uint32_t) * 4));
builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride,
sizeof(uint32_t) * 4);
spv::Id type_fetch_constants =
builder_->makeStructType(id_vector_temp_, "XeFetchConstants");
builder_->addMemberName(type_fetch_constants, 0, "fetch_constants");
builder_->addMemberDecoration(type_fetch_constants, 0,
spv::DecorationOffset, 0);
builder_->addDecoration(type_fetch_constants, spv::DecorationBlock);
uniform_fetch_constants_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassUniform, type_fetch_constants,
"xe_uniform_fetch_constants");
builder_->addDecoration(uniform_fetch_constants_,
spv::DecorationDescriptorSet,
int(kDescriptorSetConstants));
builder_->addDecoration(uniform_fetch_constants_, spv::DecorationBinding,
int(kConstantBufferFetch));
if (features_.spirv_version >= spv::Spv_1_4) {
main_interface_.push_back(uniform_fetch_constants_);
}
// Common uniform buffer - fetch constants (32 x 6 uints packed in std140 as
// 4-component vectors).
id_vector_temp_.clear();
id_vector_temp_.push_back(builder_->makeArrayType(
type_uint4_, builder_->makeUintConstant(32 * 6 / 4),
sizeof(uint32_t) * 4));
builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride,
sizeof(uint32_t) * 4);
spv::Id type_fetch_constants =
builder_->makeStructType(id_vector_temp_, "XeFetchConstants");
builder_->addMemberName(type_fetch_constants, 0, "fetch_constants");
builder_->addMemberDecoration(type_fetch_constants, 0, spv::DecorationOffset,
0);
builder_->addDecoration(type_fetch_constants, spv::DecorationBlock);
uniform_fetch_constants_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassUniform, type_fetch_constants,
"xe_uniform_fetch_constants");
builder_->addDecoration(uniform_fetch_constants_,
spv::DecorationDescriptorSet,
int(kDescriptorSetConstants));
builder_->addDecoration(uniform_fetch_constants_, spv::DecorationBinding,
int(kConstantBufferFetch));
if (features_.spirv_version >= spv::Spv_1_4) {
main_interface_.push_back(uniform_fetch_constants_);
}
// Common storage buffers - shared memory uint[], each 128 MB or larger,
// depending on what's possible on the device.
id_vector_temp_.clear();
id_vector_temp_.push_back(builder_->makeRuntimeArray(type_uint_));
// Storage buffers have std430 packing, no padding to 4-component vectors.
builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride,
sizeof(uint32_t));
spv::Id type_shared_memory =
builder_->makeStructType(id_vector_temp_, "XeSharedMemory");
builder_->addMemberName(type_shared_memory, 0, "shared_memory");
// TODO(Triang3l): Make writable when memexport is implemented.
builder_->addMemberDecoration(type_shared_memory, 0,
spv::DecorationNonWritable);
builder_->addMemberDecoration(type_shared_memory, 0, spv::DecorationOffset,
0);
builder_->addDecoration(type_shared_memory,
features_.spirv_version >= spv::Spv_1_3
? spv::DecorationBlock
: spv::DecorationBufferBlock);
unsigned int shared_memory_binding_count =
1 << GetSharedMemoryStorageBufferCountLog2();
if (shared_memory_binding_count > 1) {
type_shared_memory = builder_->makeArrayType(
type_shared_memory,
builder_->makeUintConstant(shared_memory_binding_count), 0);
}
buffers_shared_memory_ = builder_->createVariable(
spv::NoPrecision,
features_.spirv_version >= spv::Spv_1_3 ? spv::StorageClassStorageBuffer
: spv::StorageClassUniform,
type_shared_memory, "xe_shared_memory");
builder_->addDecoration(buffers_shared_memory_, spv::DecorationDescriptorSet,
int(kDescriptorSetSharedMemoryAndEdram));
builder_->addDecoration(buffers_shared_memory_, spv::DecorationBinding, 0);
if (features_.spirv_version >= spv::Spv_1_4) {
main_interface_.push_back(buffers_shared_memory_);
// Common storage buffers - shared memory uint[], each 128 MB or larger,
// depending on what's possible on the device.
id_vector_temp_.clear();
id_vector_temp_.push_back(builder_->makeRuntimeArray(type_uint_));
// Storage buffers have std430 packing, no padding to 4-component vectors.
builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride,
sizeof(uint32_t));
spv::Id type_shared_memory =
builder_->makeStructType(id_vector_temp_, "XeSharedMemory");
builder_->addMemberName(type_shared_memory, 0, "shared_memory");
builder_->addMemberDecoration(type_shared_memory, 0,
spv::DecorationRestrict);
// TODO(Triang3l): Make writable when memexport is implemented.
builder_->addMemberDecoration(type_shared_memory, 0,
spv::DecorationNonWritable);
builder_->addMemberDecoration(type_shared_memory, 0, spv::DecorationOffset,
0);
builder_->addDecoration(type_shared_memory,
features_.spirv_version >= spv::Spv_1_3
? spv::DecorationBlock
: spv::DecorationBufferBlock);
unsigned int shared_memory_binding_count =
1 << GetSharedMemoryStorageBufferCountLog2();
if (shared_memory_binding_count > 1) {
type_shared_memory = builder_->makeArrayType(
type_shared_memory,
builder_->makeUintConstant(shared_memory_binding_count), 0);
}
buffers_shared_memory_ = builder_->createVariable(
spv::NoPrecision,
features_.spirv_version >= spv::Spv_1_3 ? spv::StorageClassStorageBuffer
: spv::StorageClassUniform,
type_shared_memory, "xe_shared_memory");
builder_->addDecoration(buffers_shared_memory_,
spv::DecorationDescriptorSet,
int(kDescriptorSetSharedMemoryAndEdram));
builder_->addDecoration(buffers_shared_memory_, spv::DecorationBinding, 0);
if (features_.spirv_version >= spv::Spv_1_4) {
main_interface_.push_back(buffers_shared_memory_);
}
}
if (is_vertex_shader()) {
@@ -438,41 +501,43 @@ void SpirvShaderTranslator::StartTranslation() {
uniform_system_constants_, id_vector_temp_),
spv::NoPrecision);
// Begin ucode translation. Initialize everything, even without defined
// defaults, for safety.
var_main_predicate_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_bool_,
"xe_var_predicate", builder_->makeBoolConstant(false));
var_main_loop_count_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_uint4_,
"xe_var_loop_count", const_uint4_0_);
var_main_address_register_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_int_,
"xe_var_address_register", const_int_0_);
var_main_loop_address_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_int4_,
"xe_var_loop_address", const_int4_0_);
var_main_previous_scalar_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_float_,
"xe_var_previous_scalar", const_float_0_);
var_main_vfetch_address_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_int_,
"xe_var_vfetch_address", const_int_0_);
var_main_tfetch_lod_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_float_,
"xe_var_tfetch_lod", const_float_0_);
var_main_tfetch_gradients_h_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_float3_,
"xe_var_tfetch_gradients_h", const_float3_0_);
var_main_tfetch_gradients_v_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_float3_,
"xe_var_tfetch_gradients_v", const_float3_0_);
if (register_count()) {
spv::Id type_register_array = builder_->makeArrayType(
type_float4_, builder_->makeUintConstant(register_count()), 0);
var_main_registers_ =
builder_->createVariable(spv::NoPrecision, spv::StorageClassFunction,
type_register_array, "xe_var_registers");
if (!is_depth_only_fragment_shader_) {
// Begin ucode translation. Initialize everything, even without defined
// defaults, for safety.
var_main_predicate_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_bool_,
"xe_var_predicate", builder_->makeBoolConstant(false));
var_main_loop_count_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_uint4_,
"xe_var_loop_count", const_uint4_0_);
var_main_address_register_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_int_,
"xe_var_address_register", const_int_0_);
var_main_loop_address_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_int4_,
"xe_var_loop_address", const_int4_0_);
var_main_previous_scalar_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_float_,
"xe_var_previous_scalar", const_float_0_);
var_main_vfetch_address_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_int_,
"xe_var_vfetch_address", const_int_0_);
var_main_tfetch_lod_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_float_,
"xe_var_tfetch_lod", const_float_0_);
var_main_tfetch_gradients_h_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_float3_,
"xe_var_tfetch_gradients_h", const_float3_0_);
var_main_tfetch_gradients_v_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_float3_,
"xe_var_tfetch_gradients_v", const_float3_0_);
if (register_count()) {
spv::Id type_register_array = builder_->makeArrayType(
type_float4_, builder_->makeUintConstant(register_count()), 0);
var_main_registers_ =
builder_->createVariable(spv::NoPrecision, spv::StorageClassFunction,
type_register_array, "xe_var_registers");
}
}
// Write the execution model-specific prologue with access to variables in the
@@ -483,6 +548,10 @@ void SpirvShaderTranslator::StartTranslation() {
StartFragmentShaderInMain();
}
if (is_depth_only_fragment_shader_) {
return;
}
// Open the main loop.
spv::Block& main_loop_pre_header = *builder_->getBuildPoint();
main_loop_header_ = &builder_->makeNewBlock();
@@ -551,57 +620,62 @@ }
}
std::vector<uint8_t> SpirvShaderTranslator::CompleteTranslation() {
// Close flow control within the last switch case.
CloseExecConditionals();
bool has_main_switch = !current_shader().label_addresses().empty();
// After the final exec (if it happened to be not exece, which would already
// have a break branch), break from the switch if it exists, or from the
// loop if it doesn't.
if (!builder_->getBuildPoint()->isTerminated()) {
builder_->createBranch(has_main_switch ? main_switch_merge_
: main_loop_merge_);
}
if (has_main_switch) {
// Insert the switch instruction with all cases added as operands.
builder_->setBuildPoint(main_switch_header_);
builder_->getBuildPoint()->addInstruction(std::move(main_switch_op_));
// Build the main switch merge, breaking out of the loop after falling
// through the end or breaking from exece (only continuing if a jump - from
// a guest loop or from jmp/call - was made).
function_main_->addBlock(main_switch_merge_);
builder_->setBuildPoint(main_switch_merge_);
builder_->createBranch(main_loop_merge_);
}
// Main loop continuation - choose the program counter based on the path
// taken (-1 if not from a jump as a safe fallback, which would result in not
// hitting any switch case and reaching the final break in the body).
function_main_->addBlock(main_loop_continue_);
builder_->setBuildPoint(main_loop_continue_);
if (has_main_switch) {
// OpPhi, if added, must be the first in the block.
// If labels were added, but not jumps (for example, due to the call
// instruction not being implemented as of October 18, 2020), send an
// impossible program counter value (-1) to the OpPhi at the next iteration.
if (main_switch_next_pc_phi_operands_.empty()) {
main_switch_next_pc_phi_operands_.push_back(
builder_->makeIntConstant(-1));
if (!is_depth_only_fragment_shader_) {
// Close flow control within the last switch case.
CloseExecConditionals();
bool has_main_switch = !current_shader().label_addresses().empty();
// After the final exec (if it happened to be not exece, which would already
// have a break branch), break from the switch if it exists, or from the
// loop if it doesn't.
if (!builder_->getBuildPoint()->isTerminated()) {
builder_->createBranch(has_main_switch ? main_switch_merge_
: main_loop_merge_);
}
std::unique_ptr<spv::Instruction> main_loop_pc_next_op =
std::make_unique<spv::Instruction>(
main_loop_pc_next_, type_int_,
main_switch_next_pc_phi_operands_.size() >= 2 ? spv::OpPhi
: spv::OpCopyObject);
for (spv::Id operand : main_switch_next_pc_phi_operands_) {
main_loop_pc_next_op->addIdOperand(operand);
if (has_main_switch) {
// Insert the switch instruction with all cases added as operands.
builder_->setBuildPoint(main_switch_header_);
builder_->getBuildPoint()->addInstruction(std::move(main_switch_op_));
// Build the main switch merge, breaking out of the loop after falling
// through the end or breaking from exece (only continuing if a jump -
// from a guest loop or from jmp/call - was made).
function_main_->addBlock(main_switch_merge_);
builder_->setBuildPoint(main_switch_merge_);
builder_->createBranch(main_loop_merge_);
}
builder_->getBuildPoint()->addInstruction(std::move(main_loop_pc_next_op));
}
builder_->createBranch(main_loop_header_);
// Add the main loop merge block and go back to the function.
function_main_->addBlock(main_loop_merge_);
builder_->setBuildPoint(main_loop_merge_);
// Main loop continuation - choose the program counter based on the path
// taken (-1 if not from a jump as a safe fallback, which would result in
// not hitting any switch case and reaching the final break in the body).
function_main_->addBlock(main_loop_continue_);
builder_->setBuildPoint(main_loop_continue_);
if (has_main_switch) {
// OpPhi, if added, must be the first in the block.
// If labels were added, but not jumps (for example, due to the call
// instruction not being implemented as of October 18, 2020), send an
// impossible program counter value (-1) to the OpPhi at the next
// iteration.
if (main_switch_next_pc_phi_operands_.empty()) {
main_switch_next_pc_phi_operands_.push_back(
builder_->makeIntConstant(-1));
}
std::unique_ptr<spv::Instruction> main_loop_pc_next_op =
std::make_unique<spv::Instruction>(
main_loop_pc_next_, type_int_,
main_switch_next_pc_phi_operands_.size() >= 2
? spv::OpPhi
: spv::OpCopyObject);
for (spv::Id operand : main_switch_next_pc_phi_operands_) {
main_loop_pc_next_op->addIdOperand(operand);
}
builder_->getBuildPoint()->addInstruction(
std::move(main_loop_pc_next_op));
}
builder_->createBranch(main_loop_header_);
// Add the main loop merge block and go back to the function.
function_main_->addBlock(main_loop_merge_);
builder_->setBuildPoint(main_loop_merge_);
}
if (is_vertex_shader()) {
CompleteVertexOrTessEvalShaderInMain();
@ -622,6 +696,20 @@ std::vector<uint8_t> SpirvShaderTranslator::CompleteTranslation() {
builder_->addExecutionMode(function_main_,
spv::ExecutionModeEarlyFragmentTests);
}
if (edram_fragment_shader_interlock_) {
// Accessing per-sample values, so interlocking just when there's common
// coverage is enough if the device exposes that.
if (features_.fragment_shader_sample_interlock) {
builder_->addCapability(
spv::CapabilityFragmentShaderSampleInterlockEXT);
builder_->addExecutionMode(function_main_,
spv::ExecutionModeSampleInterlockOrderedEXT);
} else {
builder_->addCapability(spv::CapabilityFragmentShaderPixelInterlockEXT);
builder_->addExecutionMode(function_main_,
spv::ExecutionModePixelInterlockOrderedEXT);
}
}
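    // For context, a minimal sketch (not from this commit) of how a renderer
    // can fill features_.fragment_shader_sample_interlock before translation,
    // via the VK_EXT_fragment_shader_interlock feature query:
    //   VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT interlock_features = {
    //       VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADER_INTERLOCK_FEATURES_EXT};
    //   VkPhysicalDeviceFeatures2 features2 = {
    //       VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, &interlock_features};
    //   vkGetPhysicalDeviceFeatures2(physical_device, &features2);
    //   features.fragment_shader_sample_interlock =
    //       interlock_features.fragmentShaderSampleInterlock == VK_TRUE;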
} else {
assert_true(is_vertex_shader());
execution_model = IsSpirvTessEvalShader()
@ -649,14 +737,17 @@ std::vector<uint8_t> SpirvShaderTranslator::CompleteTranslation() {
entry_point->addIdOperand(interface_id);
}
if (!is_depth_only_fragment_shader_) {
// Specify the binding indices for samplers when the number of textures is
// known, as samplers are located after images in the texture descriptor
// set.
size_t texture_binding_count = texture_bindings_.size();
size_t sampler_binding_count = sampler_bindings_.size();
for (size_t i = 0; i < sampler_binding_count; ++i) {
builder_->addDecoration(sampler_bindings_[i].variable,
spv::DecorationBinding,
int(texture_binding_count + i));
}
}
// TODO(Triang3l): Avoid copy?
@ -1682,49 +1773,83 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() {
void SpirvShaderTranslator::StartFragmentShaderBeforeMain() {
Modification shader_modification = GetSpirvShaderModification();
  if (edram_fragment_shader_interlock_) {
    builder_->addExtension("SPV_EXT_fragment_shader_interlock");
    // EDRAM buffer uint[].
    id_vector_temp_.clear();
    id_vector_temp_.push_back(builder_->makeRuntimeArray(type_uint_));
    // Storage buffers have std430 packing, no padding to 4-component vectors.
    builder_->addDecoration(id_vector_temp_.back(), spv::DecorationArrayStride,
                            sizeof(uint32_t));
    spv::Id type_edram = builder_->makeStructType(id_vector_temp_, "XeEdram");
    builder_->addMemberName(type_edram, 0, "edram");
    builder_->addMemberDecoration(type_edram, 0, spv::DecorationCoherent);
    builder_->addMemberDecoration(type_edram, 0, spv::DecorationRestrict);
    builder_->addMemberDecoration(type_edram, 0, spv::DecorationOffset, 0);
    builder_->addDecoration(type_edram, features_.spirv_version >= spv::Spv_1_3
                                            ? spv::DecorationBlock
                                            : spv::DecorationBufferBlock);
    buffer_edram_ = builder_->createVariable(
        spv::NoPrecision,
        features_.spirv_version >= spv::Spv_1_3 ? spv::StorageClassStorageBuffer
                                                : spv::StorageClassUniform,
        type_edram, "xe_edram");
    builder_->addDecoration(buffer_edram_, spv::DecorationDescriptorSet,
                            int(kDescriptorSetSharedMemoryAndEdram));
    builder_->addDecoration(buffer_edram_, spv::DecorationBinding, 1);
    if (features_.spirv_version >= spv::Spv_1_4) {
      main_interface_.push_back(buffer_edram_);
    }
  }
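  // Roughly the GLSL equivalent of the xe_edram declaration above, for
  // reference only (the set index stands for the value of
  // kDescriptorSetSharedMemoryAndEdram):
  //   layout(std430, set = <shared memory and EDRAM set>, binding = 1)
  //       coherent restrict buffer XeEdram {
  //     uint edram[];
  //   } xe_edram;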
bool param_gen_needed = !is_depth_only_fragment_shader_ &&
GetPsParamGenInterpolator() != UINT32_MAX;
if (!is_depth_only_fragment_shader_) {
uint32_t input_location = 0;
// Interpolator inputs.
{
uint32_t interpolators_remaining = GetModificationInterpolatorMask();
uint32_t interpolator_index;
while (
xe::bit_scan_forward(interpolators_remaining, &interpolator_index)) {
interpolators_remaining &= ~(UINT32_C(1) << interpolator_index);
spv::Id interpolator = builder_->createVariable(
spv::NoPrecision, spv::StorageClassInput, type_float4_,
fmt::format("xe_in_interpolator_{}", interpolator_index).c_str());
input_output_interpolators_[interpolator_index] = interpolator;
builder_->addDecoration(interpolator, spv::DecorationLocation,
int(input_location));
if (shader_modification.pixel.interpolators_centroid &
(UINT32_C(1) << interpolator_index)) {
builder_->addDecoration(interpolator, spv::DecorationCentroid);
}
main_interface_.push_back(interpolator);
++input_location;
}
}
// Point coordinate input.
if (shader_modification.pixel.param_gen_point) {
if (param_gen_needed) {
input_point_coordinates_ =
builder_->createVariable(spv::NoPrecision, spv::StorageClassInput,
type_float2_, "xe_in_point_coordinates");
builder_->addDecoration(input_point_coordinates_,
spv::DecorationLocation, int(input_location));
main_interface_.push_back(input_point_coordinates_);
}
++input_location;
}
}
  // Fragment coordinates.
  // TODO(Triang3l): More conditions - alpha to coverage (if RT 0 is written,
  // and there's no early depth / stencil), depth writing in the fragment
  // shader (per-sample if supported).
  if (edram_fragment_shader_interlock_ || param_gen_needed) {
input_fragment_coordinates_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassInput, type_float4_, "gl_FragCoord");
builder_->addDecoration(input_fragment_coordinates_, spv::DecorationBuiltIn,
@ -1733,9 +1858,9 @@ void SpirvShaderTranslator::StartFragmentShaderBeforeMain() {
}
  // Is front facing.
  if (edram_fragment_shader_interlock_ ||
      (param_gen_needed &&
       !GetSpirvShaderModification().pixel.param_gen_point)) {
input_front_facing_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassInput, type_bool_, "gl_FrontFacing");
builder_->addDecoration(input_front_facing_, spv::DecorationBuiltIn,
@ -1743,33 +1868,165 @@ void SpirvShaderTranslator::StartFragmentShaderBeforeMain() {
main_interface_.push_back(input_front_facing_);
}
// Sample mask input.
if (edram_fragment_shader_interlock_) {
// SampleMask depends on SampleRateShading in some SPIR-V revisions.
builder_->addCapability(spv::CapabilitySampleRateShading);
input_sample_mask_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassInput,
builder_->makeArrayType(type_int_, builder_->makeUintConstant(1), 0),
"gl_SampleMaskIn");
builder_->addDecoration(input_sample_mask_, spv::DecorationFlat);
builder_->addDecoration(input_sample_mask_, spv::DecorationBuiltIn,
spv::BuiltInSampleMask);
main_interface_.push_back(input_sample_mask_);
}
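  // An assumed-usage sketch (not emitter code from this commit) of narrowing
  // the FSI coverage to the samples actually rasterized, using the input
  // declared above:
  //   spv::Id raster_mask = /* gl_SampleMaskIn[0] loaded and cast to uint */;
  //   main_fsi_sample_mask_ = builder_->createBinOp(
  //       spv::OpBitwiseAnd, type_uint_, main_fsi_sample_mask_, raster_mask);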
if (!is_depth_only_fragment_shader_) {
// Framebuffer color attachment outputs.
if (!edram_fragment_shader_interlock_) {
std::fill(output_or_var_fragment_data_.begin(),
output_or_var_fragment_data_.end(), spv::NoResult);
static const char* const kFragmentDataOutputNames[] = {
"xe_out_fragment_data_0",
"xe_out_fragment_data_1",
"xe_out_fragment_data_2",
"xe_out_fragment_data_3",
};
uint32_t color_targets_remaining =
current_shader().writes_color_targets();
uint32_t color_target_index;
while (
xe::bit_scan_forward(color_targets_remaining, &color_target_index)) {
color_targets_remaining &= ~(UINT32_C(1) << color_target_index);
spv::Id output_fragment_data_rt = builder_->createVariable(
spv::NoPrecision, spv::StorageClassOutput, type_float4_,
kFragmentDataOutputNames[color_target_index]);
output_or_var_fragment_data_[color_target_index] =
output_fragment_data_rt;
builder_->addDecoration(output_fragment_data_rt,
spv::DecorationLocation,
int(color_target_index));
// Make invariant as pixel shaders may be used for various precise
// computations.
builder_->addDecoration(output_fragment_data_rt,
spv::DecorationInvariant);
main_interface_.push_back(output_fragment_data_rt);
}
}
}
}
void SpirvShaderTranslator::StartFragmentShaderInMain() {
// Set up pixel killing from within the translated shader without affecting
// the control flow (unlike with OpKill), similarly to how pixel killing works
// on the Xenos, and also keeping a single critical section exit and return
// for safety across different Vulkan implementations with fragment shader
// interlock.
if (current_shader().kills_pixels()) {
if (features_.demote_to_helper_invocation) {
// TODO(Triang3l): Promoted to SPIR-V 1.6 - don't add the extension there.
builder_->addExtension("SPV_EXT_demote_to_helper_invocation");
builder_->addCapability(spv::CapabilityDemoteToHelperInvocationEXT);
} else {
var_main_kill_pixel_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_bool_,
"xe_var_kill_pixel", builder_->makeBoolConstant(false));
}
// For killing with fragment shader interlock when demotion is supported,
// using OpIsHelperInvocationEXT to avoid allocating a variable in addition
// to the execution mask GPUs naturally have.
}
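  // A sketch (assumed usage, not code from this commit) of how the deferred
  // xe_var_kill_pixel flag can be folded into the FSI coverage before the
  // EDRAM writes, main_fsi_sample_mask_ holding per-sample coverage:
  //   spv::Id killed =
  //       builder_->createLoad(var_main_kill_pixel_, spv::NoPrecision);
  //   main_fsi_sample_mask_ =
  //       builder_->createTriOp(spv::OpSelect, type_uint_, killed,
  //                             const_uint_0_, main_fsi_sample_mask_);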
if (edram_fragment_shader_interlock_) {
// Initialize color output variables with fragment shader interlock.
std::fill(output_or_var_fragment_data_.begin(),
output_or_var_fragment_data_.end(), spv::NoResult);
var_main_fsi_color_written_ = spv::NoResult;
uint32_t color_targets_written = current_shader().writes_color_targets();
if (color_targets_written) {
static const char* const kFragmentDataVariableNames[] = {
"xe_var_fragment_data_0",
"xe_var_fragment_data_1",
"xe_var_fragment_data_2",
"xe_var_fragment_data_3",
};
uint32_t color_targets_remaining = color_targets_written;
uint32_t color_target_index;
while (
xe::bit_scan_forward(color_targets_remaining, &color_target_index)) {
color_targets_remaining &= ~(UINT32_C(1) << color_target_index);
output_or_var_fragment_data_[color_target_index] =
builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_float4_,
kFragmentDataVariableNames[color_target_index],
const_float4_0_);
}
var_main_fsi_color_written_ = builder_->createVariable(
spv::NoPrecision, spv::StorageClassFunction, type_uint_,
"xe_var_fsi_color_written", const_uint_0_);
}
}
if (edram_fragment_shader_interlock_ && FSI_IsDepthStencilEarly()) {
spv::Id msaa_samples = LoadMsaaSamplesFromFlags();
FSI_LoadSampleMask(msaa_samples);
FSI_LoadEdramOffsets(msaa_samples);
builder_->createNoResultOp(spv::OpBeginInvocationInterlockEXT);
FSI_DepthStencilTest(msaa_samples, false);
if (!is_depth_only_fragment_shader_) {
// Skip the rest of the shader if the whole quad (due to derivatives) has
// failed the depth / stencil test, and there are no depth and stencil
// values to conditionally write after running the shader to check if
// samples don't additionally need to be discarded.
spv::Id quad_needs_execution = builder_->createBinOp(
spv::OpINotEqual, type_bool_, main_fsi_sample_mask_, const_uint_0_);
// TODO(Triang3l): Use GroupNonUniformQuad operations where supported.
// If none of the pixels in the quad passed the depth / stencil test, the
// value of (any samples covered ? 1.0f : 0.0f) for the current pixel will
// be 0.0f, and since it will be 0.0f in other pixels too, the derivatives
// will be zero as well.
builder_->addCapability(spv::CapabilityDerivativeControl);
// Query the horizontally adjacent pixel.
quad_needs_execution = builder_->createBinOp(
spv::OpLogicalOr, type_bool_, quad_needs_execution,
builder_->createBinOp(
spv::OpFOrdNotEqual, type_bool_,
builder_->createUnaryOp(
spv::OpDPdxFine, type_float_,
builder_->createTriOp(spv::OpSelect, type_float_,
quad_needs_execution, const_float_1_,
const_float_0_)),
const_float_0_));
// Query the vertically adjacent pair of pixels.
quad_needs_execution = builder_->createBinOp(
spv::OpLogicalOr, type_bool_, quad_needs_execution,
builder_->createBinOp(
spv::OpFOrdNotEqual, type_bool_,
builder_->createUnaryOp(
spv::OpDPdyCoarse, type_float_,
builder_->createTriOp(spv::OpSelect, type_float_,
quad_needs_execution, const_float_1_,
const_float_0_)),
const_float_0_));
spv::Block& main_fsi_early_depth_stencil_execute_quad =
builder_->makeNewBlock();
main_fsi_early_depth_stencil_execute_quad_merge_ =
&builder_->makeNewBlock();
SpirvCreateSelectionMerge(
main_fsi_early_depth_stencil_execute_quad_merge_->getId(),
spv::SelectionControlDontFlattenMask);
builder_->createConditionalBranch(
quad_needs_execution, &main_fsi_early_depth_stencil_execute_quad,
main_fsi_early_depth_stencil_execute_quad_merge_);
builder_->setBuildPoint(&main_fsi_early_depth_stencil_execute_quad);
}
}
if (is_depth_only_fragment_shader_) {
return;
}
uint32_t param_gen_interpolator = GetPsParamGenInterpolator();
// Zero general-purpose registers to prevent crashes when the game
@ -1928,11 +2185,13 @@ void SpirvShaderTranslator::StartFragmentShaderInMain() {
var_main_registers_, id_vector_temp_));
}
if (!edram_fragment_shader_interlock_) {
// Initialize the colors for safety.
for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) {
spv::Id output_fragment_data_rt = output_or_var_fragment_data_[i];
if (output_fragment_data_rt != spv::NoResult) {
builder_->createStore(const_float4_0_, output_fragment_data_rt);
}
}
}
}
@ -2299,11 +2558,18 @@ void SpirvShaderTranslator::StoreResult(const InstructionResult& result,
assert_true(is_pixel_shader());
assert_not_zero(used_write_mask);
assert_true(current_shader().writes_color_target(result.storage_index));
// May be spv::NoResult if the color output is explicitly removed due to
// an empty write mask without independent blending.
// TODO(Triang3l): Store the alpha of the first output in this case for
// alpha test and alpha to coverage.
target_pointer = output_or_var_fragment_data_[result.storage_index];
if (edram_fragment_shader_interlock_) {
assert_true(var_main_fsi_color_written_ != spv::NoResult);
builder_->createStore(
builder_->createBinOp(
spv::OpBitwiseOr, type_uint_,
builder_->createLoad(var_main_fsi_color_written_,
spv::NoPrecision),
builder_->makeUintConstant(uint32_t(1)
<< result.storage_index)),
var_main_fsi_color_written_);
}
} break;
default:
// TODO(Triang3l): All storage targets.

View File

@ -96,6 +96,9 @@ class SpirvShaderTranslator : public ShaderTranslator {
kSysFlag_WNotReciprocal_Shift,
kSysFlag_PrimitivePolygonal_Shift,
kSysFlag_PrimitiveLine_Shift,
kSysFlag_MsaaSamples_Shift,
kSysFlag_DepthFloat24_Shift =
kSysFlag_MsaaSamples_Shift + xenos::kMsaaSamplesBits,
kSysFlag_AlphaPassIfLess_Shift,
kSysFlag_AlphaPassIfEqual_Shift,
kSysFlag_AlphaPassIfGreater_Shift,
@ -104,6 +107,26 @@ class SpirvShaderTranslator : public ShaderTranslator {
kSysFlag_ConvertColor2ToGamma_Shift,
kSysFlag_ConvertColor3ToGamma_Shift,
kSysFlag_FSIDepthStencil_Shift,
kSysFlag_FSIDepthPassIfLess_Shift,
kSysFlag_FSIDepthPassIfEqual_Shift,
kSysFlag_FSIDepthPassIfGreater_Shift,
// 1 to write new depth to the depth buffer, 0 to keep the old one if the
// depth test passes.
kSysFlag_FSIDepthWrite_Shift,
kSysFlag_FSIStencilTest_Shift,
// If the depth / stencil test has failed, but resulted in a stencil value
// that is different than the one currently in the depth buffer, write it
// anyway and don't run the rest of the shader (to check if the sample may
// be discarded some way) - use when alpha test and alpha to coverage are
// disabled. Ignored by the shader if not applicable to it (like if it has
// kill instructions or writes the depth output).
// TODO(Triang3l): Investigate replacement with an alpha-to-mask flag,
// checking `(flags & (alpha test | alpha to mask)) == (always | disabled)`,
// taking into account the potential relation with occlusion queries (but
// should be safe at least temporarily).
kSysFlag_FSIDepthStencilEarlyWrite_Shift,
kSysFlag_Count,
// For HostVertexShaderType kVertex, if fullDrawIndexUint32 is not
@ -127,6 +150,7 @@ class SpirvShaderTranslator : public ShaderTranslator {
kSysFlag_WNotReciprocal = 1u << kSysFlag_WNotReciprocal_Shift,
kSysFlag_PrimitivePolygonal = 1u << kSysFlag_PrimitivePolygonal_Shift,
kSysFlag_PrimitiveLine = 1u << kSysFlag_PrimitiveLine_Shift,
kSysFlag_DepthFloat24 = 1u << kSysFlag_DepthFloat24_Shift,
kSysFlag_AlphaPassIfLess = 1u << kSysFlag_AlphaPassIfLess_Shift,
kSysFlag_AlphaPassIfEqual = 1u << kSysFlag_AlphaPassIfEqual_Shift,
kSysFlag_AlphaPassIfGreater = 1u << kSysFlag_AlphaPassIfGreater_Shift,
@ -134,6 +158,14 @@ class SpirvShaderTranslator : public ShaderTranslator {
kSysFlag_ConvertColor1ToGamma = 1u << kSysFlag_ConvertColor1ToGamma_Shift,
kSysFlag_ConvertColor2ToGamma = 1u << kSysFlag_ConvertColor2ToGamma_Shift,
kSysFlag_ConvertColor3ToGamma = 1u << kSysFlag_ConvertColor3ToGamma_Shift,
kSysFlag_FSIDepthStencil = 1u << kSysFlag_FSIDepthStencil_Shift,
kSysFlag_FSIDepthPassIfLess = 1u << kSysFlag_FSIDepthPassIfLess_Shift,
kSysFlag_FSIDepthPassIfEqual = 1u << kSysFlag_FSIDepthPassIfEqual_Shift,
kSysFlag_FSIDepthPassIfGreater = 1u << kSysFlag_FSIDepthPassIfGreater_Shift,
kSysFlag_FSIDepthWrite = 1u << kSysFlag_FSIDepthWrite_Shift,
kSysFlag_FSIStencilTest = 1u << kSysFlag_FSIStencilTest_Shift,
kSysFlag_FSIDepthStencilEarlyWrite =
1u << kSysFlag_FSIDepthStencilEarlyWrite_Shift,
};
static_assert(kSysFlag_Count <= 32, "Too many flags in the system constants");
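  // Usage sketch ('flags' being a system constant dword built from the values
  // above; variable names here are illustrative):
  //   uint32_t msaa_samples_log2 =
  //       (flags >> kSysFlag_MsaaSamples_Shift) &
  //       ((uint32_t(1) << xenos::kMsaaSamplesBits) - 1);
  //   bool fsi_stencil_test = (flags & kSysFlag_FSIStencilTest) != 0;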
@ -171,9 +203,55 @@ class SpirvShaderTranslator : public ShaderTranslator {
uint32_t texture_swizzles[16];
float alpha_test_reference;
float padding_alpha_test_reference[3];
uint32_t edram_32bpp_tile_pitch_dwords_scaled;
uint32_t edram_depth_base_dwords_scaled;
float padding_edram_depth_base_dwords_scaled;
float color_exp_bias[4];
float edram_poly_offset_front_scale;
float edram_poly_offset_back_scale;
float edram_poly_offset_front_offset;
float edram_poly_offset_back_offset;
union {
struct {
uint32_t edram_stencil_front_reference_masks;
uint32_t edram_stencil_front_func_ops;
uint32_t edram_stencil_back_reference_masks;
uint32_t edram_stencil_back_func_ops;
};
struct {
uint32_t edram_stencil_front[2];
uint32_t edram_stencil_back[2];
};
};
uint32_t edram_rt_base_dwords_scaled[4];
// RT format combined with RenderTargetCache::kPSIColorFormatFlag values
// (pass via RenderTargetCache::AddPSIColorFormatFlags).
uint32_t edram_rt_format_flags[4];
// Render target blending options - RB_BLENDCONTROL, with only the relevant
// options (factors and operations - AND 0x1FFF1FFF). If 0x00010001
// (1 * src + 0 * dst), blending is disabled for the render target.
uint32_t edram_rt_blend_factors_ops[4];
// Format info - mask to apply to the old packed RT data, and to apply as
// inverted to the new packed data, before storing (more or less the inverse
// of the write mask packed like render target channels). This can be used
// to bypass unpacking if blending is not used. If 0 and not blending,
// reading the old data from the EDRAM buffer is not required.
uint32_t edram_rt_keep_mask[4][2];
// Format info - values to clamp the color to before blending or storing.
// Low color, low alpha, high color, high alpha.
float edram_rt_clamp[4][4];
// The constant blend factor for the respective modes.
float edram_blend_constant[4];
};
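  // A host-side consumption sketch for two of the constants above (the helper
  // name is hypothetical, not part of this commit):
  //   bool RTNeedsOldEdramData(const SystemConstants& sc, uint32_t rt) {
  //     // 0x00010001 is 1 * src + 0 * dst - blending disabled for the RT.
  //     bool blending = sc.edram_rt_blend_factors_ops[rt] != 0x00010001;
  //     // A nonzero keep mask means some bits of the old data survive.
  //     bool keeps_bits = sc.edram_rt_keep_mask[rt][0] != 0 ||
  //                       sc.edram_rt_keep_mask[rt][1] != 0;
  //     return blending || keeps_bits;
  //   }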
enum ConstantBuffer : uint32_t {
@ -248,12 +326,22 @@ class SpirvShaderTranslator : public ShaderTranslator {
uint32_t max_storage_buffer_range;
bool clip_distance;
bool cull_distance;
bool demote_to_helper_invocation;
bool fragment_shader_sample_interlock;
bool full_draw_index_uint32;
bool image_view_format_swizzle;
bool signed_zero_inf_nan_preserve_float32;
bool denorm_flush_to_zero_float32;
};
SpirvShaderTranslator(const Features& features,
bool native_2x_msaa_with_attachments,
bool native_2x_msaa_no_attachments,
bool edram_fragment_shader_interlock)
: features_(features),
native_2x_msaa_with_attachments_(native_2x_msaa_with_attachments),
native_2x_msaa_no_attachments_(native_2x_msaa_no_attachments),
edram_fragment_shader_interlock_(edram_fragment_shader_interlock) {}
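  // Hypothetical construction for a device taking the fragment shader
  // interlock path (flag values illustrative):
  //   SpirvShaderTranslator translator(
  //       features, /*native_2x_msaa_with_attachments=*/false,
  //       /*native_2x_msaa_no_attachments=*/false,
  //       /*edram_fragment_shader_interlock=*/true);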
uint64_t GetDefaultVertexShaderModification(
uint32_t dynamic_addressable_register_count,
@ -277,6 +365,10 @@ class SpirvShaderTranslator : public ShaderTranslator {
features_.max_storage_buffer_range);
}
// Creates a special fragment shader without color outputs - this resets the
// state of the translator.
std::vector<uint8_t> CreateDepthOnlyFragmentShader();
// Common functions useful not only for the translator, but also for EDRAM
// emulation via conventional render targets.
@ -385,10 +477,10 @@ class SpirvShaderTranslator : public ShaderTranslator {
}
bool IsExecutionModeEarlyFragmentTests() const {
return is_pixel_shader() &&
GetSpirvShaderModification().pixel.depth_stencil_mode ==
Modification::DepthStencilMode::kEarlyHint &&
!edram_fragment_shader_interlock_ &&
current_shader().implicit_early_z_write_allowed();
}
@ -528,7 +620,72 @@ class SpirvShaderTranslator : public ShaderTranslator {
spv::Id image_unsigned, spv::Id image_signed,
spv::Id sampler, spv::Id is_all_signed);
spv::Id LoadMsaaSamplesFromFlags();
  // Whether it's possible, and worthwhile, to skip running the translated
  // shader for 2x2 quads that fail the early depth / stencil test.
bool FSI_IsDepthStencilEarly() const {
assert_true(edram_fragment_shader_interlock_);
return !is_depth_only_fragment_shader_ &&
!current_shader().writes_depth() &&
!current_shader().is_valid_memexport_used();
}
void FSI_LoadSampleMask(spv::Id msaa_samples);
void FSI_LoadEdramOffsets(spv::Id msaa_samples);
  // The address must be a signed int. The 64bpp flag for the render target,
  // if provided at all, must be a bool (if it's spv::NoResult, 32bpp is
  // assumed).
spv::Id FSI_AddSampleOffset(spv::Id sample_0_address, uint32_t sample_index,
spv::Id is_64bpp = spv::NoResult);
// Updates main_fsi_sample_mask_. Must be called outside non-uniform control
// flow because of taking derivatives of the fragment depth.
void FSI_DepthStencilTest(spv::Id msaa_samples,
                            bool sample_mask_potentially_narrowed_previously);
// Returns the first and the second 32 bits as two uints.
std::array<spv::Id, 2> FSI_ClampAndPackColor(spv::Id color_float4,
spv::Id format_with_flags);
std::array<spv::Id, 4> FSI_UnpackColor(std::array<spv::Id, 2> color_packed,
spv::Id format_with_flags);
// The bounds must have the same number of components as the color or alpha.
spv::Id FSI_FlushNaNClampAndInBlending(spv::Id color_or_alpha,
spv::Id is_fixed_point,
spv::Id min_value, spv::Id max_value);
spv::Id FSI_ApplyColorBlendFactor(spv::Id value, spv::Id is_fixed_point,
spv::Id clamp_min_value,
spv::Id clamp_max_value, spv::Id factor,
spv::Id source_color, spv::Id source_alpha,
spv::Id dest_color, spv::Id dest_alpha,
spv::Id constant_color,
spv::Id constant_alpha);
spv::Id FSI_ApplyAlphaBlendFactor(spv::Id value, spv::Id is_fixed_point,
spv::Id clamp_min_value,
spv::Id clamp_max_value, spv::Id factor,
spv::Id source_alpha, spv::Id dest_alpha,
spv::Id constant_alpha);
// If source_color_clamped, dest_color, constant_color_clamped are
// spv::NoResult, will blend the alpha. Otherwise, will blend the color.
// The result will be unclamped (color packing is supposed to clamp it).
spv::Id FSI_BlendColorOrAlphaWithUnclampedResult(
spv::Id is_fixed_point, spv::Id clamp_min_value, spv::Id clamp_max_value,
spv::Id source_color_clamped, spv::Id source_alpha_clamped,
spv::Id dest_color, spv::Id dest_alpha, spv::Id constant_color_clamped,
spv::Id constant_alpha_clamped, spv::Id equation, spv::Id source_factor,
spv::Id dest_factor);
Features features_;
bool native_2x_msaa_with_attachments_;
bool native_2x_msaa_no_attachments_;
// For safety with different drivers (even though fragment shader interlock in
// SPIR-V only has one control flow requirement - that both begin and end must
// be dynamically executed exactly once in this order), adhering to the more
// strict control flow limitations of OpenGL (GLSL) fragment shader interlock,
// that begin and end are called only on the outermost level of the control
// flow of the main function, and that there are no returns before either
// (there's a single return from the shader).
bool edram_fragment_shader_interlock_;
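  // The resulting shape of the generated fragment shader main(), as
  // pseudocode for reference:
  //   void main() {
  //     ...coverage and early depth / stencil setup...
  //     OpBeginInvocationInterlockEXT
  //     ...all EDRAM reads and writes...
  //     OpEndInvocationInterlockEXT
  //   }  // The single return, begin / end on the outermost level.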
// Is currently writing the empty depth-only pixel shader, such as for depth
// and stencil testing with fragment shader interlock.
bool is_depth_only_fragment_shader_ = false;
std::unique_ptr<spv::Builder> builder_;
@ -621,7 +778,23 @@ class SpirvShaderTranslator : public ShaderTranslator {
kSystemConstantTextureSwizzledSigns,
kSystemConstantTextureSwizzles,
kSystemConstantAlphaTestReference,
kSystemConstantEdram32bppTilePitchDwordsScaled,
kSystemConstantEdramDepthBaseDwordsScaled,
kSystemConstantColorExpBias,
kSystemConstantEdramPolyOffsetFrontScale,
kSystemConstantEdramPolyOffsetBackScale,
kSystemConstantEdramPolyOffsetFrontOffset,
kSystemConstantEdramPolyOffsetBackOffset,
kSystemConstantEdramStencilFront,
kSystemConstantEdramStencilBack,
kSystemConstantEdramRTBaseDwordsScaled,
kSystemConstantEdramRTFormatFlags,
kSystemConstantEdramRTBlendFactorsOps,
// Accessed as float4[2], not float2[4], due to std140 array stride
// alignment.
kSystemConstantEdramRTKeepMask,
kSystemConstantEdramRTClamp,
kSystemConstantEdramBlendConstant,
};
spv::Id uniform_system_constants_;
spv::Id uniform_float_constants_;
@ -629,6 +802,7 @@ class SpirvShaderTranslator : public ShaderTranslator {
spv::Id uniform_fetch_constants_;
spv::Id buffers_shared_memory_;
spv::Id buffer_edram_;
// Not using combined images and samplers because
// maxPerStageDescriptorSamplers is often lower than
@ -647,6 +821,8 @@ class SpirvShaderTranslator : public ShaderTranslator {
spv::Id input_fragment_coordinates_;
// PS, only when needed - bool.
spv::Id input_front_facing_;
// PS, only when needed - int[1].
spv::Id input_sample_mask_;
// VS output or PS input, only the ones that are needed (spv::NoResult for the
// unneeded interpolators), indexed by the guest interpolator index - float4.
@ -671,7 +847,10 @@ class SpirvShaderTranslator : public ShaderTranslator {
};
spv::Id output_per_vertex_;
std::array<spv::Id, xenos::kMaxColorRenderTargets> output_fragment_data_;
// With fragment shader interlock, variables in the main function.
// Otherwise, framebuffer color attachment outputs.
std::array<spv::Id, xenos::kMaxColorRenderTargets>
output_or_var_fragment_data_;
std::vector<spv::Id> main_interface_;
spv::Function* function_main_;
@ -698,6 +877,40 @@ class SpirvShaderTranslator : public ShaderTranslator {
spv::Id var_main_registers_;
// VS only - float3 (special exports).
spv::Id var_main_point_size_edge_flag_kill_vertex_;
// PS, only when needed - bool.
spv::Id var_main_kill_pixel_;
// PS, only when writing to color render targets with fragment shader
// interlock - uint.
  // Which color render targets have been written to; if a target was not
  // written to on the taken execution path, it's not exported, according to
  // the Direct3D 9 register documentation (some games rely on this behavior).
spv::Id var_main_fsi_color_written_;
// Loaded by FSI_LoadSampleMask.
// Can be modified on the outermost control flow level in the main function.
// 0:3 - Per-sample coverage at the current stage of the shader's execution.
// Affected by things like gl_SampleMaskIn, early or late depth /
// stencil (always resets bits for failing, no matter if need to defer
// writing), alpha to coverage.
// 4:7 - Depth write deferred mask - when early depth / stencil resulted in a
// different value for the sample (like different stencil if the test
// failed), but can't write it before running the shader because it's
// not known if the sample will be discarded by the shader, alphatest or
// AtoC.
// Early depth / stencil rejection of the pixel is possible when both 0:3 and
// 4:7 are zero.
spv::Id main_fsi_sample_mask_;
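  // Example (hypothetical 4x MSAA state): samples 0 and 2 covered - bits 0:3
  // hold 0b0101; sample 1 failed the stencil test but its new stencil value
  // still needs a late write - bit 4 + 1 is set, so the mask is 0b00100101.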
// Loaded by FSI_LoadEdramOffsets.
// Including the depth render target base.
spv::Id main_fsi_address_depth_;
// Not including the render target base.
spv::Id main_fsi_offset_32bpp_;
spv::Id main_fsi_offset_64bpp_;
  // Loaded by FSI_DepthStencilTest for early depth / stencil - the depth /
  // stencil values to write at the end of the shader if specified in
  // main_fsi_sample_mask_ and the samples were not discarded later, after the
  // early test.
std::array<spv::Id, 4> main_fsi_late_write_depth_stencil_;
spv::Block* main_fsi_early_depth_stencil_execute_quad_merge_;
spv::Block* main_loop_header_;
spv::Block* main_loop_continue_;
spv::Block* main_loop_merge_;

View File

@ -123,7 +123,7 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation(
: spv::NoType;
// In case the paired scalar instruction (if processed first) terminates the
// block.
EnsureBuildPointAvailable();
// Lookup table for variants of instructions with similar structure.
@ -838,9 +838,15 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation(
SpirvCreateSelectionMerge(merge_block.getId());
builder_->createConditionalBranch(condition, &kill_block, &merge_block);
builder_->setBuildPoint(&kill_block);
// Kill without influencing the control flow in the translated shader.
if (var_main_kill_pixel_ != spv::NoResult) {
builder_->createStore(builder_->makeBoolConstant(true),
var_main_kill_pixel_);
}
if (features_.demote_to_helper_invocation) {
builder_->createNoResultOp(spv::OpDemoteToHelperInvocationEXT);
}
builder_->createBranch(&merge_block);
builder_->setBuildPoint(&merge_block);
return const_float_0_;
}
@ -938,7 +944,7 @@ spv::Id SpirvShaderTranslator::ProcessScalarAluOperation(
}
// In case the paired vector instruction (if processed first) terminates the
// block.
EnsureBuildPointAvailable();
// Lookup table for variants of instructions with similar structure.
@ -1393,9 +1399,15 @@ spv::Id SpirvShaderTranslator::ProcessScalarAluOperation(
SpirvCreateSelectionMerge(merge_block.getId());
builder_->createConditionalBranch(condition, &kill_block, &merge_block);
builder_->setBuildPoint(&kill_block);
// Kill without influencing the control flow in the translated shader.
if (var_main_kill_pixel_ != spv::NoResult) {
builder_->createStore(builder_->makeBoolConstant(true),
var_main_kill_pixel_);
}
if (features_.demote_to_helper_invocation) {
builder_->createNoResultOp(spv::OpDemoteToHelperInvocationEXT);
}
builder_->createBranch(&merge_block);
builder_->setBuildPoint(&merge_block);
return const_float_0_;
}

View File

@ -1898,30 +1898,14 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction(
builder_->setBuildPoint(&block_dimension_stacked_start);
if (use_computed_lod) {
// Extract 2D gradients for stacked textures which are 2D arrays.
uint_vector_temp_.clear();
uint_vector_temp_.reserve(2);
uint_vector_temp_.push_back(0);
uint_vector_temp_.push_back(1);
texture_parameters.gradX = builder_->createRvalueSwizzle(
spv::NoPrecision, type_float2_, gradients_h, uint_vector_temp_);
texture_parameters.gradY = builder_->createRvalueSwizzle(
spv::NoPrecision, type_float2_, gradients_v, uint_vector_temp_);
}
// Check if linear filtering is needed.
bool vol_mag_filter_is_fetch_const =

File diff suppressed because it is too large

View File

@ -67,9 +67,6 @@ const VkDescriptorPoolSize
{VK_DESCRIPTOR_TYPE_SAMPLER, kLinkedTypeDescriptorPoolSetCount},
};
VulkanCommandProcessor::VulkanCommandProcessor(
VulkanGraphicsSystem* graphics_system, kernel::KernelState* kernel_state)
: CommandProcessor(graphics_system, kernel_state),
@ -106,6 +103,32 @@ void VulkanCommandProcessor::TracePlaybackWroteMemory(uint32_t base_ptr,
void VulkanCommandProcessor::RestoreEdramSnapshot(const void* snapshot) {}
std::string VulkanCommandProcessor::GetWindowTitleText() const {
std::ostringstream title;
title << "Vulkan";
if (render_target_cache_) {
switch (render_target_cache_->GetPath()) {
case RenderTargetCache::Path::kHostRenderTargets:
title << " - FBO";
break;
case RenderTargetCache::Path::kPixelShaderInterlock:
title << " - FSI";
break;
default:
break;
}
uint32_t draw_resolution_scale_x =
texture_cache_ ? texture_cache_->draw_resolution_scale_x() : 1;
uint32_t draw_resolution_scale_y =
texture_cache_ ? texture_cache_->draw_resolution_scale_y() : 1;
if (draw_resolution_scale_x > 1 || draw_resolution_scale_y > 1) {
title << ' ' << draw_resolution_scale_x << 'x' << draw_resolution_scale_y;
}
}
title << " - HEAVILY INCOMPLETE, early development";
return title.str();
}
bool VulkanCommandProcessor::SetupContext() {
if (!CommandProcessor::SetupContext()) {
XELOGE("Failed to initialize base command processor context");
@ -146,7 +169,7 @@ bool VulkanCommandProcessor::SetupContext() {
size_t(16384)),
size_t(uniform_buffer_alignment)));
// Descriptor set layouts.
// Descriptor set layouts that don't depend on the setup of other subsystems.
VkShaderStageFlags guest_shader_stages =
guest_shader_vertex_stages_ | VK_SHADER_STAGE_FRAGMENT_BIT;
// Empty.
@ -163,37 +186,6 @@ bool VulkanCommandProcessor::SetupContext() {
XELOGE("Failed to create an empty Vulkan descriptor set layout");
return false;
}
// Guest draw constants.
VkDescriptorSetLayoutBinding descriptor_set_layout_bindings_constants
[SpirvShaderTranslator::kConstantBufferCount] = {};
@ -289,16 +281,70 @@ bool VulkanCommandProcessor::SetupContext() {
return false;
}
uint32_t shared_memory_binding_count_log2 =
SpirvShaderTranslator::GetSharedMemoryStorageBufferCountLog2(
provider.device_properties().limits.maxStorageBufferRange);
uint32_t shared_memory_binding_count = UINT32_C(1)
<< shared_memory_binding_count_log2;
// Requires the transient descriptor set layouts.
// TODO(Triang3l): Get the actual draw resolution scale when the texture cache
// supports resolution scaling.
render_target_cache_ = std::make_unique<VulkanRenderTargetCache>(
*register_file_, *memory_, trace_writer_, 1, 1, *this);
if (!render_target_cache_->Initialize()) {
if (!render_target_cache_->Initialize(shared_memory_binding_count)) {
XELOGE("Failed to initialize the render target cache");
return false;
}
// Shared memory and EDRAM descriptor set layout.
bool edram_fragment_shader_interlock =
render_target_cache_->GetPath() ==
RenderTargetCache::Path::kPixelShaderInterlock;
VkDescriptorSetLayoutBinding
shared_memory_and_edram_descriptor_set_layout_bindings[2];
shared_memory_and_edram_descriptor_set_layout_bindings[0].binding = 0;
shared_memory_and_edram_descriptor_set_layout_bindings[0].descriptorType =
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
shared_memory_and_edram_descriptor_set_layout_bindings[0].descriptorCount =
shared_memory_binding_count;
shared_memory_and_edram_descriptor_set_layout_bindings[0].stageFlags =
guest_shader_stages;
shared_memory_and_edram_descriptor_set_layout_bindings[0].pImmutableSamplers =
nullptr;
VkDescriptorSetLayoutCreateInfo
shared_memory_and_edram_descriptor_set_layout_create_info;
shared_memory_and_edram_descriptor_set_layout_create_info.sType =
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
shared_memory_and_edram_descriptor_set_layout_create_info.pNext = nullptr;
shared_memory_and_edram_descriptor_set_layout_create_info.flags = 0;
shared_memory_and_edram_descriptor_set_layout_create_info.pBindings =
shared_memory_and_edram_descriptor_set_layout_bindings;
if (edram_fragment_shader_interlock) {
// EDRAM.
shared_memory_and_edram_descriptor_set_layout_bindings[1].binding = 1;
shared_memory_and_edram_descriptor_set_layout_bindings[1].descriptorType =
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
shared_memory_and_edram_descriptor_set_layout_bindings[1].descriptorCount =
1;
shared_memory_and_edram_descriptor_set_layout_bindings[1].stageFlags =
VK_SHADER_STAGE_FRAGMENT_BIT;
shared_memory_and_edram_descriptor_set_layout_bindings[1]
.pImmutableSamplers = nullptr;
shared_memory_and_edram_descriptor_set_layout_create_info.bindingCount = 2;
} else {
shared_memory_and_edram_descriptor_set_layout_create_info.bindingCount = 1;
}
if (dfn.vkCreateDescriptorSetLayout(
device, &shared_memory_and_edram_descriptor_set_layout_create_info,
nullptr,
&descriptor_set_layout_shared_memory_and_edram_) != VK_SUCCESS) {
XELOGE(
"Failed to create a Vulkan descriptor set layout for the shared memory "
"and the EDRAM");
return false;
}
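  // For reference, how the set created above is typically bound for guest
  // draws (illustrative only; the actual set number and command buffer come
  // from the pipeline layout and the deferred command buffer):
  //   dfn.vkCmdBindDescriptorSets(
  //       command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout,
  //       set_index, 1, &shared_memory_and_edram_descriptor_set_, 0, nullptr);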
pipeline_cache_ = std::make_unique<VulkanPipelineCache>(
*this, *register_file_, *render_target_cache_,
guest_shader_vertex_stages_);
@ -320,9 +366,8 @@ bool VulkanCommandProcessor::SetupContext() {
// Shared memory and EDRAM common bindings.
VkDescriptorPoolSize descriptor_pool_sizes[1];
descriptor_pool_sizes[0].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
descriptor_pool_sizes[0].descriptorCount =
shared_memory_binding_count + uint32_t(edram_fragment_shader_interlock);
VkDescriptorPoolCreateInfo descriptor_pool_create_info;
descriptor_pool_create_info.sType =
VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
@ -369,20 +414,45 @@ bool VulkanCommandProcessor::SetupContext() {
shared_memory_binding_range * i;
shared_memory_descriptor_buffer_info.range = shared_memory_binding_range;
}
VkWriteDescriptorSet write_descriptor_sets[2];
VkWriteDescriptorSet& write_descriptor_set_shared_memory =
write_descriptor_sets[0];
write_descriptor_set_shared_memory.sType =
VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
write_descriptor_set_shared_memory.pNext = nullptr;
write_descriptor_set_shared_memory.dstSet =
shared_memory_and_edram_descriptor_set_;
write_descriptor_set_shared_memory.dstBinding = 0;
write_descriptor_set_shared_memory.dstArrayElement = 0;
write_descriptor_set_shared_memory.descriptorCount =
shared_memory_binding_count;
write_descriptor_set_shared_memory.descriptorType =
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
write_descriptor_set_shared_memory.pImageInfo = nullptr;
write_descriptor_set_shared_memory.pBufferInfo =
shared_memory_descriptor_buffers_info;
write_descriptor_set_shared_memory.pTexelBufferView = nullptr;
VkDescriptorBufferInfo edram_descriptor_buffer_info;
if (edram_fragment_shader_interlock) {
edram_descriptor_buffer_info.buffer = render_target_cache_->edram_buffer();
edram_descriptor_buffer_info.offset = 0;
edram_descriptor_buffer_info.range = VK_WHOLE_SIZE;
VkWriteDescriptorSet& write_descriptor_set_edram = write_descriptor_sets[1];
write_descriptor_set_edram.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
write_descriptor_set_edram.pNext = nullptr;
write_descriptor_set_edram.dstSet = shared_memory_and_edram_descriptor_set_;
write_descriptor_set_edram.dstBinding = 1;
write_descriptor_set_edram.dstArrayElement = 0;
write_descriptor_set_edram.descriptorCount = 1;
write_descriptor_set_edram.descriptorType =
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
write_descriptor_set_edram.pImageInfo = nullptr;
write_descriptor_set_edram.pBufferInfo = &edram_descriptor_buffer_info;
write_descriptor_set_edram.pTexelBufferView = nullptr;
}
dfn.vkUpdateDescriptorSets(device,
1 + uint32_t(edram_fragment_shader_interlock),
write_descriptor_sets, 0, nullptr);
// Swap objects.
@ -1041,6 +1111,9 @@ void VulkanCommandProcessor::ShutdownContext() {
}
descriptor_set_layouts_textures_.clear();
ui::vulkan::util::DestroyAndNullHandle(
dfn.vkDestroyDescriptorSetLayout, device,
descriptor_set_layout_shared_memory_and_edram_);
for (VkDescriptorSetLayout& descriptor_set_layout_single_transient :
descriptor_set_layouts_single_transient_) {
ui::vulkan::util::DestroyAndNullHandle(
@ -1050,9 +1123,6 @@ void VulkanCommandProcessor::ShutdownContext() {
ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorSetLayout,
device,
descriptor_set_layout_constants_);
ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorSetLayout,
device, descriptor_set_layout_empty_);
@ -2401,7 +2471,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
// Update system constants before uploading them.
UpdateSystemConstantValues(primitive_polygonal, primitive_processing_result,
shader_32bit_index_dma, viewport_info,
used_texture_mask, normalized_depth_control,
normalized_color_mask);
// Update uniform buffers and descriptor sets after binding the pipeline with
// the new layout.
@ -2461,6 +2532,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
// After all commands that may dispatch, copy or insert barriers, submit the
// barriers (may end the render pass), and (re)enter the render pass before
// drawing.
// TODO(Triang3l): Handle disabled variableMultisampleRate by restarting the
// render pass with no attachments if the sample count becomes different.
SubmitBarriersAndEnterRenderTargetCacheRenderPass(
render_target_cache_->last_update_render_pass(),
render_target_cache_->last_update_framebuffer());
@ -3180,175 +3253,180 @@ void VulkanCommandProcessor::UpdateDynamicState(
scissor_rect.extent.height = scissor.extent[1];
SetScissor(scissor_rect);
if (render_target_cache_->GetPath() ==
RenderTargetCache::Path::kHostRenderTargets) {
// Depth bias.
float depth_bias_constant_factor, depth_bias_slope_factor;
draw_util::GetPreferredFacePolygonOffset(regs, primitive_polygonal,
depth_bias_slope_factor,
depth_bias_constant_factor);
depth_bias_constant_factor *=
regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
xenos::DepthRenderTargetFormat::kD24S8
? draw_util::kD3D10PolygonOffsetFactorUnorm24
: draw_util::kD3D10PolygonOffsetFactorFloat24;
// With non-square resolution scaling, make sure the worst-case impact is
// reverted (slope only along the scaled axis), thus max. More bias is
// better than less bias, because less bias means Z fighting with the
// background is more likely.
depth_bias_slope_factor *=
xenos::kPolygonOffsetScaleSubpixelUnit *
float(std::max(render_target_cache_->draw_resolution_scale_x(),
render_target_cache_->draw_resolution_scale_y()));
// std::memcmp instead of != so in case of NaN, every draw won't be
// invalidating it.
dynamic_depth_bias_update_needed_ |=
std::memcmp(&dynamic_depth_bias_constant_factor_,
&depth_bias_constant_factor, sizeof(float)) != 0;
dynamic_depth_bias_update_needed_ |=
std::memcmp(&dynamic_depth_bias_slope_factor_, &depth_bias_slope_factor,
sizeof(float)) != 0;
if (dynamic_depth_bias_update_needed_) {
dynamic_depth_bias_constant_factor_ = depth_bias_constant_factor;
dynamic_depth_bias_slope_factor_ = depth_bias_slope_factor;
deferred_command_buffer_.CmdVkSetDepthBias(
dynamic_depth_bias_constant_factor_, 0.0f,
dynamic_depth_bias_slope_factor_);
dynamic_depth_bias_update_needed_ = false;
}
// Blend constants.
float blend_constants[] = {
regs[XE_GPU_REG_RB_BLEND_RED].f32,
regs[XE_GPU_REG_RB_BLEND_GREEN].f32,
regs[XE_GPU_REG_RB_BLEND_BLUE].f32,
regs[XE_GPU_REG_RB_BLEND_ALPHA].f32,
};
dynamic_blend_constants_update_needed_ |=
std::memcmp(dynamic_blend_constants_, blend_constants,
sizeof(float) * 4) != 0;
if (dynamic_blend_constants_update_needed_) {
std::memcpy(dynamic_blend_constants_, blend_constants, sizeof(float) * 4);
deferred_command_buffer_.CmdVkSetBlendConstants(dynamic_blend_constants_);
dynamic_blend_constants_update_needed_ = false;
}
    // Stencil masks and references.
    // Due to pretty complex conditions involving registers not directly
    // related to stencil (primitive type, culling), changing the values only
    // when stencil is actually needed. However, due to the way dynamic state
    // needs to be set in Vulkan, which doesn't take into account whether the
    // state actually has effect on drawing, and because the masks and the
    // references are always dynamic in Xenia guest pipelines, they must be
    // set in the command buffer before any draw.
    if (normalized_depth_control.stencil_enable) {
      Register stencil_ref_mask_front_reg, stencil_ref_mask_back_reg;
      if (primitive_polygonal && normalized_depth_control.backface_enable) {
        const ui::vulkan::VulkanProvider& provider = GetVulkanProvider();
        const VkPhysicalDevicePortabilitySubsetFeaturesKHR*
            device_portability_subset_features =
                provider.device_portability_subset_features();
        if (!device_portability_subset_features ||
            device_portability_subset_features->separateStencilMaskRef) {
          stencil_ref_mask_front_reg = XE_GPU_REG_RB_STENCILREFMASK;
          stencil_ref_mask_back_reg = XE_GPU_REG_RB_STENCILREFMASK_BF;
        } else {
          // Choose the back face values only if drawing only back faces.
          stencil_ref_mask_front_reg =
              regs.Get<reg::PA_SU_SC_MODE_CNTL>().cull_front
                  ? XE_GPU_REG_RB_STENCILREFMASK_BF
                  : XE_GPU_REG_RB_STENCILREFMASK;
          stencil_ref_mask_back_reg = stencil_ref_mask_front_reg;
        }
      } else {
        stencil_ref_mask_front_reg = XE_GPU_REG_RB_STENCILREFMASK;
        stencil_ref_mask_back_reg = XE_GPU_REG_RB_STENCILREFMASK;
      }
      auto stencil_ref_mask_front =
          regs.Get<reg::RB_STENCILREFMASK>(stencil_ref_mask_front_reg);
      auto stencil_ref_mask_back =
          regs.Get<reg::RB_STENCILREFMASK>(stencil_ref_mask_back_reg);
      // Compare mask.
      dynamic_stencil_compare_mask_front_update_needed_ |=
          dynamic_stencil_compare_mask_front_ !=
          stencil_ref_mask_front.stencilmask;
      dynamic_stencil_compare_mask_front_ = stencil_ref_mask_front.stencilmask;
      dynamic_stencil_compare_mask_back_update_needed_ |=
          dynamic_stencil_compare_mask_back_ !=
          stencil_ref_mask_back.stencilmask;
      dynamic_stencil_compare_mask_back_ = stencil_ref_mask_back.stencilmask;
      // Write mask.
      dynamic_stencil_write_mask_front_update_needed_ |=
          dynamic_stencil_write_mask_front_ !=
          stencil_ref_mask_front.stencilwritemask;
      dynamic_stencil_write_mask_front_ =
          stencil_ref_mask_front.stencilwritemask;
      dynamic_stencil_write_mask_back_update_needed_ |=
          dynamic_stencil_write_mask_back_ !=
          stencil_ref_mask_back.stencilwritemask;
      dynamic_stencil_write_mask_back_ = stencil_ref_mask_back.stencilwritemask;
      // Reference.
      dynamic_stencil_reference_front_update_needed_ |=
          dynamic_stencil_reference_front_ != stencil_ref_mask_front.stencilref;
      dynamic_stencil_reference_front_ = stencil_ref_mask_front.stencilref;
      dynamic_stencil_reference_back_update_needed_ |=
          dynamic_stencil_reference_back_ != stencil_ref_mask_back.stencilref;
      dynamic_stencil_reference_back_ = stencil_ref_mask_back.stencilref;
    }
  // Using VK_STENCIL_FACE_FRONT_AND_BACK for higher safety when running on
  // the Vulkan portability subset without separateStencilMaskRef.
  if (dynamic_stencil_compare_mask_front_update_needed_ ||
      dynamic_stencil_compare_mask_back_update_needed_) {
    if (dynamic_stencil_compare_mask_front_ ==
        dynamic_stencil_compare_mask_back_) {
      deferred_command_buffer_.CmdVkSetStencilCompareMask(
          VK_STENCIL_FACE_FRONT_AND_BACK,
          dynamic_stencil_compare_mask_front_);
    } else {
      if (dynamic_stencil_compare_mask_front_update_needed_) {
        deferred_command_buffer_.CmdVkSetStencilCompareMask(
            VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_compare_mask_front_);
      }
      if (dynamic_stencil_compare_mask_back_update_needed_) {
        deferred_command_buffer_.CmdVkSetStencilCompareMask(
            VK_STENCIL_FACE_BACK_BIT, dynamic_stencil_compare_mask_back_);
      }
    }
    dynamic_stencil_compare_mask_front_update_needed_ = false;
    dynamic_stencil_compare_mask_back_update_needed_ = false;
  }
  if (dynamic_stencil_write_mask_front_update_needed_ ||
      dynamic_stencil_write_mask_back_update_needed_) {
    if (dynamic_stencil_write_mask_front_ ==
        dynamic_stencil_write_mask_back_) {
      deferred_command_buffer_.CmdVkSetStencilWriteMask(
          VK_STENCIL_FACE_FRONT_AND_BACK, dynamic_stencil_write_mask_front_);
    } else {
      if (dynamic_stencil_write_mask_front_update_needed_) {
        deferred_command_buffer_.CmdVkSetStencilWriteMask(
            VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_write_mask_front_);
      }
      if (dynamic_stencil_write_mask_back_update_needed_) {
        deferred_command_buffer_.CmdVkSetStencilWriteMask(
            VK_STENCIL_FACE_BACK_BIT, dynamic_stencil_write_mask_back_);
      }
    }
    dynamic_stencil_write_mask_front_update_needed_ = false;
    dynamic_stencil_write_mask_back_update_needed_ = false;
  }
  if (dynamic_stencil_reference_front_update_needed_ ||
      dynamic_stencil_reference_back_update_needed_) {
    if (dynamic_stencil_reference_front_ == dynamic_stencil_reference_back_) {
      deferred_command_buffer_.CmdVkSetStencilReference(
          VK_STENCIL_FACE_FRONT_AND_BACK, dynamic_stencil_reference_front_);
    } else {
      if (dynamic_stencil_reference_front_update_needed_) {
        deferred_command_buffer_.CmdVkSetStencilReference(
            VK_STENCIL_FACE_FRONT_BIT, dynamic_stencil_reference_front_);
      }
      if (dynamic_stencil_reference_back_update_needed_) {
        deferred_command_buffer_.CmdVkSetStencilReference(
            VK_STENCIL_FACE_BACK_BIT, dynamic_stencil_reference_back_);
      }
    }
    dynamic_stencil_reference_front_update_needed_ = false;
    dynamic_stencil_reference_back_update_needed_ = false;
  }
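  // All three dynamic stencil state updates above follow the same pattern; a
  // condensed sketch (hypothetical pseudo-helper, not part of the actual
  // code):
  //   if (front_update_needed || back_update_needed) {
  //     if (front_value == back_value) {
  //       CmdVkSetStencil*(VK_STENCIL_FACE_FRONT_AND_BACK, front_value);
  //     } else {
  //       if (front_update_needed) CmdVkSetStencil*(FRONT_BIT, front_value);
  //       if (back_update_needed) CmdVkSetStencil*(BACK_BIT, back_value);
  //     }
  //     front_update_needed = back_update_needed = false;
  //   }
  // with VK_STENCIL_FACE_FRONT_AND_BACK preferred whenever the values match.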
// TODO(Triang3l): VK_EXT_extended_dynamic_state and
@ -3359,23 +3437,67 @@ void VulkanCommandProcessor::UpdateSystemConstantValues(
bool primitive_polygonal,
const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
bool shader_32bit_index_dma, const draw_util::ViewportInfo& viewport_info,
    uint32_t used_texture_mask, reg::RB_DEPTHCONTROL normalized_depth_control,
    uint32_t normalized_color_mask) {
#if XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES
const RegisterFile& regs = *register_file_;
auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
float rb_alpha_ref = regs[XE_GPU_REG_RB_ALPHA_REF].f32;
auto rb_colorcontrol = regs.Get<reg::RB_COLORCONTROL>();
auto rb_depth_info = regs.Get<reg::RB_DEPTH_INFO>();
auto rb_stencilrefmask = regs.Get<reg::RB_STENCILREFMASK>();
auto rb_stencilrefmask_bf =
regs.Get<reg::RB_STENCILREFMASK>(XE_GPU_REG_RB_STENCILREFMASK_BF);
auto rb_surface_info = regs.Get<reg::RB_SURFACE_INFO>();
auto vgt_draw_initiator = regs.Get<reg::VGT_DRAW_INITIATOR>();
int32_t vgt_indx_offset = int32_t(regs[XE_GPU_REG_VGT_INDX_OFFSET].u32);
bool edram_fragment_shader_interlock =
render_target_cache_->GetPath() ==
RenderTargetCache::Path::kPixelShaderInterlock;
uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x();
uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y();
// Get the color info register values for each render target. Also, for FSI,
// exclude components that don't exist in the format from the write mask.
// Don't exclude fully overlapping render targets, however - two render
// targets with the same base address are used in the lighting pass of
// 4D5307E6, for example, with the needed one picked with dynamic control
// flow.
reg::RB_COLOR_INFO color_infos[xenos::kMaxColorRenderTargets];
float rt_clamp[4][4];
// Two UINT32_MAX if no components actually existing in the RT are written.
uint32_t rt_keep_masks[4][2];
for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) {
    auto color_info = regs.Get<reg::RB_COLOR_INFO>(
reg::RB_COLOR_INFO::rt_register_indices[i]);
color_infos[i] = color_info;
if (edram_fragment_shader_interlock) {
RenderTargetCache::GetPSIColorFormatInfo(
color_info.color_format, (normalized_color_mask >> (i * 4)) & 0b1111,
rt_clamp[i][0], rt_clamp[i][1], rt_clamp[i][2], rt_clamp[i][3],
rt_keep_masks[i][0], rt_keep_masks[i][1]);
}
}
// Disable depth and stencil if it aliases a color render target (for
// instance, during the XBLA logo in 58410954, though depth writing is already
// disabled there).
bool depth_stencil_enabled = normalized_depth_control.stencil_enable ||
normalized_depth_control.z_enable;
if (edram_fragment_shader_interlock && depth_stencil_enabled) {
for (uint32_t i = 0; i < 4; ++i) {
if (rb_depth_info.depth_base == color_infos[i].color_base &&
(rt_keep_masks[i][0] != UINT32_MAX ||
rt_keep_masks[i][1] != UINT32_MAX)) {
depth_stencil_enabled = false;
break;
}
}
}
bool dirty = false;
@ -3419,6 +3541,13 @@ void VulkanCommandProcessor::UpdateSystemConstantValues(
if (draw_util::IsPrimitiveLine(regs)) {
flags |= SpirvShaderTranslator::kSysFlag_PrimitiveLine;
}
// MSAA sample count.
flags |= uint32_t(rb_surface_info.msaa_samples)
<< SpirvShaderTranslator::kSysFlag_MsaaSamples_Shift;
// Depth format.
if (rb_depth_info.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) {
flags |= SpirvShaderTranslator::kSysFlag_DepthFloat24;
}
// Alpha test.
xenos::CompareFunction alpha_test_function =
rb_colorcontrol.alpha_test_enable ? rb_colorcontrol.alpha_func
@ -3433,6 +3562,30 @@ void VulkanCommandProcessor::UpdateSystemConstantValues(
flags |= SpirvShaderTranslator::kSysFlag_ConvertColor0ToGamma << i;
}
}
if (edram_fragment_shader_interlock && depth_stencil_enabled) {
flags |= SpirvShaderTranslator::kSysFlag_FSIDepthStencil;
if (normalized_depth_control.z_enable) {
flags |= uint32_t(normalized_depth_control.zfunc)
<< SpirvShaderTranslator::kSysFlag_FSIDepthPassIfLess_Shift;
if (normalized_depth_control.z_write_enable) {
flags |= SpirvShaderTranslator::kSysFlag_FSIDepthWrite;
}
} else {
// In case stencil is used without depth testing - always pass, and
// don't modify the stored depth.
flags |= SpirvShaderTranslator::kSysFlag_FSIDepthPassIfLess |
SpirvShaderTranslator::kSysFlag_FSIDepthPassIfEqual |
SpirvShaderTranslator::kSysFlag_FSIDepthPassIfGreater;
}
if (normalized_depth_control.stencil_enable) {
flags |= SpirvShaderTranslator::kSysFlag_FSIStencilTest;
}
// Hint - if not applicable to the shader, will not have effect.
if (alpha_test_function == xenos::CompareFunction::kAlways &&
!rb_colorcontrol.alpha_to_mask_enable) {
flags |= SpirvShaderTranslator::kSysFlag_FSIDepthStencilEarlyWrite;
}
}
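  // For reference (assuming the xenos::CompareFunction encoding used
  // elsewhere in the GPU code): the function is a 3-bit less/equal/greater
  // mask (kLess = 0b001, kEqual = 0b010, kGreater = 0b100), so shifting zfunc
  // by kSysFlag_FSIDepthPassIfLess_Shift sets the matching pass flags
  // directly - e.g. kLessEqual (0b011) becomes kSysFlag_FSIDepthPassIfLess |
  // kSysFlag_FSIDepthPassIfEqual, and kAlways (0b111) sets all three, like
  // the stencil-without-depth case above.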
dirty |= system_constants_.flags != flags;
system_constants_.flags = flags;
@ -3492,10 +3645,10 @@ void VulkanCommandProcessor::UpdateSystemConstantValues(
// to radius conversion to avoid multiplying the per-vertex diameter by an
// additional constant in the shader.
float point_screen_diameter_to_ndc_radius_x =
      (/* 0.5f * 2.0f * */ float(draw_resolution_scale_x)) /
std::max(viewport_info.xy_extent[0], uint32_t(1));
float point_screen_diameter_to_ndc_radius_y =
      (/* 0.5f * 2.0f * */ float(draw_resolution_scale_y)) /
std::max(viewport_info.xy_extent[1], uint32_t(1));
dirty |= system_constants_.point_screen_diameter_to_ndc_radius[0] !=
point_screen_diameter_to_ndc_radius_x;
@ -3560,7 +3713,25 @@ void VulkanCommandProcessor::UpdateSystemConstantValues(
dirty |= system_constants_.alpha_test_reference != rb_alpha_ref;
system_constants_.alpha_test_reference = rb_alpha_ref;
uint32_t edram_tile_dwords_scaled =
xenos::kEdramTileWidthSamples * xenos::kEdramTileHeightSamples *
(draw_resolution_scale_x * draw_resolution_scale_y);
// EDRAM pitch for FSI render target writing.
if (edram_fragment_shader_interlock) {
// Align, then multiply by 32bpp tile size in dwords.
uint32_t edram_32bpp_tile_pitch_dwords_scaled =
((rb_surface_info.surface_pitch *
(rb_surface_info.msaa_samples >= xenos::MsaaSamples::k4X ? 2 : 1)) +
(xenos::kEdramTileWidthSamples - 1)) /
xenos::kEdramTileWidthSamples * edram_tile_dwords_scaled;
dirty |= system_constants_.edram_32bpp_tile_pitch_dwords_scaled !=
edram_32bpp_tile_pitch_dwords_scaled;
system_constants_.edram_32bpp_tile_pitch_dwords_scaled =
edram_32bpp_tile_pitch_dwords_scaled;
}
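  // Worked example, assuming the usual 80x16-sample EDRAM tiles and 1x1
  // resolution scaling: surface_pitch = 1280 at 4x MSAA covers 2560 samples
  // horizontally, (2560 + 79) / 80 = 32 tiles, and with
  // edram_tile_dwords_scaled = 80 * 16 = 1280 dwords per tile, the pitch is
  // 32 * 1280 = 40960 dwords.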
// Color exponent bias and FSI render target writing.
for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) {
reg::RB_COLOR_INFO color_info = color_infos[i];
// Exponent bias is in bits 20:25 of RB_COLOR_INFO.
@ -3581,6 +3752,148 @@ void VulkanCommandProcessor::UpdateSystemConstantValues(
UINT32_C(0x3F800000) + (color_exp_bias << 23);
dirty |= system_constants_.color_exp_bias[i] != color_exp_bias_scale;
system_constants_.color_exp_bias[i] = color_exp_bias_scale;
if (edram_fragment_shader_interlock) {
dirty |=
system_constants_.edram_rt_keep_mask[i][0] != rt_keep_masks[i][0];
system_constants_.edram_rt_keep_mask[i][0] = rt_keep_masks[i][0];
dirty |=
system_constants_.edram_rt_keep_mask[i][1] != rt_keep_masks[i][1];
system_constants_.edram_rt_keep_mask[i][1] = rt_keep_masks[i][1];
if (rt_keep_masks[i][0] != UINT32_MAX ||
rt_keep_masks[i][1] != UINT32_MAX) {
uint32_t rt_base_dwords_scaled =
color_info.color_base * edram_tile_dwords_scaled;
dirty |= system_constants_.edram_rt_base_dwords_scaled[i] !=
rt_base_dwords_scaled;
system_constants_.edram_rt_base_dwords_scaled[i] =
rt_base_dwords_scaled;
uint32_t format_flags =
RenderTargetCache::AddPSIColorFormatFlags(color_info.color_format);
dirty |= system_constants_.edram_rt_format_flags[i] != format_flags;
system_constants_.edram_rt_format_flags[i] = format_flags;
uint32_t blend_factors_ops =
regs[reg::RB_BLENDCONTROL::rt_register_indices[i]].u32 & 0x1FFF1FFF;
dirty |= system_constants_.edram_rt_blend_factors_ops[i] !=
blend_factors_ops;
system_constants_.edram_rt_blend_factors_ops[i] = blend_factors_ops;
// Can't do float comparisons here because NaNs would result in always
// setting the dirty flag.
dirty |= std::memcmp(system_constants_.edram_rt_clamp[i], rt_clamp[i],
4 * sizeof(float)) != 0;
std::memcpy(system_constants_.edram_rt_clamp[i], rt_clamp[i],
4 * sizeof(float));
}
}
}
if (edram_fragment_shader_interlock) {
uint32_t depth_base_dwords_scaled =
rb_depth_info.depth_base * edram_tile_dwords_scaled;
dirty |= system_constants_.edram_depth_base_dwords_scaled !=
depth_base_dwords_scaled;
system_constants_.edram_depth_base_dwords_scaled = depth_base_dwords_scaled;
    // For non-polygons, the front polygon offset is used, and it's enabled if
    // POLY_OFFSET_PARA_ENABLED is set; for polygons, separate front and back
    // offsets are used.
float poly_offset_front_scale = 0.0f, poly_offset_front_offset = 0.0f;
float poly_offset_back_scale = 0.0f, poly_offset_back_offset = 0.0f;
if (primitive_polygonal) {
if (pa_su_sc_mode_cntl.poly_offset_front_enable) {
poly_offset_front_scale =
regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE].f32;
poly_offset_front_offset =
regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET].f32;
}
if (pa_su_sc_mode_cntl.poly_offset_back_enable) {
poly_offset_back_scale =
regs[XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_SCALE].f32;
poly_offset_back_offset =
regs[XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_OFFSET].f32;
}
} else {
if (pa_su_sc_mode_cntl.poly_offset_para_enable) {
poly_offset_front_scale =
regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE].f32;
poly_offset_front_offset =
regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET].f32;
poly_offset_back_scale = poly_offset_front_scale;
poly_offset_back_offset = poly_offset_front_offset;
}
}
// With non-square resolution scaling, make sure the worst-case impact is
// reverted (slope only along the scaled axis), thus max. More bias is
// better than less bias, because less bias means Z fighting with the
// background is more likely.
float poly_offset_scale_factor =
xenos::kPolygonOffsetScaleSubpixelUnit *
std::max(draw_resolution_scale_x, draw_resolution_scale_y);
poly_offset_front_scale *= poly_offset_scale_factor;
poly_offset_back_scale *= poly_offset_scale_factor;
dirty |= system_constants_.edram_poly_offset_front_scale !=
poly_offset_front_scale;
system_constants_.edram_poly_offset_front_scale = poly_offset_front_scale;
dirty |= system_constants_.edram_poly_offset_front_offset !=
poly_offset_front_offset;
system_constants_.edram_poly_offset_front_offset = poly_offset_front_offset;
dirty |= system_constants_.edram_poly_offset_back_scale !=
poly_offset_back_scale;
system_constants_.edram_poly_offset_back_scale = poly_offset_back_scale;
dirty |= system_constants_.edram_poly_offset_back_offset !=
poly_offset_back_offset;
system_constants_.edram_poly_offset_back_offset = poly_offset_back_offset;
if (depth_stencil_enabled && normalized_depth_control.stencil_enable) {
uint32_t stencil_front_reference_masks =
rb_stencilrefmask.value & 0xFFFFFF;
dirty |= system_constants_.edram_stencil_front_reference_masks !=
stencil_front_reference_masks;
system_constants_.edram_stencil_front_reference_masks =
stencil_front_reference_masks;
uint32_t stencil_func_ops =
(normalized_depth_control.value >> 8) & ((1 << 12) - 1);
dirty |=
system_constants_.edram_stencil_front_func_ops != stencil_func_ops;
system_constants_.edram_stencil_front_func_ops = stencil_func_ops;
if (primitive_polygonal && normalized_depth_control.backface_enable) {
uint32_t stencil_back_reference_masks =
rb_stencilrefmask_bf.value & 0xFFFFFF;
dirty |= system_constants_.edram_stencil_back_reference_masks !=
stencil_back_reference_masks;
system_constants_.edram_stencil_back_reference_masks =
stencil_back_reference_masks;
uint32_t stencil_func_ops_bf =
(normalized_depth_control.value >> 20) & ((1 << 12) - 1);
dirty |= system_constants_.edram_stencil_back_func_ops !=
stencil_func_ops_bf;
system_constants_.edram_stencil_back_func_ops = stencil_func_ops_bf;
} else {
dirty |= std::memcmp(system_constants_.edram_stencil_back,
system_constants_.edram_stencil_front,
2 * sizeof(uint32_t)) != 0;
std::memcpy(system_constants_.edram_stencil_back,
system_constants_.edram_stencil_front,
2 * sizeof(uint32_t));
}
}
dirty |= system_constants_.edram_blend_constant[0] !=
regs[XE_GPU_REG_RB_BLEND_RED].f32;
system_constants_.edram_blend_constant[0] =
regs[XE_GPU_REG_RB_BLEND_RED].f32;
dirty |= system_constants_.edram_blend_constant[1] !=
regs[XE_GPU_REG_RB_BLEND_GREEN].f32;
system_constants_.edram_blend_constant[1] =
regs[XE_GPU_REG_RB_BLEND_GREEN].f32;
dirty |= system_constants_.edram_blend_constant[2] !=
regs[XE_GPU_REG_RB_BLEND_BLUE].f32;
system_constants_.edram_blend_constant[2] =
regs[XE_GPU_REG_RB_BLEND_BLUE].f32;
dirty |= system_constants_.edram_blend_constant[3] !=
regs[XE_GPU_REG_RB_BLEND_ALPHA].f32;
system_constants_.edram_blend_constant[3] =
regs[XE_GPU_REG_RB_BLEND_ALPHA].f32;
}
if (dirty) {

View File

@ -16,6 +16,7 @@
#include <deque>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
@ -256,6 +257,9 @@ class VulkanCommandProcessor : public CommandProcessor {
void SetViewport(const VkViewport& viewport);
void SetScissor(const VkRect2D& scissor);
// Returns the text to display as the GPU backend name in the window title.
std::string GetWindowTitleText() const;
protected:
bool SetupContext() override;
void ShutdownContext() override;
@ -437,7 +441,8 @@ class VulkanCommandProcessor : public CommandProcessor {
bool primitive_polygonal,
const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
bool shader_32bit_index_dma, const draw_util::ViewportInfo& viewport_info,
      uint32_t used_texture_mask, reg::RB_DEPTHCONTROL normalized_depth_control,
      uint32_t normalized_color_mask);
bool UpdateBindings(const VulkanShader* vertex_shader,
const VulkanShader* pixel_shader);
// Allocates a descriptor set and fills one or two VkWriteDescriptorSet
@ -514,12 +519,12 @@ class VulkanCommandProcessor : public CommandProcessor {
// Descriptor set layouts used by different shaders.
VkDescriptorSetLayout descriptor_set_layout_empty_ = VK_NULL_HANDLE;
VkDescriptorSetLayout descriptor_set_layout_constants_ = VK_NULL_HANDLE;
std::array<VkDescriptorSetLayout,
size_t(SingleTransientDescriptorLayout::kCount)>
descriptor_set_layouts_single_transient_{};
VkDescriptorSetLayout descriptor_set_layout_shared_memory_and_edram_ =
VK_NULL_HANDLE;
// Descriptor set layouts are referenced by pipeline_layouts_.
std::unordered_map<TextureDescriptorSetLayoutKey, VkDescriptorSetLayout,
@ -649,6 +654,9 @@ class VulkanCommandProcessor : public CommandProcessor {
// declared as dynamic in the pipeline) invalidates such dynamic state.
VkViewport dynamic_viewport_;
VkRect2D dynamic_scissor_;
// Dynamic fixed-function depth bias, blend constants, and stencil state are
// applicable only to the render target implementations where they are
// actually involved.
float dynamic_depth_bias_constant_factor_;
float dynamic_depth_bias_slope_factor_;
float dynamic_blend_constants_[4];

View File

@ -21,6 +21,15 @@ VulkanGraphicsSystem::VulkanGraphicsSystem() {}
VulkanGraphicsSystem::~VulkanGraphicsSystem() {}
std::string VulkanGraphicsSystem::name() const {
auto vulkan_command_processor =
static_cast<VulkanCommandProcessor*>(command_processor());
if (vulkan_command_processor != nullptr) {
return vulkan_command_processor->GetWindowTitleText();
}
return "Vulkan - HEAVILY INCOMPLETE, early development";
}
X_STATUS VulkanGraphicsSystem::Setup(cpu::Processor* processor,
kernel::KernelState* kernel_state,
ui::WindowedAppContext* app_context,

View File

@ -26,9 +26,7 @@ class VulkanGraphicsSystem : public GraphicsSystem {
static bool IsAvailable() { return true; }
std::string name() const override;
X_STATUS Setup(cpu::Processor* processor, kernel::KernelState* kernel_state,
ui::WindowedAppContext* app_context,

View File

@ -15,6 +15,7 @@
#include <cstring>
#include <memory>
#include <utility>
#include <vector>
#include "third_party/fmt/include/fmt/format.h"
#include "third_party/glslang/SPIRV/SpvBuilder.h"
@ -53,8 +54,32 @@ bool VulkanPipelineCache::Initialize() {
const ui::vulkan::VulkanProvider& provider =
command_processor_.GetVulkanProvider();
bool edram_fragment_shader_interlock =
render_target_cache_.GetPath() ==
RenderTargetCache::Path::kPixelShaderInterlock;
shader_translator_ = std::make_unique<SpirvShaderTranslator>(
      SpirvShaderTranslator::Features(provider),
render_target_cache_.msaa_2x_attachments_supported(),
render_target_cache_.msaa_2x_no_attachments_supported(),
edram_fragment_shader_interlock);
if (edram_fragment_shader_interlock) {
std::vector<uint8_t> depth_only_fragment_shader_code =
shader_translator_->CreateDepthOnlyFragmentShader();
depth_only_fragment_shader_ = ui::vulkan::util::CreateShaderModule(
provider,
reinterpret_cast<const uint32_t*>(
depth_only_fragment_shader_code.data()),
depth_only_fragment_shader_code.size());
if (depth_only_fragment_shader_ == VK_NULL_HANDLE) {
XELOGE(
"VulkanPipelineCache: Failed to create the depth/stencil-only "
"fragment shader for the fragment shader interlock render backend "
"implementation");
return false;
}
}
return true;
}
@ -75,6 +100,8 @@ void VulkanPipelineCache::Shutdown() {
pipelines_.clear();
// Destroy all internal shaders.
ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyShaderModule, device,
depth_only_fragment_shader_);
for (const auto& geometry_shader_pair : geometry_shaders_) {
if (geometry_shader_pair.second != VK_NULL_HANDLE) {
dfn.vkDestroyShaderModule(device, geometry_shader_pair.second, nullptr);
@ -179,15 +206,18 @@ VulkanPipelineCache::GetCurrentPixelShaderModification(
modification.pixel.param_gen_point = 0;
}
if (render_target_cache_.GetPath() ==
RenderTargetCache::Path::kHostRenderTargets) {
using DepthStencilMode =
SpirvShaderTranslator::Modification::DepthStencilMode;
if (shader.implicit_early_z_write_allowed() &&
(!shader.writes_color_target(0) ||
!draw_util::DoesCoverageDependOnAlpha(
regs.Get<reg::RB_COLORCONTROL>()))) {
modification.pixel.depth_stencil_mode = DepthStencilMode::kEarlyHint;
} else {
modification.pixel.depth_stencil_mode = DepthStencilMode::kNoModifiers;
}
}
return modification;
@ -303,7 +333,11 @@ bool VulkanPipelineCache::ConfigurePipeline(
}
}
VkRenderPass render_pass =
render_target_cache_.GetPath() ==
RenderTargetCache::Path::kPixelShaderInterlock
? render_target_cache_.GetFragmentShaderInterlockRenderPass()
: render_target_cache_.GetHostRenderTargetsRenderPass(
render_pass_key);
if (render_pass == VK_NULL_HANDLE) {
return false;
}
@ -603,123 +637,127 @@ bool VulkanPipelineCache::GetCurrentStateDescription(
description_out.polygon_mode = PipelinePolygonMode::kFill;
}
  if (render_target_cache_.GetPath() ==
      RenderTargetCache::Path::kHostRenderTargets) {
    if (render_pass_key.depth_and_color_used & 1) {
      if (normalized_depth_control.z_enable) {
        description_out.depth_write_enable =
            normalized_depth_control.z_write_enable;
        description_out.depth_compare_op = normalized_depth_control.zfunc;
      } else {
        description_out.depth_compare_op = xenos::CompareFunction::kAlways;
      }
      if (normalized_depth_control.stencil_enable) {
        description_out.stencil_test_enable = 1;
        description_out.stencil_front_fail_op =
            normalized_depth_control.stencilfail;
        description_out.stencil_front_pass_op =
            normalized_depth_control.stencilzpass;
        description_out.stencil_front_depth_fail_op =
            normalized_depth_control.stencilzfail;
        description_out.stencil_front_compare_op =
            normalized_depth_control.stencilfunc;
        if (primitive_polygonal && normalized_depth_control.backface_enable) {
          description_out.stencil_back_fail_op =
              normalized_depth_control.stencilfail_bf;
          description_out.stencil_back_pass_op =
              normalized_depth_control.stencilzpass_bf;
          description_out.stencil_back_depth_fail_op =
              normalized_depth_control.stencilzfail_bf;
          description_out.stencil_back_compare_op =
              normalized_depth_control.stencilfunc_bf;
        } else {
          description_out.stencil_back_fail_op =
              description_out.stencil_front_fail_op;
          description_out.stencil_back_pass_op =
              description_out.stencil_front_pass_op;
          description_out.stencil_back_depth_fail_op =
              description_out.stencil_front_depth_fail_op;
          description_out.stencil_back_compare_op =
              description_out.stencil_front_compare_op;
        }
      }
    }

    // Color blending and write masks (filled only for the attachments present
    // in the render pass object).
    uint32_t render_pass_color_rts = render_pass_key.depth_and_color_used >> 1;
    if (device_features.independentBlend) {
      uint32_t render_pass_color_rts_remaining = render_pass_color_rts;
      uint32_t color_rt_index;
      while (xe::bit_scan_forward(render_pass_color_rts_remaining,
                                  &color_rt_index)) {
        render_pass_color_rts_remaining &= ~(uint32_t(1) << color_rt_index);
        WritePipelineRenderTargetDescription(
            regs.Get<reg::RB_BLENDCONTROL>(
                reg::RB_BLENDCONTROL::rt_register_indices[color_rt_index]),
            (normalized_color_mask >> (color_rt_index * 4)) & 0b1111,
            description_out.render_targets[color_rt_index]);
      }
    } else {
      // Take the blend control for the first render target that the guest
      // wants to write to (consider it the most important) and use it for all
      // render targets, if any.
      // TODO(Triang3l): Implement an option for independent blending via
      // replaying the render pass for each set of render targets with unique
      // blending parameters, with depth / stencil saved before the first and
      // restored before each of the rest maybe? Though independent blending
      // support is pretty wide, with a quite prominent exception of Adreno
      // 4xx apparently.
      uint32_t render_pass_color_rts_remaining = render_pass_color_rts;
      uint32_t render_pass_first_color_rt_index;
      if (xe::bit_scan_forward(render_pass_color_rts_remaining,
                               &render_pass_first_color_rt_index)) {
        render_pass_color_rts_remaining &=
            ~(uint32_t(1) << render_pass_first_color_rt_index);
        PipelineRenderTarget& render_pass_first_color_rt =
            description_out.render_targets[render_pass_first_color_rt_index];
        uint32_t common_blend_rt_index;
        if (xe::bit_scan_forward(normalized_color_mask,
                                 &common_blend_rt_index)) {
          common_blend_rt_index >>= 2;
          // If a common write mask will be used for multiple render targets,
          // use the original RB_COLOR_MASK instead of the normalized color
          // mask as the normalized color mask has non-existent components
          // forced to written (don't need reading to be preserved), while the
          // number of components may vary between render targets. The
          // attachments in the pass that must not be written to at all will
          // be excluded via a shader modification.
          WritePipelineRenderTargetDescription(
              regs.Get<reg::RB_BLENDCONTROL>(
                  reg::RB_BLENDCONTROL::rt_register_indices
                      [common_blend_rt_index]),
              (((normalized_color_mask &
                 ~(uint32_t(0b1111) << (4 * common_blend_rt_index)))
                    ? regs[XE_GPU_REG_RB_COLOR_MASK].u32
                    : normalized_color_mask) >>
               (4 * common_blend_rt_index)) &
                  0b1111,
              render_pass_first_color_rt);
        } else {
          // No render targets are written to, though the render pass still
          // may contain color attachments - set them to not written and not
          // blending.
          render_pass_first_color_rt.src_color_blend_factor =
              PipelineBlendFactor::kOne;
          render_pass_first_color_rt.dst_color_blend_factor =
              PipelineBlendFactor::kZero;
          render_pass_first_color_rt.color_blend_op = xenos::BlendOp::kAdd;
          render_pass_first_color_rt.src_alpha_blend_factor =
              PipelineBlendFactor::kOne;
          render_pass_first_color_rt.dst_alpha_blend_factor =
              PipelineBlendFactor::kZero;
          render_pass_first_color_rt.alpha_blend_op = xenos::BlendOp::kAdd;
        }
        // Reuse the same blending settings for all render targets in the
        // pass, for description consistency.
        uint32_t color_rt_index;
        while (xe::bit_scan_forward(render_pass_color_rts_remaining,
                                    &color_rt_index)) {
          render_pass_color_rts_remaining &= ~(uint32_t(1) << color_rt_index);
          description_out.render_targets[color_rt_index] =
              render_pass_first_color_rt;
        }
      }
    }
  }
@ -1929,6 +1967,10 @@ bool VulkanPipelineCache::EnsurePipelineCreated(
command_processor_.GetVulkanProvider();
const VkPhysicalDeviceFeatures& device_features = provider.device_features();
bool edram_fragment_shader_interlock =
render_target_cache_.GetPath() ==
RenderTargetCache::Path::kPixelShaderInterlock;
std::array<VkPipelineShaderStageCreateInfo, 3> shader_stages;
uint32_t shader_stage_count = 0;
@ -1962,24 +2004,32 @@ bool VulkanPipelineCache::EnsurePipelineCreated(
shader_stage_geometry.pName = "main";
shader_stage_geometry.pSpecializationInfo = nullptr;
}
  // Fragment shader.
  VkPipelineShaderStageCreateInfo& shader_stage_fragment =
      shader_stages[shader_stage_count++];
  shader_stage_fragment.sType =
      VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
  shader_stage_fragment.pNext = nullptr;
  shader_stage_fragment.flags = 0;
  shader_stage_fragment.stage = VK_SHADER_STAGE_FRAGMENT_BIT;
  shader_stage_fragment.module = VK_NULL_HANDLE;
  shader_stage_fragment.pName = "main";
  shader_stage_fragment.pSpecializationInfo = nullptr;
  if (creation_arguments.pixel_shader) {
    assert_true(creation_arguments.pixel_shader->is_translated());
    if (!creation_arguments.pixel_shader->is_valid()) {
      return false;
    }
    shader_stage_fragment.module =
        creation_arguments.pixel_shader->shader_module();
    assert_true(shader_stage_fragment.module != VK_NULL_HANDLE);
  } else {
    if (edram_fragment_shader_interlock) {
      shader_stage_fragment.module = depth_only_fragment_shader_;
    }
  }
  if (shader_stage_fragment.module == VK_NULL_HANDLE) {
    --shader_stage_count;
  }
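  // In short: the fragment stage uses the translated guest pixel shader when
  // one is present, falls back to the internal depth-only shader on the
  // fragment shader interlock path, and is dropped from the pipeline entirely
  // when no module is available.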
VkPipelineVertexInputStateCreateInfo vertex_input_state = {};
@ -2087,11 +2137,11 @@ bool VulkanPipelineCache::EnsurePipelineCreated(
// formula, though Z has no effect on anything if a depth attachment is not
// used (the guest shader can't access Z), enabling only when there's a
// depth / stencil attachment for correctness.
  rasterization_state.depthBiasEnable =
(!edram_fragment_shader_interlock &&
(description.render_pass_key.depth_and_color_used & 0b1))
? VK_TRUE
: VK_FALSE;
// TODO(Triang3l): Wide lines.
rasterization_state.lineWidth = 1.0f;
@ -2101,6 +2151,7 @@ bool VulkanPipelineCache::EnsurePipelineCreated(
VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO;
if (description.render_pass_key.msaa_samples == xenos::MsaaSamples::k2X &&
!render_target_cache_.IsMsaa2xSupported(
!edram_fragment_shader_interlock &&
description.render_pass_key.depth_and_color_used != 0)) {
// Using sample 0 as 0 and 3 as 1 for 2x instead (not exactly the same
// sample locations, but still top-left and bottom-right - however, this can
@ -2119,126 +2170,131 @@ bool VulkanPipelineCache::EnsurePipelineCreated(
depth_stencil_state.sType =
VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO;
depth_stencil_state.pNext = nullptr;
  if (!edram_fragment_shader_interlock) {
    if (description.depth_write_enable ||
        description.depth_compare_op != xenos::CompareFunction::kAlways) {
      depth_stencil_state.depthTestEnable = VK_TRUE;
      depth_stencil_state.depthWriteEnable =
          description.depth_write_enable ? VK_TRUE : VK_FALSE;
      depth_stencil_state.depthCompareOp =
          VkCompareOp(uint32_t(VK_COMPARE_OP_NEVER) +
                      uint32_t(description.depth_compare_op));
    }
    if (description.stencil_test_enable) {
      depth_stencil_state.stencilTestEnable = VK_TRUE;
      depth_stencil_state.front.failOp =
          VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) +
                      uint32_t(description.stencil_front_fail_op));
      depth_stencil_state.front.passOp =
          VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) +
                      uint32_t(description.stencil_front_pass_op));
      depth_stencil_state.front.depthFailOp =
          VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) +
                      uint32_t(description.stencil_front_depth_fail_op));
      depth_stencil_state.front.compareOp =
          VkCompareOp(uint32_t(VK_COMPARE_OP_NEVER) +
                      uint32_t(description.stencil_front_compare_op));
      depth_stencil_state.back.failOp =
          VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) +
                      uint32_t(description.stencil_back_fail_op));
      depth_stencil_state.back.passOp =
          VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) +
                      uint32_t(description.stencil_back_pass_op));
      depth_stencil_state.back.depthFailOp =
          VkStencilOp(uint32_t(VK_STENCIL_OP_KEEP) +
                      uint32_t(description.stencil_back_depth_fail_op));
      depth_stencil_state.back.compareOp =
          VkCompareOp(uint32_t(VK_COMPARE_OP_NEVER) +
                      uint32_t(description.stencil_back_compare_op));
    }
  }

  VkPipelineColorBlendStateCreateInfo color_blend_state = {};
  color_blend_state.sType =
      VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO;
  VkPipelineColorBlendAttachmentState
      color_blend_attachments[xenos::kMaxColorRenderTargets] = {};
  if (!edram_fragment_shader_interlock) {
    uint32_t color_rts_used =
        description.render_pass_key.depth_and_color_used >> 1;
    {
      static const VkBlendFactor kBlendFactorMap[] = {
          VK_BLEND_FACTOR_ZERO,
          VK_BLEND_FACTOR_ONE,
          VK_BLEND_FACTOR_SRC_COLOR,
          VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR,
          VK_BLEND_FACTOR_DST_COLOR,
          VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR,
          VK_BLEND_FACTOR_SRC_ALPHA,
          VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA,
          VK_BLEND_FACTOR_DST_ALPHA,
          VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA,
          VK_BLEND_FACTOR_CONSTANT_COLOR,
          VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR,
          VK_BLEND_FACTOR_CONSTANT_ALPHA,
          VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA,
          VK_BLEND_FACTOR_SRC_ALPHA_SATURATE,
      };
      // 8 entries for safety since 3 bits from the guest are passed directly.
      static const VkBlendOp kBlendOpMap[] = {VK_BLEND_OP_ADD,
                                              VK_BLEND_OP_SUBTRACT,
                                              VK_BLEND_OP_MIN,
                                              VK_BLEND_OP_MAX,
                                              VK_BLEND_OP_REVERSE_SUBTRACT,
                                              VK_BLEND_OP_ADD,
                                              VK_BLEND_OP_ADD,
                                              VK_BLEND_OP_ADD};
      uint32_t color_rts_remaining = color_rts_used;
      uint32_t color_rt_index;
      while (xe::bit_scan_forward(color_rts_remaining, &color_rt_index)) {
        color_rts_remaining &= ~(uint32_t(1) << color_rt_index);
        VkPipelineColorBlendAttachmentState& color_blend_attachment =
            color_blend_attachments[color_rt_index];
        const PipelineRenderTarget& color_rt =
            description.render_targets[color_rt_index];
        if (color_rt.src_color_blend_factor != PipelineBlendFactor::kOne ||
            color_rt.dst_color_blend_factor != PipelineBlendFactor::kZero ||
            color_rt.color_blend_op != xenos::BlendOp::kAdd ||
            color_rt.src_alpha_blend_factor != PipelineBlendFactor::kOne ||
            color_rt.dst_alpha_blend_factor != PipelineBlendFactor::kZero ||
            color_rt.alpha_blend_op != xenos::BlendOp::kAdd) {
          color_blend_attachment.blendEnable = VK_TRUE;
          color_blend_attachment.srcColorBlendFactor =
              kBlendFactorMap[uint32_t(color_rt.src_color_blend_factor)];
          color_blend_attachment.dstColorBlendFactor =
              kBlendFactorMap[uint32_t(color_rt.dst_color_blend_factor)];
          color_blend_attachment.colorBlendOp =
              kBlendOpMap[uint32_t(color_rt.color_blend_op)];
          color_blend_attachment.srcAlphaBlendFactor =
              kBlendFactorMap[uint32_t(color_rt.src_alpha_blend_factor)];
          color_blend_attachment.dstAlphaBlendFactor =
              kBlendFactorMap[uint32_t(color_rt.dst_alpha_blend_factor)];
          color_blend_attachment.alphaBlendOp =
              kBlendOpMap[uint32_t(color_rt.alpha_blend_op)];
        }
        color_blend_attachment.colorWriteMask =
            VkColorComponentFlags(color_rt.color_write_mask);
        if (!device_features.independentBlend) {
          // For non-independent blend, the pAttachments element for the first
          // actually used color will be replicated into all.
          break;
        }
      }
    }
    color_blend_state.attachmentCount = 32 - xe::lzcnt(color_rts_used);
    color_blend_state.pAttachments = color_blend_attachments;
    if (color_rts_used && !device_features.independentBlend) {
      // "If the independent blending feature is not enabled, all elements of
      // pAttachments must be identical."
      uint32_t first_color_rt_index;
      xe::bit_scan_forward(color_rts_used, &first_color_rt_index);
      for (uint32_t i = 0; i < color_blend_state.attachmentCount; ++i) {
        if (i == first_color_rt_index) {
          continue;
        }
        color_blend_attachments[i] =
            color_blend_attachments[first_color_rt_index];
      }
    }
  }
@ -2255,16 +2311,18 @@ bool VulkanPipelineCache::EnsurePipelineCreated(
// invalidated (again, even if it has no effect).
dynamic_states[dynamic_state.dynamicStateCount++] = VK_DYNAMIC_STATE_VIEWPORT;
dynamic_states[dynamic_state.dynamicStateCount++] = VK_DYNAMIC_STATE_SCISSOR;
if (!edram_fragment_shader_interlock) {
dynamic_states[dynamic_state.dynamicStateCount++] =
VK_DYNAMIC_STATE_DEPTH_BIAS;
dynamic_states[dynamic_state.dynamicStateCount++] =
VK_DYNAMIC_STATE_BLEND_CONSTANTS;
dynamic_states[dynamic_state.dynamicStateCount++] =
VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK;
dynamic_states[dynamic_state.dynamicStateCount++] =
VK_DYNAMIC_STATE_STENCIL_WRITE_MASK;
dynamic_states[dynamic_state.dynamicStateCount++] =
VK_DYNAMIC_STATE_STENCIL_REFERENCE;
}
VkGraphicsPipelineCreateInfo pipeline_create_info;
pipeline_create_info.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO;

View File

@ -314,6 +314,10 @@ class VulkanPipelineCache {
GeometryShaderKey::Hasher>
geometry_shaders_;
// Empty depth-only pixel shader for writing to the depth buffer using
// fragment shader interlock when no Xenos pixel shader is provided.
VkShaderModule depth_only_fragment_shader_ = VK_NULL_HANDLE;
std::unordered_map<PipelineDescription, Pipeline, PipelineDescription::Hasher>
pipelines_;

File diff suppressed because it is too large

View File

@ -43,6 +43,10 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
// true 4x MSAA passes (framebuffers because render target cache render
// targets are different for 2x and 4x guest MSAA, pipelines because the
// sample mask will have 2 samples excluded for 2x-as-4x).
// This has effect only on the attachments, but even in cases when there
// are no attachments, it can be used to pass the sample count between
// subsystems, for instance, to specify the desired number of samples to
// use when there are no attachments in pipelines.
xenos::MsaaSamples msaa_samples : xenos::kMsaaSamplesBits; // 2
// << 0 is depth, << 1...4 is color.
uint32_t depth_and_color_used : 1 + xenos::kMaxColorRenderTargets; // 7
@ -81,8 +85,9 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
static_assert_size(RenderPassKey, sizeof(uint32_t));
struct Framebuffer {
VkFramebuffer framebuffer = VK_NULL_HANDLE;
VkExtent2D host_extent{};
Framebuffer() = default;
Framebuffer(VkFramebuffer framebuffer, const VkExtent2D& host_extent)
: framebuffer(framebuffer), host_extent(host_extent) {}
};
@ -96,15 +101,16 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
// Transient descriptor set layouts must be initialized in the command
// processor.
bool Initialize(uint32_t shared_memory_binding_count);
void Shutdown(bool from_destructor = false);
void ClearCache() override;
void CompletedSubmissionUpdated();
void EndSubmission();
Path GetPath() const override { return path_; }
VkBuffer edram_buffer() const { return edram_buffer_; }
// Performs the resolve to a shared memory area according to the current
// register values, and also clears the render targets if needed. Must be in a
@ -161,7 +167,11 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
// Returns the render pass object, or VK_NULL_HANDLE if failed to create.
// A render pass managed by the render target cache may be ended and resumed
// at any time (to allow for things like copying and texture loading).
VkRenderPass GetHostRenderTargetsRenderPass(RenderPassKey key);
VkRenderPass GetFragmentShaderInterlockRenderPass() const {
assert_true(GetPath() == Path::kPixelShaderInterlock);
return fsi_render_pass_;
}
VkFormat GetDepthVulkanFormat(xenos::DepthRenderTargetFormat format) const;
VkFormat GetColorVulkanFormat(xenos::ColorRenderTargetFormat format) const;
@ -178,6 +188,8 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
bool IsHostDepthEncodingDifferent(
xenos::DepthRenderTargetFormat format) const override;
void RequestPixelShaderInterlockBarrier() override;
private:
enum class EdramBufferUsage {
// There's no need for combined fragment and compute usages.
@ -251,6 +263,8 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
VulkanCommandProcessor& command_processor_;
TraceWriter& trace_writer_;
Path path_ = Path::kHostRenderTargets;
// Accessible in fragment and compute shaders.
VkDescriptorSetLayout descriptor_set_layout_storage_buffer_ = VK_NULL_HANDLE;
VkDescriptorSetLayout descriptor_set_layout_sampled_image_ = VK_NULL_HANDLE;
@ -276,9 +290,18 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
std::array<VkPipeline, size_t(draw_util::ResolveCopyShaderIndex::kCount)>
resolve_copy_pipelines_{};
// On the fragment shader interlock path, the render pass key is used purely
// for passing parameters to pipeline setup - there's always only one render
// pass.
RenderPassKey last_update_render_pass_key_;
VkRenderPass last_update_render_pass_ = VK_NULL_HANDLE;
// The pitch is not used on the fragment shader interlock path.
uint32_t last_update_framebuffer_pitch_tiles_at_32bpp_ = 0;
// The attachments are not used on the fragment shader interlock path.
const RenderTarget* const*
last_update_framebuffer_attachments_[1 + xenos::kMaxColorRenderTargets] =
{};
const Framebuffer* last_update_framebuffer_ = VK_NULL_HANDLE;
// For host render targets.
@ -809,7 +832,7 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
};
// Returns the framebuffer object, or VK_NULL_HANDLE if failed to create.
const Framebuffer* GetHostRenderTargetsFramebuffer(
RenderPassKey render_pass_key, uint32_t pitch_tiles_at_32bpp,
const RenderTarget* const* depth_and_color_render_targets);
@ -845,17 +868,13 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
bool msaa_2x_attachments_supported_ = false;
bool msaa_2x_no_attachments_supported_ = false;
// VK_NULL_HANDLE if failed to create.
std::unordered_map<RenderPassKey, VkRenderPass, RenderPassKey::Hasher>
render_passes_;
std::unordered_map<FramebufferKey, Framebuffer, FramebufferKey::Hasher>
framebuffers_;
// Set 0 - EDRAM storage buffer, set 1 - source depth sampled image (and
// unused stencil from the transfer descriptor set), HostDepthStoreConstants
// passed via push constants.
@ -895,6 +914,15 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
// Temporary storage for DumpRenderTargets.
std::vector<ResolveCopyDumpRectangle> dump_rectangles_;
std::vector<DumpInvocation> dump_invocations_;
// For pixel (fragment) shader interlock.
VkRenderPass fsi_render_pass_ = VK_NULL_HANDLE;
Framebuffer fsi_framebuffer_;
VkPipelineLayout resolve_fsi_clear_pipeline_layout_ = VK_NULL_HANDLE;
VkPipeline resolve_fsi_clear_32bpp_pipeline_ = VK_NULL_HANDLE;
VkPipeline resolve_fsi_clear_64bpp_pipeline_ = VK_NULL_HANDLE;
};
} // namespace vulkan

View File

@ -1,5 +1,7 @@
// VK_KHR_get_physical_device_properties2 functions used in Xenia.
// Promoted to Vulkan 1.1 core.
XE_UI_VULKAN_FUNCTION_PROMOTED(vkGetPhysicalDeviceFeatures2KHR,
vkGetPhysicalDeviceFeatures2)
XE_UI_VULKAN_FUNCTION_PROMOTED(vkGetPhysicalDeviceMemoryProperties2KHR,
vkGetPhysicalDeviceMemoryProperties2)
XE_UI_VULKAN_FUNCTION_PROMOTED(vkGetPhysicalDeviceProperties2KHR,

View File

@ -696,6 +696,7 @@ bool VulkanProvider::Initialize() {
device_extensions_.khr_shader_float_controls = true;
device_extensions_.khr_spirv_1_4 = true;
if (device_properties_.apiVersion >= VK_MAKE_API_VERSION(0, 1, 3, 0)) {
device_extensions_.ext_shader_demote_to_helper_invocation = true;
device_extensions_.khr_maintenance4 = true;
}
}
@ -709,6 +710,8 @@ bool VulkanProvider::Initialize() {
{"VK_EXT_fragment_shader_interlock",
offsetof(DeviceExtensions, ext_fragment_shader_interlock)},
{"VK_EXT_memory_budget", offsetof(DeviceExtensions, ext_memory_budget)},
{"VK_EXT_shader_demote_to_helper_invocation",
offsetof(DeviceExtensions, ext_shader_demote_to_helper_invocation)},
{"VK_EXT_shader_stencil_export",
offsetof(DeviceExtensions, ext_shader_stencil_export)},
{"VK_KHR_bind_memory2", offsetof(DeviceExtensions, khr_bind_memory2)},
@ -816,6 +819,16 @@ bool VulkanProvider::Initialize() {
// Get additional device properties.
std::memset(&device_float_controls_properties_, 0,
sizeof(device_float_controls_properties_));
device_float_controls_properties_.sType =
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR;
std::memset(&device_fragment_shader_interlock_features_, 0,
sizeof(device_fragment_shader_interlock_features_));
device_fragment_shader_interlock_features_.sType =
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADER_INTERLOCK_FEATURES_EXT;
std::memset(&device_shader_demote_to_helper_invocation_features_, 0,
sizeof(device_shader_demote_to_helper_invocation_features_));
device_shader_demote_to_helper_invocation_features_.sType =
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DEMOTE_TO_HELPER_INVOCATION_FEATURES_EXT;
if (instance_extensions_.khr_get_physical_device_properties2) {
VkPhysicalDeviceProperties2KHR device_properties_2;
device_properties_2.sType =
@ -824,8 +837,6 @@ bool VulkanProvider::Initialize() {
VkPhysicalDeviceProperties2KHR* device_properties_2_last =
&device_properties_2;
if (device_extensions_.khr_shader_float_controls) {
device_float_controls_properties_.sType =
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR;
device_float_controls_properties_.pNext = nullptr;
device_properties_2_last->pNext = &device_float_controls_properties_;
device_properties_2_last =
@ -836,6 +847,28 @@ bool VulkanProvider::Initialize() {
ifn_.vkGetPhysicalDeviceProperties2KHR(physical_device_,
&device_properties_2);
}
VkPhysicalDeviceFeatures2KHR device_features_2;
device_features_2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR;
device_features_2.pNext = nullptr;
VkPhysicalDeviceFeatures2KHR* device_features_2_last = &device_features_2;
if (device_extensions_.ext_fragment_shader_interlock) {
device_fragment_shader_interlock_features_.pNext = nullptr;
device_features_2_last->pNext =
&device_fragment_shader_interlock_features_;
device_features_2_last = reinterpret_cast<VkPhysicalDeviceFeatures2KHR*>(
&device_fragment_shader_interlock_features_);
}
if (device_extensions_.ext_shader_demote_to_helper_invocation) {
device_shader_demote_to_helper_invocation_features_.pNext = nullptr;
device_features_2_last->pNext =
&device_shader_demote_to_helper_invocation_features_;
device_features_2_last = reinterpret_cast<VkPhysicalDeviceFeatures2KHR*>(
&device_shader_demote_to_helper_invocation_features_);
}
if (device_features_2_last != &device_features_2) {
ifn_.vkGetPhysicalDeviceFeatures2KHR(physical_device_,
&device_features_2);
}
}
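The chain built above is the standard VkPhysicalDeviceFeatures2 pNext pattern: each extension's feature struct is linked into the query so a single vkGetPhysicalDeviceFeatures2KHR call fills them all. Reduced to one extension, the pattern looks like this (a sketch assuming Vulkan 1.1, where the non-KHR entry point is available, and a valid VkPhysicalDevice physical_device):

VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT interlock_features = {};
interlock_features.sType =
    VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADER_INTERLOCK_FEATURES_EXT;
VkPhysicalDeviceFeatures2 features2 = {};
features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
features2.pNext = &interlock_features;
vkGetPhysicalDeviceFeatures2(physical_device, &features2);
// The extension exposes sample, pixel, and shading-rate interlock
// independently; the interlock RB needs at least one of the first two.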
// Create the device.
@ -888,6 +921,21 @@ bool VulkanProvider::Initialize() {
device_create_info_last = reinterpret_cast<VkDeviceCreateInfo*>(
&device_portability_subset_features_);
}
if (device_extensions_.ext_fragment_shader_interlock) {
// TODO(Triang3l): Enable only needed fragment shader interlock features.
device_fragment_shader_interlock_features_.pNext = nullptr;
device_create_info_last->pNext =
&device_fragment_shader_interlock_features_;
device_create_info_last = reinterpret_cast<VkDeviceCreateInfo*>(
&device_fragment_shader_interlock_features_);
}
if (device_extensions_.ext_shader_demote_to_helper_invocation) {
device_shader_demote_to_helper_invocation_features_.pNext = nullptr;
device_create_info_last->pNext =
&device_shader_demote_to_helper_invocation_features_;
device_create_info_last = reinterpret_cast<VkDeviceCreateInfo*>(
&device_shader_demote_to_helper_invocation_features_);
}
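Chaining the queried feature structs directly into VkDeviceCreateInfo enables every supported feature of each extension, which is what the TODO above flags. A hedged sketch of what narrowing the request before the vkCreateDevice call below could look like (enabled_interlock is a hypothetical local, not the current code):

VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT enabled_interlock =
    device_fragment_shader_interlock_features_;
// The RB only ever uses sample or pixel interlock, so the shading-rate
// variant could be left disabled when requesting device features.
enabled_interlock.fragmentShaderShadingRateInterlock = VK_FALSE;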
if (ifn_.vkCreateDevice(physical_device_, &device_create_info, nullptr,
&device_) != VK_SUCCESS) {
XELOGE("Failed to create a Vulkan device");
@ -995,8 +1043,30 @@ bool VulkanProvider::Initialize() {
XELOGVK("Vulkan device extensions:");
XELOGVK("* VK_EXT_fragment_shader_interlock: {}",
device_extensions_.ext_fragment_shader_interlock ? "yes" : "no");
if (device_extensions_.ext_fragment_shader_interlock) {
XELOGVK(
" * Sample interlock: {}",
device_fragment_shader_interlock_features_.fragmentShaderSampleInterlock
? "yes"
: "no");
XELOGVK(
" * Pixel interlock: {}",
device_fragment_shader_interlock_features_.fragmentShaderPixelInterlock
? "yes"
: "no");
}
XELOGVK("* VK_EXT_memory_budget: {}",
device_extensions_.ext_memory_budget ? "yes" : "no");
XELOGVK(
"* VK_EXT_shader_demote_to_helper_invocation: {}",
device_extensions_.ext_shader_demote_to_helper_invocation ? "yes" : "no");
if (device_extensions_.ext_shader_demote_to_helper_invocation) {
XELOGVK(" * Demote to helper invocation: {}",
device_shader_demote_to_helper_invocation_features_
.shaderDemoteToHelperInvocation
? "yes"
: "no");
}
XELOGVK("* VK_EXT_shader_stencil_export: {}",
device_extensions_.ext_shader_stencil_export ? "yes" : "no");
XELOGVK("* VK_KHR_bind_memory2: {}",

View File

@ -133,6 +133,8 @@ class VulkanProvider : public GraphicsProvider {
struct DeviceExtensions {
bool ext_fragment_shader_interlock;
bool ext_memory_budget;
// Core since 1.3.0.
bool ext_shader_demote_to_helper_invocation;
bool ext_shader_stencil_export;
// Core since 1.1.0.
bool khr_bind_memory2;
@ -198,6 +200,14 @@ class VulkanProvider : public GraphicsProvider {
device_float_controls_properties() const {
return device_float_controls_properties_;
}
const VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT&
device_fragment_shader_interlock_features() const {
return device_fragment_shader_interlock_features_;
}
const VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT&
device_shader_demote_to_helper_invocation_features() const {
return device_shader_demote_to_helper_invocation_features_;
}
struct Queue {
VkQueue queue = VK_NULL_HANDLE;
@ -320,6 +330,10 @@ class VulkanProvider : public GraphicsProvider {
uint32_t queue_family_graphics_compute_;
uint32_t queue_family_sparse_binding_;
VkPhysicalDeviceFloatControlsPropertiesKHR device_float_controls_properties_;
VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT
device_fragment_shader_interlock_features_;
VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT
device_shader_demote_to_helper_invocation_features_;
VkDevice device_ = VK_NULL_HANDLE;
DeviceFunctions dfn_ = {};
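Consumers reach these through the accessors; for instance, the render target cache could gate the fragment shader interlock RB roughly as follows (hypothetical selection logic, assuming a provider reference and a device_extensions() accessor following the same pattern as the getters above):

const VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT& interlock =
    provider.device_fragment_shader_interlock_features();
// Prefer the finer-grained sample interlock; fall back to pixel interlock.
bool fsi_available =
    provider.device_extensions().ext_fragment_shader_interlock &&
    (interlock.fragmentShaderSampleInterlock ||
     interlock.fragmentShaderPixelInterlock);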

View File

@ -191,9 +191,10 @@
this.translationComboBox.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList;
this.translationComboBox.FormattingEnabled = true;
this.translationComboBox.Items.AddRange(new object[] {
"DXBC (RTV/DSV RB)",
"DXBC (ROV RB)",
"SPIR-V"});
"DXBC (render target RB)",
"DXBC (rasterizer-ordered view RB)",
"SPIR-V (framebuffer RB)",
"SPIR-V (fragment shader interlock RB)"});
this.translationComboBox.Location = new System.Drawing.Point(1224, 0);
this.translationComboBox.Margin = new System.Windows.Forms.Padding(3, 0, 3, 0);
this.translationComboBox.Name = "translationComboBox";

View File

@ -235,6 +235,7 @@ namespace shader_playground {
outputType = "dxbctext";
break;
case 2:
case 3:
outputType = "spirvtext";
break;
}
@ -269,8 +270,9 @@ namespace shader_playground {
"--vertex_shader_output_type=" + vertexShaderType,
"--dxbc_source_map=true",
};
if (translationComboBox.SelectedIndex == 1) {
startArguments.Add("--shader_output_dxbc_rov=true");
if (translationComboBox.SelectedIndex == 1 ||
translationComboBox.SelectedIndex == 3) {
startArguments.Add("--shader_output_pixel_shader_interlock=true");
}
startInfo = new ProcessStartInfo(compilerPath_);
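For reference, with "SPIR-V (fragment shader interlock RB)" selected (index 3), the playground therefore launches the compiler with arguments along these lines; only the last three flags are visible in this diff, and the input flag name and the spirvtext plumbing via outputType are assumptions:

xenia-gpu-shader-compiler
    --shader_input=<dumped shader>                 (assumed flag name)
    --shader_output_type=spirvtext                 (assumed; from outputType)
    --vertex_shader_output_type=<vertexShaderType>
    --dxbc_source_map=true
    --shader_output_pixel_shader_interlock=true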