From 0acb97d3839771a259063a1e9d387bbfee1d20c0 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sun, 3 Apr 2022 16:40:29 +0300 Subject: [PATCH] [Vulkan] EDRAM range ownership transfers, resolve clears, 2x-as-4x MSAA Transfers are functional on a D3D12-like level, but need additional work so fallbacks are used when multisampled integer sampled images are not supported, and to eliminate transfers between render targets within Vulkan format compatibility classes by using different views directly. --- src/xenia/gpu/render_target_cache.h | 12 +- src/xenia/gpu/spirv_shader_translator.cc | 14 +- src/xenia/gpu/spirv_shader_translator.h | 36 + src/xenia/gpu/spirv_shader_translator_rb.cc | 425 ++ .../gpu/vulkan/deferred_command_buffer.cc | 45 + .../gpu/vulkan/deferred_command_buffer.h | 111 + src/xenia/gpu/vulkan/premake5.lua | 1 + .../gpu/vulkan/vulkan_command_processor.cc | 129 +- .../gpu/vulkan/vulkan_command_processor.h | 16 +- src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc | 20 +- .../gpu/vulkan/vulkan_render_target_cache.cc | 4242 ++++++++++++++++- .../gpu/vulkan/vulkan_render_target_cache.h | 482 +- src/xenia/gpu/vulkan/vulkan_shared_memory.cc | 17 +- src/xenia/gpu/vulkan/vulkan_shared_memory.h | 13 +- src/xenia/gpu/xenos.h | 1 + src/xenia/ui/vulkan/functions/device_1_0.inc | 2 + .../single_layout_descriptor_set_pool.cc | 120 + .../single_layout_descriptor_set_pool.h | 63 + src/xenia/ui/vulkan/vulkan_provider.cc | 4 + src/xenia/ui/vulkan/vulkan_provider.h | 1 + src/xenia/ui/vulkan/vulkan_util.cc | 47 + src/xenia/ui/vulkan/vulkan_util.h | 11 + 22 files changed, 5668 insertions(+), 144 deletions(-) create mode 100644 src/xenia/gpu/spirv_shader_translator_rb.cc create mode 100644 src/xenia/ui/vulkan/single_layout_descriptor_set_pool.cc create mode 100644 src/xenia/ui/vulkan/single_layout_descriptor_set_pool.h diff --git a/src/xenia/gpu/render_target_cache.h b/src/xenia/gpu/render_target_cache.h index f0e59fb5f..2bac528bd 100644 --- a/src/xenia/gpu/render_target_cache.h +++ b/src/xenia/gpu/render_target_cache.h @@ -302,6 +302,10 @@ class RenderTargetCache { } return xenos::IsColorRenderTargetFormat64bpp(GetColorFormat()); } + const char* GetFormatName() const { + return is_depth ? xenos::GetDepthRenderTargetFormatName(GetDepthFormat()) + : xenos::GetColorRenderTargetFormatName(GetColorFormat()); + } uint32_t GetPitchTiles() const { return pitch_tiles_at_32bpp << uint32_t(Is64bpp()); @@ -317,11 +321,9 @@ class RenderTargetCache { } std::string GetDebugName() const { - return fmt::format( - "RT @ {}t, <{}t>, {}xMSAA, {}", base_tiles, GetPitchTiles(), - uint32_t(1) << uint32_t(msaa_samples), - is_depth ? xenos::GetDepthRenderTargetFormatName(GetDepthFormat()) - : xenos::GetColorRenderTargetFormatName(GetColorFormat())); + return fmt::format("RT @ {}t, <{}t>, {}xMSAA, {}", base_tiles, + GetPitchTiles(), uint32_t(1) << uint32_t(msaa_samples), + GetFormatName()); } }; diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index ce940da49..bcd140445 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -113,11 +113,9 @@ uint32_t SpirvShaderTranslator::GetModificationRegisterCount() const { } void SpirvShaderTranslator::StartTranslation() { - // Tool ID 26 "Xenia Emulator Microcode Translator". - // https://github.com/KhronosGroup/SPIRV-Headers/blob/c43a43c7cc3af55910b9bec2a71e3e8a622443cf/include/spirv/spir-v.xml#L79 // TODO(Triang3l): Logger. 
- builder_ = std::make_unique(features_.spirv_version, - (26 << 16) | 1, nullptr); + builder_ = std::make_unique( + features_.spirv_version, (kSpirvMagicToolId << 16) | 1, nullptr); builder_->addCapability(IsSpirvTessEvalShader() ? spv::CapabilityTessellation : spv::CapabilityShader); @@ -1535,20 +1533,20 @@ spv::Id SpirvShaderTranslator::GetUnmodifiedOperandComponents( static_cast(original_operand.GetComponent(scalar_index)) - static_cast(SwizzleSource::kX)); } - id_vector_temp_util_.clear(); - id_vector_temp_util_.reserve(component_count); + uint_vector_temp_util_.clear(); + uint_vector_temp_util_.reserve(component_count); uint32_t components_remaining = components; uint32_t component_index; while (xe::bit_scan_forward(components_remaining, &component_index)) { components_remaining &= ~(uint32_t(1) << component_index); - id_vector_temp_util_.push_back( + uint_vector_temp_util_.push_back( static_cast( original_operand.GetComponent(component_index)) - static_cast(SwizzleSource::kX)); } return builder_->createRvalueSwizzle(spv::NoPrecision, type_float_vectors_[component_count - 1], - operand_storage, id_vector_temp_util_); + operand_storage, uint_vector_temp_util_); } void SpirvShaderTranslator::GetOperandScalarXY( diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 932bd608f..beb478bb6 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -138,6 +138,10 @@ class SpirvShaderTranslator : public ShaderTranslator { kDescriptorSetCount, }; + // "Xenia Emulator Microcode Translator". + // https://github.com/KhronosGroup/SPIRV-Headers/blob/c43a43c7cc3af55910b9bec2a71e3e8a622443cf/include/spirv/spir-v.xml#L79 + static constexpr uint32_t kSpirvMagicToolId = 26; + struct Features { explicit Features(const ui::vulkan::VulkanProvider& provider); explicit Features(bool all = false); @@ -172,6 +176,38 @@ class SpirvShaderTranslator : public ShaderTranslator { features_.max_storage_buffer_range); } + // Common functions useful not only for the translator, but also for EDRAM + // emulation via conventional render targets. + + // Converts the color value externally clamped to [0, 31.875] to 7e3 floating + // point, with zeros in bits 10:31, rounding to the nearest even. + static spv::Id PreClampedFloat32To7e3(spv::Builder& builder, + spv::Id f32_scalar, + spv::Id ext_inst_glsl_std_450); + // Same as PreClampedFloat32To7e3, but clamps the input to [0, 31.875]. + static spv::Id UnclampedFloat32To7e3(spv::Builder& builder, + spv::Id f32_scalar, + spv::Id ext_inst_glsl_std_450); + // Converts the 7e3 number in bits [f10_shift, f10_shift + 10) to a 32-bit + // float. + static spv::Id Float7e3To32(spv::Builder& builder, spv::Id f10_uint_scalar, + uint32_t f10_shift, bool result_as_uint, + spv::Id ext_inst_glsl_std_450); + // Converts the depth value externally clamped to the representable [0, 2) + // range to 20e4 floating point, with zeros in bits 24:31, rounding to the + // nearest even. If remap_from_0_to_0_5 is true, it's assumed that 0...1 is + // pre-remapped to 0...0.5 in the input. + static spv::Id PreClampedDepthTo20e4(spv::Builder& builder, + spv::Id f32_scalar, + bool remap_from_0_to_0_5, + spv::Id ext_inst_glsl_std_450); + // Converts the 20e4 number in bits [f24_shift, f24_shift + 10) to a 32-bit + // float. 
+ static spv::Id Depth20e4To32(spv::Builder& builder, spv::Id f24_uint_scalar, + uint32_t f24_shift, bool remap_to_0_to_0_5, + bool result_as_uint, + spv::Id ext_inst_glsl_std_450); + protected: void Reset() override; diff --git a/src/xenia/gpu/spirv_shader_translator_rb.cc b/src/xenia/gpu/spirv_shader_translator_rb.cc new file mode 100644 index 000000000..4cb260bdd --- /dev/null +++ b/src/xenia/gpu/spirv_shader_translator_rb.cc @@ -0,0 +1,425 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2022 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/spirv_shader_translator.h" + +#include +#include + +#include "third_party/glslang/SPIRV/GLSL.std.450.h" +#include "xenia/base/assert.h" + +namespace xe { +namespace gpu { + +spv::Id SpirvShaderTranslator::PreClampedFloat32To7e3( + spv::Builder& builder, spv::Id f32_scalar, spv::Id ext_inst_glsl_std_450) { + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + // Assuming the value is already clamped to [0, 31.875]. + + spv::Id type_uint = builder.makeUintType(32); + + // Need the source as uint for bit operations. + { + spv::Id source_type = builder.getTypeId(f32_scalar); + assert_true(builder.isScalarType(source_type)); + if (!builder.isUintType(source_type)) { + f32_scalar = builder.createUnaryOp(spv::OpBitcast, type_uint, f32_scalar); + } + } + + // The denormal 7e3 case. + // denormal_biased_f32 = (f32 & 0x7FFFFF) | 0x800000 + spv::Id denormal_biased_f32; + { + spv::Instruction* denormal_insert_instruction = new spv::Instruction( + builder.getUniqueId(), type_uint, spv::OpBitFieldInsert); + denormal_insert_instruction->addIdOperand(f32_scalar); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(1)); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(23)); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(9)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_insert_instruction)); + denormal_biased_f32 = denormal_insert_instruction->getResultId(); + } + // denormal_biased_f32_shift_amount = min(125 - (f32 >> 23), 24) + // Not allowing the shift to overflow as that's undefined in SPIR-V. 
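  // Editor's illustrative note, not part of the original commit: the SPIR-V
  // emitted by this function follows the scalar packing sketched below
  // (assuming the 7e3 layout of 3 exponent bits biased by 3 and 7 mantissa
  // bits). For example, 1.0f (0x3F800000) takes the normal path and packs to
  // 0x180, and the threshold 0x3E800000 is 0.25f, the smallest normalized 7e3
  // value:
  //   denormal = ((f32 & 0x7FFFFF) | 0x800000) >> min(125 - (f32 >> 23), 24)
  //   normal = f32 - (124 << 23)
  //   biased = (f32 < 0x3E800000) ? denormal : normal
  //   f7e3 = ((biased + 0x7FFF + ((biased >> 16) & 1)) >> 16) & 0x3FF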
+ spv::Id denormal_biased_f32_shift_amount; + { + spv::Instruction* denormal_shift_amount_instruction = + new spv::Instruction(builder.getUniqueId(), type_uint, spv::OpExtInst); + denormal_shift_amount_instruction->addIdOperand(ext_inst_glsl_std_450); + denormal_shift_amount_instruction->addImmediateOperand(GLSLstd450UMin); + denormal_shift_amount_instruction->addIdOperand(builder.createBinOp( + spv::OpISub, type_uint, builder.makeUintConstant(125), + builder.createBinOp(spv::OpShiftRightLogical, type_uint, f32_scalar, + builder.makeUintConstant(23)))); + denormal_shift_amount_instruction->addIdOperand( + builder.makeUintConstant(24)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_shift_amount_instruction)); + denormal_biased_f32_shift_amount = + denormal_shift_amount_instruction->getResultId(); + } + // denormal_biased_f32 = + // ((f32 & 0x7FFFFF) | 0x800000) >> min(125 - (f32 >> 23), 24) + denormal_biased_f32 = builder.createBinOp(spv::OpShiftRightLogical, type_uint, + denormal_biased_f32, + denormal_biased_f32_shift_amount); + + // The normal 7e3 case. + // Bias the exponent. + // normal_biased_f32 = f32 - (124 << 23) + spv::Id normal_biased_f32 = + builder.createBinOp(spv::OpISub, type_uint, f32_scalar, + builder.makeUintConstant(UINT32_C(124) << 23)); + + // Select the needed conversion depending on whether the number is too small + // to be represented as normalized 7e3. + spv::Id biased_f32 = builder.createTriOp( + spv::OpSelect, type_uint, + builder.createBinOp(spv::OpULessThan, builder.makeBoolType(), f32_scalar, + builder.makeUintConstant(0x3E800000)), + denormal_biased_f32, normal_biased_f32); + + // Build the 7e3 number rounding to the nearest even. + // ((biased_f32 + 0x7FFF + ((biased_f32 >> 16) & 1)) >> 16) & 0x3FF + return builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp(spv::OpIAdd, type_uint, biased_f32, + builder.makeUintConstant(0x7FFF)), + builder.createTriOp(spv::OpBitFieldUExtract, type_uint, biased_f32, + builder.makeUintConstant(16), + builder.makeUintConstant(1))), + builder.makeUintConstant(16), builder.makeUintConstant(10)); +} + +spv::Id SpirvShaderTranslator::UnclampedFloat32To7e3( + spv::Builder& builder, spv::Id f32_scalar, spv::Id ext_inst_glsl_std_450) { + spv::Id type_float = builder.makeFloatType(32); + + // Need the source as float for clamping. 
+ { + spv::Id source_type = builder.getTypeId(f32_scalar); + assert_true(builder.isScalarType(source_type)); + if (!builder.isFloatType(source_type)) { + f32_scalar = + builder.createUnaryOp(spv::OpBitcast, type_float, f32_scalar); + } + } + + { + spv::Instruction* clamp_instruction = + new spv::Instruction(builder.getUniqueId(), type_float, spv::OpExtInst); + clamp_instruction->addIdOperand(ext_inst_glsl_std_450); + clamp_instruction->addImmediateOperand(GLSLstd450NClamp); + clamp_instruction->addIdOperand(f32_scalar); + clamp_instruction->addIdOperand(builder.makeFloatConstant(0.0f)); + clamp_instruction->addIdOperand(builder.makeFloatConstant(31.875f)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(clamp_instruction)); + f32_scalar = clamp_instruction->getResultId(); + } + + return PreClampedFloat32To7e3(builder, f32_scalar, ext_inst_glsl_std_450); +} + +spv::Id SpirvShaderTranslator::Float7e3To32(spv::Builder& builder, + spv::Id f10_uint_scalar, + uint32_t f10_shift, + bool result_as_uint, + spv::Id ext_inst_glsl_std_450) { + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + + assert_true(builder.isUintType(builder.getTypeId(f10_uint_scalar))); + assert_true(f10_shift <= (32 - 10)); + + spv::Id type_bool = builder.makeBoolType(); + spv::Id type_int = builder.makeIntType(32); + spv::Id type_uint = builder.makeUintType(32); + + spv::Id f10_unbiased_exponent = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, f10_uint_scalar, + builder.makeUintConstant(f10_shift + 7), builder.makeUintConstant(3)); + spv::Id f10_mantissa = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, f10_uint_scalar, + builder.makeUintConstant(f10_shift), builder.makeUintConstant(7)); + + // The denormal nonzero 7e3 case. + // denormal_mantissa_msb = findMSB(f10_mantissa) + spv::Id denormal_mantissa_msb; + { + spv::Instruction* denormal_mantissa_msb_instruction = + new spv::Instruction(builder.getUniqueId(), type_int, spv::OpExtInst); + denormal_mantissa_msb_instruction->addIdOperand(ext_inst_glsl_std_450); + denormal_mantissa_msb_instruction->addImmediateOperand(GLSLstd450FindUMsb); + denormal_mantissa_msb_instruction->addIdOperand(f10_mantissa); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_mantissa_msb_instruction)); + denormal_mantissa_msb = denormal_mantissa_msb_instruction->getResultId(); + } + denormal_mantissa_msb = + builder.createUnaryOp(spv::OpBitcast, type_uint, denormal_mantissa_msb); + // denormal_f32_unbiased_exponent = 1 - (7 - findMSB(f10_mantissa)) + // Or: + // denormal_f32_unbiased_exponent = findMSB(f10_mantissa) - 6 + spv::Id denormal_f32_unbiased_exponent = + builder.createBinOp(spv::OpISub, type_uint, denormal_mantissa_msb, + builder.makeUintConstant(6)); + // Normalize the mantissa. + // denormal_f32_mantissa = f10_mantissa << (7 - findMSB(f10_mantissa)) + spv::Id denormal_f32_mantissa = builder.createBinOp( + spv::OpShiftLeftLogical, type_uint, f10_mantissa, + builder.createBinOp(spv::OpISub, type_uint, builder.makeUintConstant(7), + denormal_mantissa_msb)); + // If the 7e3 number is zero, make sure the float32 number is zero too. + spv::Id f10_mantissa_is_nonzero = builder.createBinOp( + spv::OpINotEqual, type_bool, f10_mantissa, builder.makeUintConstant(0)); + // Set the unbiased exponent to -124 for zero - 124 will be added later, + // resulting in zero float32. 
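  // Editor's illustrative note, not part of the original commit: a worked
  // denormal example, assuming the 7e3 layout of 3 exponent bits biased by 3
  // and 7 mantissa bits. For f10 == 0x001 (exponent 0, mantissa 1),
  // findMSB(1) == 0, so the unbiased exponent is 0 - 6 == -6 and the mantissa
  // is normalized to 1 << 7; adding the 124 bias below gives a float32
  // exponent field of 118, that is, 2^-9, matching (1/128) * 2^(1-3).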
+ denormal_f32_unbiased_exponent = builder.createTriOp( + spv::OpSelect, type_uint, f10_mantissa_is_nonzero, + denormal_f32_unbiased_exponent, builder.makeUintConstant(uint32_t(-124))); + denormal_f32_mantissa = + builder.createTriOp(spv::OpSelect, type_uint, f10_mantissa_is_nonzero, + denormal_f32_mantissa, builder.makeUintConstant(0)); + + // Select the needed conversion depending on whether the number is normal. + spv::Id f10_is_normal = + builder.createBinOp(spv::OpINotEqual, type_bool, f10_unbiased_exponent, + builder.makeUintConstant(0)); + spv::Id f32_unbiased_exponent = builder.createTriOp( + spv::OpSelect, type_uint, f10_is_normal, f10_unbiased_exponent, + denormal_f32_unbiased_exponent); + spv::Id f32_mantissa = + builder.createTriOp(spv::OpSelect, type_uint, f10_is_normal, f10_mantissa, + denormal_f32_mantissa); + + // Bias the exponent and construct the build the float32 number. + spv::Id f32_shifted; + { + spv::Instruction* f32_insert_instruction = new spv::Instruction( + builder.getUniqueId(), type_uint, spv::OpBitFieldInsert); + f32_insert_instruction->addIdOperand(f32_mantissa); + f32_insert_instruction->addIdOperand( + builder.createBinOp(spv::OpIAdd, type_uint, f32_unbiased_exponent, + builder.makeUintConstant(124))); + f32_insert_instruction->addIdOperand(builder.makeUintConstant(7)); + f32_insert_instruction->addIdOperand(builder.makeUintConstant(8)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(f32_insert_instruction)); + f32_shifted = f32_insert_instruction->getResultId(); + } + spv::Id f32 = + builder.createBinOp(spv::OpShiftLeftLogical, type_uint, f32_shifted, + builder.makeUintConstant(23 - 7)); + + if (!result_as_uint) { + f32 = builder.createUnaryOp(spv::OpBitcast, builder.makeFloatType(32), f32); + } + + return f32; +} + +spv::Id SpirvShaderTranslator::PreClampedDepthTo20e4( + spv::Builder& builder, spv::Id f32_scalar, bool remap_from_0_to_0_5, + spv::Id ext_inst_glsl_std_450) { + // CFloat24 from d3dref9.dll + + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + // Assuming the value is already clamped to [0, 2) (in all places, the depth + // is written with saturation). + + uint32_t remap_bias = uint32_t(remap_from_0_to_0_5); + + spv::Id type_uint = builder.makeUintType(32); + + // Need the source as uint for bit operations. + { + spv::Id source_type = builder.getTypeId(f32_scalar); + assert_true(builder.isScalarType(source_type)); + if (!builder.isUintType(source_type)) { + f32_scalar = builder.createUnaryOp(spv::OpBitcast, type_uint, f32_scalar); + } + } + + // The denormal 20e4 case. + // denormal_biased_f32 = (f32 & 0x7FFFFF) | 0x800000 + spv::Id denormal_biased_f32; + { + spv::Instruction* denormal_insert_instruction = new spv::Instruction( + builder.getUniqueId(), type_uint, spv::OpBitFieldInsert); + denormal_insert_instruction->addIdOperand(f32_scalar); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(1)); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(23)); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(9)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_insert_instruction)); + denormal_biased_f32 = denormal_insert_instruction->getResultId(); + } + // denormal_biased_f32_shift_amount = min(113 - (f32 >> 23), 24) + // Not allowing the shift to overflow as that's undefined in SPIR-V. 
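  // Editor's illustrative note, not part of the original commit: assuming the
  // 20e4 layout of 4 exponent bits biased by 15 and 20 mantissa bits, and no
  // 0...0.5 remapping, 1.0f (0x3F800000) takes the normal path:
  // 0x3F800000 - (112 << 23) leaves an exponent field of 15, and the
  // round-to-nearest-even shift by 3 below yields 0xF00000. The threshold
  // 0x38800000 is 2^-14, the smallest normalized 20e4 value.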
+ spv::Id denormal_biased_f32_shift_amount; + { + spv::Instruction* denormal_shift_amount_instruction = + new spv::Instruction(builder.getUniqueId(), type_uint, spv::OpExtInst); + denormal_shift_amount_instruction->addIdOperand(ext_inst_glsl_std_450); + denormal_shift_amount_instruction->addImmediateOperand(GLSLstd450UMin); + denormal_shift_amount_instruction->addIdOperand(builder.createBinOp( + spv::OpISub, type_uint, builder.makeUintConstant(113 - remap_bias), + builder.createBinOp(spv::OpShiftRightLogical, type_uint, f32_scalar, + builder.makeUintConstant(23)))); + denormal_shift_amount_instruction->addIdOperand( + builder.makeUintConstant(24)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_shift_amount_instruction)); + denormal_biased_f32_shift_amount = + denormal_shift_amount_instruction->getResultId(); + } + // denormal_biased_f32 = + // ((f32 & 0x7FFFFF) | 0x800000) >> min(113 - (f32 >> 23), 24) + denormal_biased_f32 = builder.createBinOp(spv::OpShiftRightLogical, type_uint, + denormal_biased_f32, + denormal_biased_f32_shift_amount); + + // The normal 20e4 case. + // Bias the exponent. + // normal_biased_f32 = f32 - (112 << 23) + spv::Id normal_biased_f32 = builder.createBinOp( + spv::OpISub, type_uint, f32_scalar, + builder.makeUintConstant((UINT32_C(112) + remap_bias) << 23)); + + // Select the needed conversion depending on whether the number is too small + // to be represented as normalized 20e4. + spv::Id biased_f32 = builder.createTriOp( + spv::OpSelect, type_uint, + builder.createBinOp( + spv::OpULessThan, builder.makeBoolType(), f32_scalar, + builder.makeUintConstant(0x38800000 - (remap_bias << 23))), + denormal_biased_f32, normal_biased_f32); + + // Build the 20e4 number rounding to the nearest even. + // ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF + return builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp(spv::OpIAdd, type_uint, biased_f32, + builder.makeUintConstant(3)), + builder.createTriOp(spv::OpBitFieldUExtract, type_uint, biased_f32, + builder.makeUintConstant(3), + builder.makeUintConstant(1))), + builder.makeUintConstant(3), builder.makeUintConstant(24)); +} + +spv::Id SpirvShaderTranslator::Depth20e4To32(spv::Builder& builder, + spv::Id f24_uint_scalar, + uint32_t f24_shift, + bool remap_to_0_to_0_5, + bool result_as_uint, + spv::Id ext_inst_glsl_std_450) { + // CFloat24 from d3dref9.dll + + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + + assert_true(builder.isUintType(builder.getTypeId(f24_uint_scalar))); + assert_true(f24_shift <= (32 - 24)); + + uint32_t remap_bias = uint32_t(remap_to_0_to_0_5); + + spv::Id type_bool = builder.makeBoolType(); + spv::Id type_int = builder.makeIntType(32); + spv::Id type_uint = builder.makeUintType(32); + + spv::Id f24_unbiased_exponent = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, f24_uint_scalar, + builder.makeUintConstant(f24_shift + 20), builder.makeUintConstant(4)); + spv::Id f24_mantissa = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, f24_uint_scalar, + builder.makeUintConstant(f24_shift), builder.makeUintConstant(20)); + + // The denormal nonzero 20e4 case. 
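  // Editor's illustrative note, not part of the original commit: for the
  // smallest denormal, f24 == 0x000001 (exponent 0, mantissa 1),
  // findMSB(1) == 0 gives an unbiased exponent of 0 - 19 == -19; with the
  // 112 bias applied later (assuming no 0...0.5 remapping), the float32
  // exponent field becomes 93, that is, 2^-34, matching 2^-20 * 2^(1-15).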
+ // denormal_mantissa_msb = findMSB(f24_mantissa) + spv::Id denormal_mantissa_msb; + { + spv::Instruction* denormal_mantissa_msb_instruction = + new spv::Instruction(builder.getUniqueId(), type_int, spv::OpExtInst); + denormal_mantissa_msb_instruction->addIdOperand(ext_inst_glsl_std_450); + denormal_mantissa_msb_instruction->addImmediateOperand(GLSLstd450FindUMsb); + denormal_mantissa_msb_instruction->addIdOperand(f24_mantissa); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_mantissa_msb_instruction)); + denormal_mantissa_msb = denormal_mantissa_msb_instruction->getResultId(); + } + denormal_mantissa_msb = + builder.createUnaryOp(spv::OpBitcast, type_uint, denormal_mantissa_msb); + // denormal_f32_unbiased_exponent = 1 - (20 - findMSB(f24_mantissa)) + // Or: + // denormal_f32_unbiased_exponent = findMSB(f24_mantissa) - 19 + spv::Id denormal_f32_unbiased_exponent = + builder.createBinOp(spv::OpISub, type_uint, denormal_mantissa_msb, + builder.makeUintConstant(19)); + // Normalize the mantissa. + // denormal_f32_mantissa = f24_mantissa << (20 - findMSB(f24_mantissa)) + spv::Id denormal_f32_mantissa = builder.createBinOp( + spv::OpShiftLeftLogical, type_uint, f24_mantissa, + builder.createBinOp(spv::OpISub, type_uint, builder.makeUintConstant(20), + denormal_mantissa_msb)); + // If the 20e4 number is zero, make sure the float32 number is zero too. + spv::Id f24_mantissa_is_nonzero = builder.createBinOp( + spv::OpINotEqual, type_bool, f24_mantissa, builder.makeUintConstant(0)); + // Set the unbiased exponent to -112 for zero - 112 will be added later, + // resulting in zero float32. + denormal_f32_unbiased_exponent = builder.createTriOp( + spv::OpSelect, type_uint, f24_mantissa_is_nonzero, + denormal_f32_unbiased_exponent, + builder.makeUintConstant(uint32_t(-int32_t(112 - remap_bias)))); + denormal_f32_mantissa = + builder.createTriOp(spv::OpSelect, type_uint, f24_mantissa_is_nonzero, + denormal_f32_mantissa, builder.makeUintConstant(0)); + + // Select the needed conversion depending on whether the number is normal. + spv::Id f24_is_normal = + builder.createBinOp(spv::OpINotEqual, type_bool, f24_unbiased_exponent, + builder.makeUintConstant(0)); + spv::Id f32_unbiased_exponent = builder.createTriOp( + spv::OpSelect, type_uint, f24_is_normal, f24_unbiased_exponent, + denormal_f32_unbiased_exponent); + spv::Id f32_mantissa = + builder.createTriOp(spv::OpSelect, type_uint, f24_is_normal, f24_mantissa, + denormal_f32_mantissa); + + // Bias the exponent and construct the build the float32 number. 
+ spv::Id f32_shifted; + { + spv::Instruction* f32_insert_instruction = new spv::Instruction( + builder.getUniqueId(), type_uint, spv::OpBitFieldInsert); + f32_insert_instruction->addIdOperand(f32_mantissa); + f32_insert_instruction->addIdOperand( + builder.createBinOp(spv::OpIAdd, type_uint, f32_unbiased_exponent, + builder.makeUintConstant(112 - remap_bias))); + f32_insert_instruction->addIdOperand(builder.makeUintConstant(20)); + f32_insert_instruction->addIdOperand(builder.makeUintConstant(8)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(f32_insert_instruction)); + f32_shifted = f32_insert_instruction->getResultId(); + } + spv::Id f32 = + builder.createBinOp(spv::OpShiftLeftLogical, type_uint, f32_shifted, + builder.makeUintConstant(23 - 20)); + + if (!result_as_uint) { + f32 = builder.createUnaryOp(spv::OpBitcast, builder.makeFloatType(32), f32); + } + + return f32; +} + +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/vulkan/deferred_command_buffer.cc b/src/xenia/gpu/vulkan/deferred_command_buffer.cc index 470d8adde..98d42865d 100644 --- a/src/xenia/gpu/vulkan/deferred_command_buffer.cc +++ b/src/xenia/gpu/vulkan/deferred_command_buffer.cc @@ -103,6 +103,37 @@ void DeferredCommandBuffer::Execute(VkCommandBuffer command_buffer) { args.pipeline); } break; + case Command::kVkBindVertexBuffers: { + auto& args = *reinterpret_cast(stream); + size_t offset_bytes = + xe::align(sizeof(ArgsVkBindVertexBuffers), alignof(VkBuffer)); + const VkBuffer* buffers = reinterpret_cast( + reinterpret_cast(stream) + offset_bytes); + offset_bytes = + xe::align(offset_bytes + sizeof(VkBuffer) * args.binding_count, + alignof(VkDeviceSize)); + const VkDeviceSize* offsets = reinterpret_cast( + reinterpret_cast(stream) + offset_bytes); + dfn.vkCmdBindVertexBuffers(command_buffer, args.first_binding, + args.binding_count, buffers, offsets); + } break; + + case Command::kVkClearAttachments: { + auto& args = *reinterpret_cast(stream); + size_t offset_bytes = xe::align(sizeof(ArgsVkClearAttachments), + alignof(VkClearAttachment)); + const VkClearAttachment* attachments = + reinterpret_cast( + reinterpret_cast(stream) + offset_bytes); + offset_bytes = xe::align( + offset_bytes + sizeof(VkClearAttachment) * args.attachment_count, + alignof(VkClearRect)); + const VkClearRect* rects = reinterpret_cast( + reinterpret_cast(stream) + offset_bytes); + dfn.vkCmdClearAttachments(command_buffer, args.attachment_count, + attachments, args.rect_count, rects); + } break; + case Command::kVkCopyBuffer: { auto& args = *reinterpret_cast(stream); dfn.vkCmdCopyBuffer( @@ -112,6 +143,12 @@ void DeferredCommandBuffer::Execute(VkCommandBuffer command_buffer) { xe::align(sizeof(ArgsVkCopyBuffer), alignof(VkBufferCopy)))); } break; + case Command::kVkDispatch: { + auto& args = *reinterpret_cast(stream); + dfn.vkCmdDispatch(command_buffer, args.group_count_x, + args.group_count_y, args.group_count_z); + } break; + case Command::kVkDraw: { auto& args = *reinterpret_cast(stream); dfn.vkCmdDraw(command_buffer, args.vertex_count, args.instance_count, @@ -168,6 +205,14 @@ void DeferredCommandBuffer::Execute(VkCommandBuffer command_buffer) { args.image_memory_barrier_count, image_memory_barriers); } break; + case Command::kVkPushConstants: { + auto& args = *reinterpret_cast(stream); + dfn.vkCmdPushConstants(command_buffer, args.layout, args.stage_flags, + args.offset, args.size, + reinterpret_cast(stream) + + sizeof(ArgsVkPushConstants)); + } break; + case Command::kVkSetBlendConstants: { auto& args = 
*reinterpret_cast(stream); dfn.vkCmdSetBlendConstants(command_buffer, args.blend_constants); diff --git a/src/xenia/gpu/vulkan/deferred_command_buffer.h b/src/xenia/gpu/vulkan/deferred_command_buffer.h index ac4c88f85..e3605f1e6 100644 --- a/src/xenia/gpu/vulkan/deferred_command_buffer.h +++ b/src/xenia/gpu/vulkan/deferred_command_buffer.h @@ -108,6 +108,61 @@ class DeferredCommandBuffer { args.pipeline = pipeline; } + void CmdVkBindVertexBuffers(uint32_t first_binding, uint32_t binding_count, + const VkBuffer* buffers, + const VkDeviceSize* offsets) { + size_t arguments_size = + xe::align(sizeof(ArgsVkBindVertexBuffers), alignof(VkBuffer)); + size_t buffers_offset = arguments_size; + arguments_size = + xe::align(arguments_size + sizeof(VkBuffer) * binding_count, + alignof(VkDeviceSize)); + size_t offsets_offset = arguments_size; + arguments_size += sizeof(VkDeviceSize) * binding_count; + uint8_t* args_ptr = reinterpret_cast( + WriteCommand(Command::kVkBindVertexBuffers, arguments_size)); + auto& args = *reinterpret_cast(args_ptr); + args.first_binding = first_binding; + args.binding_count = binding_count; + std::memcpy(args_ptr + buffers_offset, buffers, + sizeof(VkBuffer) * binding_count); + std::memcpy(args_ptr + offsets_offset, offsets, + sizeof(VkDeviceSize) * binding_count); + } + + void CmdClearAttachmentsEmplace(uint32_t attachment_count, + VkClearAttachment*& attachments_out, + uint32_t rect_count, + VkClearRect*& rects_out) { + size_t arguments_size = + xe::align(sizeof(ArgsVkClearAttachments), alignof(VkClearAttachment)); + size_t attachments_offset = arguments_size; + arguments_size = + xe::align(arguments_size + sizeof(VkClearAttachment) * attachment_count, + alignof(VkClearRect)); + size_t rects_offset = arguments_size; + arguments_size += sizeof(VkClearRect) * rect_count; + uint8_t* args_ptr = reinterpret_cast( + WriteCommand(Command::kVkClearAttachments, arguments_size)); + auto& args = *reinterpret_cast(args_ptr); + args.attachment_count = attachment_count; + args.rect_count = rect_count; + attachments_out = + reinterpret_cast(args_ptr + attachments_offset); + rects_out = reinterpret_cast(args_ptr + rects_offset); + } + void CmdVkClearAttachments(uint32_t attachment_count, + const VkClearAttachment* attachments, + uint32_t rect_count, const VkClearRect* rects) { + VkClearAttachment* attachments_arg; + VkClearRect* rects_arg; + CmdClearAttachmentsEmplace(attachment_count, attachments_arg, rect_count, + rects_arg); + std::memcpy(attachments_arg, attachments, + sizeof(VkClearAttachment) * attachment_count); + std::memcpy(rects_arg, rects, sizeof(VkClearRect) * rect_count); + } + VkBufferCopy* CmdCopyBufferEmplace(VkBuffer src_buffer, VkBuffer dst_buffer, uint32_t region_count) { const size_t header_size = @@ -127,6 +182,15 @@ class DeferredCommandBuffer { regions, sizeof(VkBufferCopy) * region_count); } + void CmdVkDispatch(uint32_t group_count_x, uint32_t group_count_y, + uint32_t group_count_z) { + auto& args = *reinterpret_cast( + WriteCommand(Command::kVkDispatch, sizeof(ArgsVkDispatch))); + args.group_count_x = group_count_x; + args.group_count_y = group_count_y; + args.group_count_z = group_count_z; + } + void CmdVkDraw(uint32_t vertex_count, uint32_t instance_count, uint32_t first_vertex, uint32_t first_instance) { auto& args = *reinterpret_cast( @@ -162,6 +226,19 @@ class DeferredCommandBuffer { uint32_t image_memory_barrier_count, const VkImageMemoryBarrier* image_memory_barriers); + void CmdVkPushConstants(VkPipelineLayout layout, + VkShaderStageFlags stage_flags, 
uint32_t offset, + uint32_t size, const void* values) { + uint8_t* args_ptr = reinterpret_cast(WriteCommand( + Command::kVkPushConstants, sizeof(ArgsVkPushConstants) + size)); + auto& args = *reinterpret_cast(args_ptr); + args.layout = layout; + args.stage_flags = stage_flags; + args.offset = offset; + args.size = size; + std::memcpy(args_ptr + sizeof(ArgsVkPushConstants), values, size); + } + void CmdVkSetBlendConstants(const float* blend_constants) { auto& args = *reinterpret_cast(WriteCommand( Command::kVkSetBlendConstants, sizeof(ArgsVkSetBlendConstants))); @@ -237,11 +314,15 @@ class DeferredCommandBuffer { kVkBindDescriptorSets, kVkBindIndexBuffer, kVkBindPipeline, + kVkBindVertexBuffers, + kVkClearAttachments, kVkCopyBuffer, + kVkDispatch, kVkDraw, kVkDrawIndexed, kVkEndRenderPass, kVkPipelineBarrier, + kVkPushConstants, kVkSetBlendConstants, kVkSetDepthBias, kVkSetScissor, @@ -289,6 +370,22 @@ class DeferredCommandBuffer { VkPipeline pipeline; }; + struct ArgsVkBindVertexBuffers { + uint32_t first_binding; + uint32_t binding_count; + // Followed by aligned VkBuffer[], VkDeviceSize[]. + static_assert(alignof(VkBuffer) <= alignof(uintmax_t)); + static_assert(alignof(VkDeviceSize) <= alignof(uintmax_t)); + }; + + struct ArgsVkClearAttachments { + uint32_t attachment_count; + uint32_t rect_count; + // Followed by aligned VkClearAttachment[], VkClearRect[]. + static_assert(alignof(VkClearAttachment) <= alignof(uintmax_t)); + static_assert(alignof(VkClearRect) <= alignof(uintmax_t)); + }; + struct ArgsVkCopyBuffer { VkBuffer src_buffer; VkBuffer dst_buffer; @@ -297,6 +394,12 @@ class DeferredCommandBuffer { static_assert(alignof(VkBufferCopy) <= alignof(uintmax_t)); }; + struct ArgsVkDispatch { + uint32_t group_count_x; + uint32_t group_count_y; + uint32_t group_count_z; + }; + struct ArgsVkDraw { uint32_t vertex_count; uint32_t instance_count; @@ -326,6 +429,14 @@ class DeferredCommandBuffer { static_assert(alignof(VkImageMemoryBarrier) <= alignof(uintmax_t)); }; + struct ArgsVkPushConstants { + VkPipelineLayout layout; + VkShaderStageFlags stage_flags; + uint32_t offset; + uint32_t size; + // Followed by `size` bytes of values. 
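    // Editor's illustrative note, not part of the original commit: the Args
    // struct written by WriteCommand is followed by its payload in the same
    // allocation (directly here, with xe::align padding for the array
    // commands), and Execute() recovers it with the matching pointer
    // arithmetic, roughly:
    //   const uint8_t* values =
    //       reinterpret_cast<const uint8_t*>(stream) +
    //       sizeof(ArgsVkPushConstants);
    //   dfn.vkCmdPushConstants(command_buffer, args.layout, args.stage_flags,
    //                          args.offset, args.size, values);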
+ }; + struct ArgsVkSetBlendConstants { float blend_constants[4]; }; diff --git a/src/xenia/gpu/vulkan/premake5.lua b/src/xenia/gpu/vulkan/premake5.lua index 44205f326..ffc359504 100644 --- a/src/xenia/gpu/vulkan/premake5.lua +++ b/src/xenia/gpu/vulkan/premake5.lua @@ -8,6 +8,7 @@ project("xenia-gpu-vulkan") language("C++") links({ "fmt", + "glslang-spirv", "xenia-base", "xenia-gpu", "xenia-ui", diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 69d0c70a3..4f534c9dd 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -476,7 +476,7 @@ bool VulkanCommandProcessor::SetupContext() { swap_pipeline_create_info.renderPass = swap_render_pass_; swap_pipeline_create_info.subpass = 0; swap_pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE; - swap_pipeline_create_info.basePipelineIndex = UINT32_MAX; + swap_pipeline_create_info.basePipelineIndex = -1; VkResult swap_pipeline_create_result = dfn.vkCreateGraphicsPipelines( device, VK_NULL_HANDLE, 1, &swap_pipeline_create_info, nullptr, &swap_pipeline_); @@ -810,8 +810,6 @@ void VulkanCommandProcessor::IssueSwap(uint32_t frontbuffer_ptr, deferred_command_buffer_.CmdVkBeginRenderPass( &render_pass_begin_info, VK_SUBPASS_CONTENTS_INLINE); - dynamic_viewport_update_needed_ = true; - dynamic_scissor_update_needed_ = true; VkViewport viewport; viewport.x = 0.0f; viewport.y = 0.0f; @@ -819,13 +817,13 @@ void VulkanCommandProcessor::IssueSwap(uint32_t frontbuffer_ptr, viewport.height = float(scaled_height); viewport.minDepth = 0.0f; viewport.maxDepth = 1.0f; - deferred_command_buffer_.CmdVkSetViewport(0, 1, &viewport); - VkRect2D scissor_rect; - scissor_rect.offset.x = 0; - scissor_rect.offset.y = 0; - scissor_rect.extent.width = scaled_width; - scissor_rect.extent.height = scaled_height; - deferred_command_buffer_.CmdVkSetScissor(0, 1, &scissor_rect); + SetViewport(viewport); + VkRect2D scissor; + scissor.offset.x = 0; + scissor.offset.y = 0; + scissor.extent.width = scaled_width; + scissor.extent.height = scaled_height; + SetScissor(scissor); BindExternalGraphicsPipeline(swap_pipeline_); @@ -856,7 +854,7 @@ void VulkanCommandProcessor::IssueSwap(uint32_t frontbuffer_ptr, EndSubmission(true); } -void VulkanCommandProcessor::PushBufferMemoryBarrier( +bool VulkanCommandProcessor::PushBufferMemoryBarrier( VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask, @@ -865,7 +863,7 @@ void VulkanCommandProcessor::PushBufferMemoryBarrier( if (skip_if_equal && src_stage_mask == dst_stage_mask && src_access_mask == dst_access_mask && src_queue_family_index == dst_queue_family_index) { - return; + return false; } // Separate different barriers for overlapping buffer ranges into different @@ -889,10 +887,10 @@ void VulkanCommandProcessor::PushBufferMemoryBarrier( src_queue_family_index && other_buffer_memory_barrier.dstQueueFamilyIndex == dst_queue_family_index) { - // The barrier is already present. + // The barrier is already pending. 
current_pending_barrier_.src_stage_mask |= src_stage_mask; current_pending_barrier_.dst_stage_mask |= dst_stage_mask; - return; + return true; } SplitPendingBarrier(); break; @@ -911,9 +909,10 @@ void VulkanCommandProcessor::PushBufferMemoryBarrier( buffer_memory_barrier.buffer = buffer; buffer_memory_barrier.offset = offset; buffer_memory_barrier.size = size; + return true; } -void VulkanCommandProcessor::PushImageMemoryBarrier( +bool VulkanCommandProcessor::PushImageMemoryBarrier( VkImage image, const VkImageSubresourceRange& subresource_range, VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask, @@ -923,7 +922,7 @@ void VulkanCommandProcessor::PushImageMemoryBarrier( if (skip_if_equal && src_stage_mask == dst_stage_mask && src_access_mask == dst_access_mask && old_layout == new_layout && src_queue_family_index == dst_queue_family_index) { - return; + return false; } // Separate different barriers for overlapping image subresource ranges into @@ -969,10 +968,10 @@ void VulkanCommandProcessor::PushImageMemoryBarrier( src_queue_family_index && other_image_memory_barrier.dstQueueFamilyIndex == dst_queue_family_index) { - // The barrier is already present. + // The barrier is already pending. current_pending_barrier_.src_stage_mask |= src_stage_mask; current_pending_barrier_.dst_stage_mask |= dst_stage_mask; - return; + return true; } SplitPendingBarrier(); break; @@ -992,6 +991,7 @@ void VulkanCommandProcessor::PushImageMemoryBarrier( image_memory_barrier.dstQueueFamilyIndex = dst_queue_family_index; image_memory_barrier.image = image; image_memory_barrier.subresourceRange = subresource_range; + return true; } bool VulkanCommandProcessor::SubmitBarriers(bool force_end_render_pass) { @@ -1257,6 +1257,53 @@ void VulkanCommandProcessor::BindExternalGraphicsPipeline( current_guest_graphics_pipeline_layout_ = VK_NULL_HANDLE; } +void VulkanCommandProcessor::BindExternalComputePipeline(VkPipeline pipeline) { + if (current_external_compute_pipeline_ == pipeline) { + return; + } + deferred_command_buffer_.CmdVkBindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline); + current_external_compute_pipeline_ = pipeline; +} + +void VulkanCommandProcessor::SetViewport(const VkViewport& viewport) { + if (!dynamic_viewport_update_needed_) { + dynamic_viewport_update_needed_ |= dynamic_viewport_.x != viewport.x; + dynamic_viewport_update_needed_ |= dynamic_viewport_.y != viewport.y; + dynamic_viewport_update_needed_ |= + dynamic_viewport_.width != viewport.width; + dynamic_viewport_update_needed_ |= + dynamic_viewport_.height != viewport.height; + dynamic_viewport_update_needed_ |= + dynamic_viewport_.minDepth != viewport.minDepth; + dynamic_viewport_update_needed_ |= + dynamic_viewport_.maxDepth != viewport.maxDepth; + } + if (dynamic_viewport_update_needed_) { + dynamic_viewport_ = viewport; + deferred_command_buffer_.CmdVkSetViewport(0, 1, &dynamic_viewport_); + dynamic_viewport_update_needed_ = false; + } +} + +void VulkanCommandProcessor::SetScissor(const VkRect2D& scissor) { + if (!dynamic_scissor_update_needed_) { + dynamic_scissor_update_needed_ |= + dynamic_scissor_.offset.x != scissor.offset.x; + dynamic_scissor_update_needed_ |= + dynamic_scissor_.offset.y != scissor.offset.y; + dynamic_scissor_update_needed_ |= + dynamic_scissor_.extent.width != scissor.extent.width; + dynamic_scissor_update_needed_ |= + dynamic_scissor_.extent.height != scissor.extent.height; + } + if (dynamic_scissor_update_needed_) { + 
dynamic_scissor_ = scissor; + deferred_command_buffer_.CmdVkSetScissor(0, 1, &dynamic_scissor_); + dynamic_scissor_update_needed_ = false; + } +} + Shader* VulkanCommandProcessor::LoadShader(xenos::ShaderType shader_type, uint32_t guest_address, const uint32_t* host_address, @@ -1417,8 +1464,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, } const ui::vulkan::VulkanProvider& provider = GetVulkanProvider(); - const VkPhysicalDeviceProperties& device_properties = - provider.device_properties(); + const VkPhysicalDeviceLimits& device_limits = + provider.device_properties().limits; // Get dynamic rasterizer state. draw_util::ViewportInfo viewport_info; @@ -1438,10 +1485,10 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // life. Or even disregard the viewport bounds range in the fragment shader // interlocks case completely - apply the viewport and the scissor offset // directly to pixel address and to things like ps_param_gen. - draw_util::GetHostViewportInfo( - regs, 1, 1, false, device_properties.limits.maxViewportDimensions[0], - device_properties.limits.maxViewportDimensions[1], true, false, false, - false, viewport_info); + draw_util::GetHostViewportInfo(regs, 1, 1, false, + device_limits.maxViewportDimensions[0], + device_limits.maxViewportDimensions[1], true, + false, false, false, viewport_info); // Update dynamic graphics pipeline state. UpdateDynamicState(viewport_info, primitive_polygonal); @@ -1675,6 +1722,8 @@ void VulkanCommandProcessor::CheckSubmissionFenceAndDeviceLoss( primitive_processor_->CompletedSubmissionUpdated(); + render_target_cache_->CompletedSubmissionUpdated(); + // Destroy outdated swap objects. while (!swap_framebuffers_outdated_.empty()) { const auto& framebuffer_pair = swap_framebuffers_outdated_.front(); @@ -1752,6 +1801,7 @@ bool VulkanCommandProcessor::BeginSubmission(bool is_guest_command) { current_framebuffer_ = nullptr; current_guest_graphics_pipeline_ = VK_NULL_HANDLE; current_external_graphics_pipeline_ = VK_NULL_HANDLE; + current_external_compute_pipeline_ = VK_NULL_HANDLE; current_guest_graphics_pipeline_layout_ = nullptr; current_graphics_descriptor_sets_bound_up_to_date_ = 0; @@ -1861,6 +1911,8 @@ bool VulkanCommandProcessor::EndSubmission(bool is_swap) { if (submission_open_) { EndRenderPass(); + render_target_cache_->EndSubmission(); + primitive_processor_->EndSubmission(); shared_memory_->EndSubmission(); @@ -2112,20 +2164,7 @@ void VulkanCommandProcessor::UpdateDynamicState( } viewport.minDepth = viewport_info.z_min; viewport.maxDepth = viewport_info.z_max; - dynamic_viewport_update_needed_ |= dynamic_viewport_.x != viewport.x; - dynamic_viewport_update_needed_ |= dynamic_viewport_.y != viewport.y; - dynamic_viewport_update_needed_ |= dynamic_viewport_.width != viewport.width; - dynamic_viewport_update_needed_ |= - dynamic_viewport_.height != viewport.height; - dynamic_viewport_update_needed_ |= - dynamic_viewport_.minDepth != viewport.minDepth; - dynamic_viewport_update_needed_ |= - dynamic_viewport_.maxDepth != viewport.maxDepth; - if (dynamic_viewport_update_needed_) { - dynamic_viewport_ = viewport; - deferred_command_buffer_.CmdVkSetViewport(0, 1, &dynamic_viewport_); - dynamic_viewport_update_needed_ = false; - } + SetViewport(viewport); // Scissor. 
draw_util::Scissor scissor; @@ -2135,19 +2174,7 @@ void VulkanCommandProcessor::UpdateDynamicState( scissor_rect.offset.y = int32_t(scissor.offset[1]); scissor_rect.extent.width = scissor.extent[0]; scissor_rect.extent.height = scissor.extent[1]; - dynamic_scissor_update_needed_ |= - dynamic_scissor_.offset.x != scissor_rect.offset.x; - dynamic_scissor_update_needed_ |= - dynamic_scissor_.offset.y != scissor_rect.offset.y; - dynamic_scissor_update_needed_ |= - dynamic_scissor_.extent.width != scissor_rect.extent.width; - dynamic_scissor_update_needed_ |= - dynamic_scissor_.extent.height != scissor_rect.extent.height; - if (dynamic_scissor_update_needed_) { - dynamic_scissor_ = scissor_rect; - deferred_command_buffer_.CmdVkSetScissor(0, 1, &dynamic_scissor_); - dynamic_scissor_update_needed_ = false; - } + SetScissor(scissor_rect); // Depth bias. // TODO(Triang3l): Disable the depth bias for the fragment shader interlock RB diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 551a3fcae..54c25d22f 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2020 Ben Vanik. All rights reserved. * + * Copyright 2022 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ @@ -81,15 +81,16 @@ class VulkanCommandProcessor : public CommandProcessor { uint64_t GetCurrentFrame() const { return frame_current_; } uint64_t GetCompletedFrame() const { return frame_completed_; } - // Submission must be open to insert barriers. - void PushBufferMemoryBarrier( + // Submission must be open to insert barriers. Returning true if the barrier + // has actually been inserted and not dropped. + bool PushBufferMemoryBarrier( VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask, uint32_t src_queue_family_index = VK_QUEUE_FAMILY_IGNORED, uint32_t dst_queue_family_index = VK_QUEUE_FAMILY_IGNORED, bool skip_if_equal = true); - void PushImageMemoryBarrier( + bool PushImageMemoryBarrier( VkImage image, const VkImageSubresourceRange& subresource_range, VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask, @@ -125,6 +126,9 @@ class VulkanCommandProcessor : public CommandProcessor { bool keep_dynamic_depth_bias = false, bool keep_dynamic_blend_constants = false, bool keep_dynamic_stencil_mask_ref = false); + void BindExternalComputePipeline(VkPipeline pipeline); + void SetViewport(const VkViewport& viewport); + void SetScissor(const VkRect2D& scissor); protected: bool SetupContext() override; @@ -211,6 +215,9 @@ class VulkanCommandProcessor : public CommandProcessor { // open non-frame submission, BeginSubmission(true) will promote it to a // frame. EndSubmission(true) will close the frame no matter whether the // submission has already been closed. + // Unlike on Direct3D 12, submission boundaries do not imply any memory + // barriers aside from an incoming host write (but not outgoing host read) + // dependency. 
// Rechecks submission number and reclaims per-submission resources. Pass 0 as // the submission to await to simply check status, or pass @@ -396,6 +403,7 @@ class VulkanCommandProcessor : public CommandProcessor { // TODO(Triang3l): Change to a deferred compilation handle. VkPipeline current_guest_graphics_pipeline_; VkPipeline current_external_graphics_pipeline_; + VkPipeline current_external_compute_pipeline_; // Pipeline layout of the current guest graphics pipeline. const PipelineLayout* current_guest_graphics_pipeline_layout_; diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc index 8f581f0fa..450a346b0 100644 --- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc @@ -884,11 +884,25 @@ bool VulkanPipelineCache::EnsurePipelineCreated( // TODO(Triang3l): Wide lines. rasterization_state.lineWidth = 1.0f; + VkSampleMask sample_mask = UINT32_MAX; VkPipelineMultisampleStateCreateInfo multisample_state = {}; multisample_state.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - multisample_state.rasterizationSamples = VkSampleCountFlagBits( - uint32_t(1) << uint32_t(description.render_pass_key.msaa_samples)); + if (description.render_pass_key.msaa_samples == xenos::MsaaSamples::k2X && + !render_target_cache_.IsMsaa2xSupported( + description.render_pass_key.depth_and_color_used != 0)) { + // Using sample 0 as 0 and 3 as 1 for 2x instead (not exactly the same + // sample locations, but still top-left and bottom-right - however, this can + // be adjusted with custom sample locations). + multisample_state.rasterizationSamples = VK_SAMPLE_COUNT_4_BIT; + sample_mask = 0b1001; + // TODO(Triang3l): Research sample mask behavior without attachments (in + // Direct3D, it's completely ignored in this case). 
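    // Editor's illustrative note, not part of the original commit: with the
    // standard sample locations, 4x samples 0 and 3 lie at (0.375, 0.125) and
    // (0.625, 0.875) within the pixel, while true 2x uses (0.25, 0.25) and
    // (0.75, 0.75), so the 0b1001 mask keeps one sample in the upper-left and
    // one in the lower-right quadrant.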
+ multisample_state.pSampleMask = &sample_mask; + } else { + multisample_state.rasterizationSamples = VkSampleCountFlagBits( + uint32_t(1) << uint32_t(description.render_pass_key.msaa_samples)); + } VkPipelineDepthStencilStateCreateInfo depth_stencil_state = {}; depth_stencil_state.sType = @@ -1061,7 +1075,7 @@ bool VulkanPipelineCache::EnsurePipelineCreated( pipeline_create_info.renderPass = creation_arguments.render_pass; pipeline_create_info.subpass = 0; pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE; - pipeline_create_info.basePipelineIndex = UINT32_MAX; + pipeline_create_info.basePipelineIndex = -1; const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); VkDevice device = provider.device(); diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc index 24eb8e14b..b029f64dd 100644 --- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc @@ -10,23 +10,109 @@ #include "xenia/gpu/vulkan/vulkan_render_target_cache.h" #include +#include #include #include #include +#include #include #include +#include +#include "third_party/glslang/SPIRV/GLSL.std.450.h" +#include "third_party/glslang/SPIRV/SpvBuilder.h" #include "xenia/base/assert.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" +#include "xenia/gpu/draw_util.h" #include "xenia/gpu/registers.h" +#include "xenia/gpu/spirv_shader_translator.h" +#include "xenia/gpu/vulkan/deferred_command_buffer.h" #include "xenia/gpu/vulkan/vulkan_command_processor.h" +#include "xenia/gpu/xenos.h" #include "xenia/ui/vulkan/vulkan_util.h" namespace xe { namespace gpu { namespace vulkan { +// Generated with `xb buildshaders`. +namespace shaders { +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/host_depth_store_1xmsaa_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/host_depth_store_2xmsaa_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/host_depth_store_4xmsaa_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/passthrough_position_xy_vs.h" +} // namespace shaders + +const VulkanRenderTargetCache::TransferPipelineLayoutInfo + VulkanRenderTargetCache::kTransferPipelineLayoutInfos[size_t( + TransferPipelineLayoutIndex::kCount)] = { + // kColor + {kTransferUsedDescriptorSetColorTextureBit, + kTransferUsedPushConstantDwordAddressBit}, + // kDepth + {kTransferUsedDescriptorSetDepthStencilTexturesBit, + kTransferUsedPushConstantDwordAddressBit}, + // kColorToStencilBit + {kTransferUsedDescriptorSetColorTextureBit, + kTransferUsedPushConstantDwordAddressBit | + kTransferUsedPushConstantDwordStencilMaskBit}, + // kDepthToStencilBit + {kTransferUsedDescriptorSetDepthStencilTexturesBit, + kTransferUsedPushConstantDwordAddressBit | + kTransferUsedPushConstantDwordStencilMaskBit}, + // kColorAndHostDepthTexture + {kTransferUsedDescriptorSetHostDepthStencilTexturesBit | + kTransferUsedDescriptorSetColorTextureBit, + kTransferUsedPushConstantDwordHostDepthAddressBit | + kTransferUsedPushConstantDwordAddressBit}, + // kColorAndHostDepthBuffer + {kTransferUsedDescriptorSetHostDepthBufferBit | + kTransferUsedDescriptorSetColorTextureBit, + kTransferUsedPushConstantDwordHostDepthAddressBit | + kTransferUsedPushConstantDwordAddressBit}, + // kDepthAndHostDepthTexture + {kTransferUsedDescriptorSetHostDepthStencilTexturesBit | + kTransferUsedDescriptorSetDepthStencilTexturesBit, + kTransferUsedPushConstantDwordHostDepthAddressBit | + kTransferUsedPushConstantDwordAddressBit}, + // 
kDepthAndHostDepthBuffer + {kTransferUsedDescriptorSetHostDepthBufferBit | + kTransferUsedDescriptorSetDepthStencilTexturesBit, + kTransferUsedPushConstantDwordHostDepthAddressBit | + kTransferUsedPushConstantDwordAddressBit}, +}; + +const VulkanRenderTargetCache::TransferModeInfo + VulkanRenderTargetCache::kTransferModes[size_t(TransferMode::kCount)] = { + // kColorToDepth + {TransferOutput::kDepth, TransferPipelineLayoutIndex::kColor}, + // kColorToColor + {TransferOutput::kColor, TransferPipelineLayoutIndex::kColor}, + // kDepthToDepth + {TransferOutput::kDepth, TransferPipelineLayoutIndex::kDepth}, + // kDepthToColor + {TransferOutput::kColor, TransferPipelineLayoutIndex::kDepth}, + // kColorToStencilBit + {TransferOutput::kStencilBit, + TransferPipelineLayoutIndex::kColorToStencilBit}, + // kDepthToStencilBit + {TransferOutput::kStencilBit, + TransferPipelineLayoutIndex::kDepthToStencilBit}, + // kColorAndHostDepthToDepth + {TransferOutput::kDepth, + TransferPipelineLayoutIndex::kColorAndHostDepthTexture}, + // kDepthAndHostDepthToDepth + {TransferOutput::kDepth, + TransferPipelineLayoutIndex::kDepthAndHostDepthTexture}, + // kColorAndHostDepthCopyToDepth + {TransferOutput::kDepth, + TransferPipelineLayoutIndex::kColorAndHostDepthBuffer}, + // kDepthAndHostDepthCopyToDepth + {TransferOutput::kDepth, + TransferPipelineLayoutIndex::kDepthAndHostDepthBuffer}, +}; + VulkanRenderTargetCache::VulkanRenderTargetCache( VulkanCommandProcessor& command_processor, const RegisterFile& register_file) @@ -35,6 +121,342 @@ VulkanRenderTargetCache::VulkanRenderTargetCache( VulkanRenderTargetCache::~VulkanRenderTargetCache() { Shutdown(true); } bool VulkanRenderTargetCache::Initialize() { + const ui::vulkan::VulkanProvider& provider = + command_processor_.GetVulkanProvider(); + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + + // Descriptor set layouts. 
+ VkDescriptorSetLayoutBinding descriptor_set_layout_bindings[2]; + descriptor_set_layout_bindings[0].binding = 0; + descriptor_set_layout_bindings[0].descriptorType = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + descriptor_set_layout_bindings[0].descriptorCount = 1; + descriptor_set_layout_bindings[0].stageFlags = + VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_COMPUTE_BIT; + descriptor_set_layout_bindings[0].pImmutableSamplers = nullptr; + VkDescriptorSetLayoutCreateInfo descriptor_set_layout_create_info; + descriptor_set_layout_create_info.sType = + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + descriptor_set_layout_create_info.pNext = nullptr; + descriptor_set_layout_create_info.flags = 0; + descriptor_set_layout_create_info.bindingCount = 1; + descriptor_set_layout_create_info.pBindings = descriptor_set_layout_bindings; + if (dfn.vkCreateDescriptorSetLayout( + device, &descriptor_set_layout_create_info, nullptr, + &descriptor_set_layout_storage_buffer_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the descriptor set layout " + "with one storage buffer"); + Shutdown(); + return false; + } + descriptor_set_layout_bindings[0].descriptorType = + VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + if (dfn.vkCreateDescriptorSetLayout( + device, &descriptor_set_layout_create_info, nullptr, + &descriptor_set_layout_sampled_image_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the descriptor set layout " + "with one sampled image"); + Shutdown(); + return false; + } + descriptor_set_layout_bindings[1].binding = 1; + descriptor_set_layout_bindings[1].descriptorType = + VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + descriptor_set_layout_bindings[1].descriptorCount = 1; + descriptor_set_layout_bindings[1].stageFlags = + descriptor_set_layout_bindings[0].stageFlags; + descriptor_set_layout_bindings[1].pImmutableSamplers = nullptr; + descriptor_set_layout_create_info.bindingCount = 2; + if (dfn.vkCreateDescriptorSetLayout( + device, &descriptor_set_layout_create_info, nullptr, + &descriptor_set_layout_sampled_image_x2_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the descriptor set layout " + "with two sampled images"); + Shutdown(); + return false; + } + + // Descriptor set pools. + // The pool sizes were chosen without a specific reason. + VkDescriptorPoolSize descriptor_set_layout_size; + descriptor_set_layout_size.type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + descriptor_set_layout_size.descriptorCount = 1; + descriptor_set_pool_sampled_image_ = + std::make_unique( + provider, 256, 1, &descriptor_set_layout_size, + descriptor_set_layout_sampled_image_); + descriptor_set_layout_size.descriptorCount = 2; + descriptor_set_pool_sampled_image_x2_ = + std::make_unique( + provider, 256, 1, &descriptor_set_layout_size, + descriptor_set_layout_sampled_image_x2_); + + // EDRAM contents reinterpretation buffer. + // 90 MB with 9x resolution scaling - within the minimum + // maxStorageBufferRange. + if (!ui::vulkan::util::CreateDedicatedAllocationBuffer( + provider, + VkDeviceSize(xenos::kEdramSizeBytes * resolution_scale_x_ * + resolution_scale_y_), + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + ui::vulkan::util::MemoryPurpose::kDeviceLocal, edram_buffer_, + edram_buffer_memory_)) { + XELOGE("VulkanRenderTargetCache: Failed to create the EDRAM buffer"); + Shutdown(); + return false; + } + if (GetPath() == Path::kPixelShaderInterlock) { + // The first operation will likely be drawing. 
+ edram_buffer_usage_ = EdramBufferUsage::kFragmentReadWrite; + } else { + // The first operation will likely be depth self-comparison. + edram_buffer_usage_ = EdramBufferUsage::kFragmentRead; + } + edram_buffer_modification_status_ = + EdramBufferModificationStatus::kUnmodified; + VkDescriptorPoolSize edram_storage_buffer_descriptor_pool_size; + edram_storage_buffer_descriptor_pool_size.type = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + edram_storage_buffer_descriptor_pool_size.descriptorCount = 1; + VkDescriptorPoolCreateInfo edram_storage_buffer_descriptor_pool_create_info; + edram_storage_buffer_descriptor_pool_create_info.sType = + VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + edram_storage_buffer_descriptor_pool_create_info.pNext = nullptr; + edram_storage_buffer_descriptor_pool_create_info.flags = 0; + edram_storage_buffer_descriptor_pool_create_info.maxSets = 1; + edram_storage_buffer_descriptor_pool_create_info.poolSizeCount = 1; + edram_storage_buffer_descriptor_pool_create_info.pPoolSizes = + &edram_storage_buffer_descriptor_pool_size; + if (dfn.vkCreateDescriptorPool( + device, &edram_storage_buffer_descriptor_pool_create_info, nullptr, + &edram_storage_buffer_descriptor_pool_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the EDRAM buffer storage " + "buffer descriptor pool"); + Shutdown(); + return false; + } + VkDescriptorSetAllocateInfo edram_storage_buffer_descriptor_set_allocate_info; + edram_storage_buffer_descriptor_set_allocate_info.sType = + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + edram_storage_buffer_descriptor_set_allocate_info.pNext = nullptr; + edram_storage_buffer_descriptor_set_allocate_info.descriptorPool = + edram_storage_buffer_descriptor_pool_; + edram_storage_buffer_descriptor_set_allocate_info.descriptorSetCount = 1; + edram_storage_buffer_descriptor_set_allocate_info.pSetLayouts = + &descriptor_set_layout_storage_buffer_; + if (dfn.vkAllocateDescriptorSets( + device, &edram_storage_buffer_descriptor_set_allocate_info, + &edram_storage_buffer_descriptor_set_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to allocate the EDRAM buffer storage " + "buffer descriptor set"); + Shutdown(); + return false; + } + VkDescriptorBufferInfo edram_storage_buffer_descriptor_buffer_info; + edram_storage_buffer_descriptor_buffer_info.buffer = edram_buffer_; + edram_storage_buffer_descriptor_buffer_info.offset = 0; + edram_storage_buffer_descriptor_buffer_info.range = VK_WHOLE_SIZE; + VkWriteDescriptorSet edram_storage_buffer_descriptor_write; + edram_storage_buffer_descriptor_write.sType = + VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + edram_storage_buffer_descriptor_write.pNext = nullptr; + edram_storage_buffer_descriptor_write.dstSet = + edram_storage_buffer_descriptor_set_; + edram_storage_buffer_descriptor_write.dstBinding = 0; + edram_storage_buffer_descriptor_write.dstArrayElement = 0; + edram_storage_buffer_descriptor_write.descriptorCount = 1; + edram_storage_buffer_descriptor_write.descriptorType = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + edram_storage_buffer_descriptor_write.pImageInfo = nullptr; + edram_storage_buffer_descriptor_write.pBufferInfo = + &edram_storage_buffer_descriptor_buffer_info; + edram_storage_buffer_descriptor_write.pTexelBufferView = nullptr; + dfn.vkUpdateDescriptorSets(device, 1, &edram_storage_buffer_descriptor_write, + 0, nullptr); + + // TODO(Triang3l): All paths (FSI). + + // TODO(Triang3l): Handle sampledImageIntegerSampleCounts 4 not supported in + // transfers. 
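For reference, the storage buffer descriptor set written above is the one that EDRAM-accessing compute passes (such as the host depth store set up below) bind at set 0. A minimal sketch of such a bind, assuming a hypothetical command_buffer; the real code records commands through the deferred command buffer rather than calling Vulkan directly at this point:

    // Sketch: binding the EDRAM storage buffer set for a compute pass.
    dfn.vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
                                host_depth_store_pipeline_layout_,
                                /*firstSet=*/0, /*descriptorSetCount=*/1,
                                &edram_storage_buffer_descriptor_set_,
                                /*dynamicOffsetCount=*/0, nullptr);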
+ if (cvars::native_2x_msaa) { + const VkPhysicalDeviceLimits& device_limits = + provider.device_properties().limits; + // Multisampled integer sampled images are optional in Vulkan and in Xenia. + msaa_2x_attachments_supported_ = + (device_limits.framebufferColorSampleCounts & + device_limits.framebufferDepthSampleCounts & + device_limits.framebufferStencilSampleCounts & + device_limits.sampledImageColorSampleCounts & + device_limits.sampledImageDepthSampleCounts & + device_limits.sampledImageStencilSampleCounts & + VK_SAMPLE_COUNT_2_BIT) && + (device_limits.sampledImageIntegerSampleCounts & + (VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT)) != + VK_SAMPLE_COUNT_4_BIT; + msaa_2x_no_attachments_supported_ = + (device_limits.framebufferNoAttachmentsSampleCounts & + VK_SAMPLE_COUNT_2_BIT) != 0; + } else { + msaa_2x_attachments_supported_ = false; + msaa_2x_no_attachments_supported_ = false; + } + + // Host depth storing pipeline layout. + VkDescriptorSetLayout host_depth_store_descriptor_set_layouts[] = { + // Destination EDRAM storage buffer. + descriptor_set_layout_storage_buffer_, + // Source depth / stencil texture (only depth is used). + descriptor_set_layout_sampled_image_x2_, + }; + VkPushConstantRange host_depth_store_push_constant_range; + host_depth_store_push_constant_range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + host_depth_store_push_constant_range.offset = 0; + host_depth_store_push_constant_range.size = sizeof(HostDepthStoreConstants); + VkPipelineLayoutCreateInfo host_depth_store_pipeline_layout_create_info; + host_depth_store_pipeline_layout_create_info.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + host_depth_store_pipeline_layout_create_info.pNext = nullptr; + host_depth_store_pipeline_layout_create_info.flags = 0; + host_depth_store_pipeline_layout_create_info.setLayoutCount = + uint32_t(xe::countof(host_depth_store_descriptor_set_layouts)); + host_depth_store_pipeline_layout_create_info.pSetLayouts = + host_depth_store_descriptor_set_layouts; + host_depth_store_pipeline_layout_create_info.pushConstantRangeCount = 1; + host_depth_store_pipeline_layout_create_info.pPushConstantRanges = + &host_depth_store_push_constant_range; + if (dfn.vkCreatePipelineLayout( + device, &host_depth_store_pipeline_layout_create_info, nullptr, + &host_depth_store_pipeline_layout_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the host depth storing " + "pipeline layout"); + Shutdown(); + return false; + } + const std::pair host_depth_store_shaders[] = { + {shaders::host_depth_store_1xmsaa_cs, + sizeof(shaders::host_depth_store_1xmsaa_cs)}, + {shaders::host_depth_store_2xmsaa_cs, + sizeof(shaders::host_depth_store_2xmsaa_cs)}, + {shaders::host_depth_store_4xmsaa_cs, + sizeof(shaders::host_depth_store_4xmsaa_cs)}, + }; + for (size_t i = 0; i < xe::countof(host_depth_store_shaders); ++i) { + const std::pair host_depth_store_shader = + host_depth_store_shaders[i]; + VkPipeline host_depth_store_pipeline = + ui::vulkan::util::CreateComputePipeline( + provider, host_depth_store_pipeline_layout_, + host_depth_store_shader.first, host_depth_store_shader.second); + if (host_depth_store_pipeline == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the {}-sample host depth " + "storing pipeline", + uint32_t(1) << i); + Shutdown(); + return false; + } + host_depth_store_pipelines_[i] = host_depth_store_pipeline; + } + + // Transfer and clear vertex buffer, for quads of up to tile granularity. 
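The msaa_2x_attachments_supported_ and msaa_2x_no_attachments_supported_ flags computed above decide whether guest 2x MSAA gets true 2-sample images or is emulated with 4-sample ones; the render pass and render target creation code later in this file consumes them. A minimal sketch of that decision, written as a free function instead of the member accesses the real code uses:

    // Sketch: host sample count for a guest MSAA mode.
    VkSampleCountFlagBits GetHostSampleCount(xenos::MsaaSamples guest_samples,
                                             bool msaa_2x_attachments_supported) {
      if (guest_samples == xenos::MsaaSamples::k2X &&
          !msaa_2x_attachments_supported) {
        // True 2x attachments unavailable - emulate 2x as 4x.
        return VK_SAMPLE_COUNT_4_BIT;
      }
      return VkSampleCountFlagBits(uint32_t(1) << uint32_t(guest_samples));
    }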
+ transfer_vertex_buffer_pool_ = + std::make_unique( + provider, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + std::max(ui::vulkan::VulkanUploadBufferPool::kDefaultPageSize, + sizeof(float) * 2 * 6 * + Transfer::kMaxCutoutBorderRectangles * + xenos::kEdramTileCount)); + + // Transfer vertex shader. + transfer_passthrough_vertex_shader_ = ui::vulkan::util::CreateShaderModule( + provider, shaders::passthrough_position_xy_vs, + sizeof(shaders::passthrough_position_xy_vs)); + if (transfer_passthrough_vertex_shader_ == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target ownership " + "transfer vertex shader"); + Shutdown(); + return false; + } + + // Transfer pipeline layouts. + VkDescriptorSetLayout transfer_pipeline_layout_descriptor_set_layouts + [kTransferUsedDescriptorSetCount]; + VkPushConstantRange transfer_pipeline_layout_push_constant_range; + transfer_pipeline_layout_push_constant_range.stageFlags = + VK_SHADER_STAGE_FRAGMENT_BIT; + transfer_pipeline_layout_push_constant_range.offset = 0; + VkPipelineLayoutCreateInfo transfer_pipeline_layout_create_info; + transfer_pipeline_layout_create_info.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + transfer_pipeline_layout_create_info.pNext = nullptr; + transfer_pipeline_layout_create_info.flags = 0; + transfer_pipeline_layout_create_info.pSetLayouts = + transfer_pipeline_layout_descriptor_set_layouts; + transfer_pipeline_layout_create_info.pPushConstantRanges = + &transfer_pipeline_layout_push_constant_range; + for (size_t i = 0; i < size_t(TransferPipelineLayoutIndex::kCount); ++i) { + const TransferPipelineLayoutInfo& transfer_pipeline_layout_info = + kTransferPipelineLayoutInfos[i]; + transfer_pipeline_layout_create_info.setLayoutCount = 0; + uint32_t transfer_pipeline_layout_descriptor_sets_remaining = + transfer_pipeline_layout_info.used_descriptor_sets; + uint32_t transfer_pipeline_layout_descriptor_set_index; + while ( + xe::bit_scan_forward(transfer_pipeline_layout_descriptor_sets_remaining, + &transfer_pipeline_layout_descriptor_set_index)) { + transfer_pipeline_layout_descriptor_sets_remaining &= + ~(uint32_t(1) << transfer_pipeline_layout_descriptor_set_index); + VkDescriptorSetLayout transfer_pipeline_layout_descriptor_set_layout = + VK_NULL_HANDLE; + switch (TransferUsedDescriptorSet( + transfer_pipeline_layout_descriptor_set_index)) { + case kTransferUsedDescriptorSetHostDepthBuffer: + transfer_pipeline_layout_descriptor_set_layout = + descriptor_set_layout_storage_buffer_; + break; + case kTransferUsedDescriptorSetHostDepthStencilTextures: + case kTransferUsedDescriptorSetDepthStencilTextures: + transfer_pipeline_layout_descriptor_set_layout = + descriptor_set_layout_sampled_image_x2_; + break; + case kTransferUsedDescriptorSetColorTexture: + transfer_pipeline_layout_descriptor_set_layout = + descriptor_set_layout_sampled_image_; + break; + default: + assert_unhandled_case(TransferUsedDescriptorSet( + transfer_pipeline_layout_descriptor_set_index)); + } + transfer_pipeline_layout_descriptor_set_layouts + [transfer_pipeline_layout_create_info.setLayoutCount++] = + transfer_pipeline_layout_descriptor_set_layout; + } + transfer_pipeline_layout_push_constant_range.size = uint32_t( + sizeof(uint32_t) * + xe::bit_count(transfer_pipeline_layout_info.used_push_constant_dwords)); + transfer_pipeline_layout_create_info.pushConstantRangeCount = + transfer_pipeline_layout_info.used_push_constant_dwords ? 
1 : 0; + if (dfn.vkCreatePipelineLayout( + device, &transfer_pipeline_layout_create_info, nullptr, + &transfer_pipeline_layouts_[i]) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target " + "ownership transfer pipeline layout {}", + i); + Shutdown(); + return false; + } + } + InitializeCommon(); return true; } @@ -45,6 +467,36 @@ void VulkanRenderTargetCache::Shutdown(bool from_destructor) { const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); VkDevice device = provider.device(); + for (const auto& transfer_pipeline_array_pair : transfer_pipelines_) { + for (VkPipeline transfer_pipeline : transfer_pipeline_array_pair.second) { + // May be null to prevent recreation attempts. + if (transfer_pipeline != VK_NULL_HANDLE) { + dfn.vkDestroyPipeline(device, transfer_pipeline, nullptr); + } + } + } + transfer_pipelines_.clear(); + for (const auto& transfer_shader_pair : transfer_shaders_) { + if (transfer_shader_pair.second != VK_NULL_HANDLE) { + dfn.vkDestroyShaderModule(device, transfer_shader_pair.second, nullptr); + } + } + transfer_shaders_.clear(); + for (size_t i = 0; i < size_t(TransferPipelineLayoutIndex::kCount); ++i) { + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipelineLayout, device, + transfer_pipeline_layouts_[i]); + } + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyShaderModule, device, + transfer_passthrough_vertex_shader_); + transfer_vertex_buffer_pool_.reset(); + + for (size_t i = 0; i < xe::countof(host_depth_store_pipelines_); ++i) { + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipeline, device, + host_depth_store_pipelines_[i]); + } + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipelineLayout, device, + host_depth_store_pipeline_layout_); + last_update_framebuffer_ = VK_NULL_HANDLE; for (const auto& framebuffer_pair : framebuffers_) { dfn.vkDestroyFramebuffer(device, framebuffer_pair.second.framebuffer, @@ -54,10 +506,32 @@ void VulkanRenderTargetCache::Shutdown(bool from_destructor) { last_update_render_pass_ = VK_NULL_HANDLE; for (const auto& render_pass_pair : render_passes_) { - dfn.vkDestroyRenderPass(device, render_pass_pair.second, nullptr); + if (render_pass_pair.second != VK_NULL_HANDLE) { + dfn.vkDestroyRenderPass(device, render_pass_pair.second, nullptr); + } } render_passes_.clear(); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorPool, device, + edram_storage_buffer_descriptor_pool_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyBuffer, device, + edram_buffer_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkFreeMemory, device, + edram_buffer_memory_); + + descriptor_set_pool_sampled_image_x2_.reset(); + descriptor_set_pool_sampled_image_.reset(); + + ui::vulkan::util::DestroyAndNullHandle( + dfn.vkDestroyDescriptorSetLayout, device, + descriptor_set_layout_sampled_image_x2_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorSetLayout, + device, + descriptor_set_layout_sampled_image_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorSetLayout, + device, + descriptor_set_layout_storage_buffer_); + if (!from_destructor) { ShutdownCommon(); } @@ -87,6 +561,19 @@ void VulkanRenderTargetCache::ClearCache() { RenderTargetCache::ClearCache(); } +void VulkanRenderTargetCache::CompletedSubmissionUpdated() { + if (transfer_vertex_buffer_pool_) { + transfer_vertex_buffer_pool_->Reclaim( + command_processor_.GetCompletedSubmission()); + } +} + +void VulkanRenderTargetCache::EndSubmission() { + if (transfer_vertex_buffer_pool_) { + 
transfer_vertex_buffer_pool_->FlushWrites(); + } +} + bool VulkanRenderTargetCache::Update(bool is_rasterization_done, uint32_t shader_writes_color_targets) { if (!RenderTargetCache::Update(is_rasterization_done, @@ -94,9 +581,16 @@ bool VulkanRenderTargetCache::Update(bool is_rasterization_done, return false; } - auto rb_surface_info = register_file().Get(); + // TODO(Triang3l): All paths (FSI). + RenderTarget* const* depth_and_color_render_targets = last_update_accumulated_render_targets(); + + PerformTransfersAndResolveClears(1 + xenos::kMaxColorRenderTargets, + depth_and_color_render_targets, + last_update_transfers()); + + auto rb_surface_info = register_file().Get(); uint32_t render_targets_are_srgb = gamma_render_target_as_srgb_ ? last_update_accumulated_color_targets_are_gamma() @@ -104,7 +598,6 @@ bool VulkanRenderTargetCache::Update(bool is_rasterization_done, RenderPassKey render_pass_key; render_pass_key.msaa_samples = rb_surface_info.msaa_samples; - // TODO(Triang3l): 2x MSAA as 4x. if (depth_and_color_render_targets[0]) { render_pass_key.depth_and_color_used |= 1 << 0; render_pass_key.depth_format = @@ -220,9 +713,9 @@ VkRenderPass VulkanRenderTargetCache::GetRenderPass(RenderPassKey key) { samples = VK_SAMPLE_COUNT_1_BIT; break; case xenos::MsaaSamples::k2X: - // Using unconditionally because if 2x is emulated as 4x, the key will - // also contain 4x. - samples = VK_SAMPLE_COUNT_2_BIT; + samples = IsMsaa2xSupported(key.depth_and_color_used != 0) + ? VK_SAMPLE_COUNT_2_BIT + : VK_SAMPLE_COUNT_4_BIT; break; case xenos::MsaaSamples::k4X: samples = VK_SAMPLE_COUNT_4_BIT; @@ -264,7 +757,11 @@ VkRenderPass VulkanRenderTargetCache::GetRenderPass(RenderPassKey key) { color_attachment.attachment = attachment_index; VkAttachmentDescription& attachment = attachments[attachment_index]; attachment.flags = 0; - attachment.format = GetColorVulkanFormat(color_formats[i]); + xenos::ColorRenderTargetFormat color_format = color_formats[i]; + attachment.format = + key.color_rts_use_transfer_formats + ? GetColorOwnershipTransferVulkanFormat(color_format) + : GetColorVulkanFormat(color_format); attachment.samples = samples; attachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; @@ -340,7 +837,8 @@ VkRenderPass VulkanRenderTargetCache::GetRenderPass(RenderPassKey key) { VkRenderPass render_pass; if (dfn.vkCreateRenderPass(device, &render_pass_create_info, nullptr, &render_pass) != VK_SUCCESS) { - XELOGE("Failed to create a Vulkan render pass"); + XELOGE("VulkanRenderTargetCache: Failed to create a render pass"); + render_passes_.emplace(key.key, VK_NULL_HANDLE); return VK_NULL_HANDLE; } render_passes_.emplace(key.key, render_pass); @@ -419,8 +917,15 @@ VkFormat VulkanRenderTargetCache::GetColorOwnershipTransferVulkanFormat( } VulkanRenderTargetCache::VulkanRenderTarget::~VulkanRenderTarget() { - const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider_.dfn(); - VkDevice device = provider_.device(); + const ui::vulkan::VulkanProvider& provider = + render_target_cache_.command_processor_.GetVulkanProvider(); + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + ui::vulkan::SingleLayoutDescriptorSetPool& descriptor_set_pool = + key().is_depth + ? 
*render_target_cache_.descriptor_set_pool_sampled_image_x2_ + : *render_target_cache_.descriptor_set_pool_sampled_image_; + descriptor_set_pool.Free(descriptor_set_index_transfer_source_); if (view_color_transfer_separate_ != VK_NULL_HANDLE) { dfn.vkDestroyImageView(device, view_color_transfer_separate_, nullptr); } @@ -464,16 +969,20 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( image_create_info.pNext = nullptr; image_create_info.flags = 0; image_create_info.imageType = VK_IMAGE_TYPE_2D; - // TODO(Triang3l): Resolution scaling. - image_create_info.extent.width = key.GetWidth(); + image_create_info.extent.width = key.GetWidth() * resolution_scale_x_; image_create_info.extent.height = - GetRenderTargetHeight(key.pitch_tiles_at_32bpp, key.msaa_samples); + GetRenderTargetHeight(key.pitch_tiles_at_32bpp, key.msaa_samples) * + resolution_scale_y_; image_create_info.extent.depth = 1; image_create_info.mipLevels = 1; image_create_info.arrayLayers = 1; - // TODO(Triang3l): 2x MSAA as 4x. - image_create_info.samples = - VkSampleCountFlagBits(uint32_t(1) << uint32_t(key.msaa_samples)); + if (key.msaa_samples == xenos::MsaaSamples::k2X && + !msaa_2x_attachments_supported_) { + image_create_info.samples = VK_SAMPLE_COUNT_4_BIT; + } else { + image_create_info.samples = + VkSampleCountFlagBits(uint32_t(1) << uint32_t(key.msaa_samples)); + } image_create_info.tiling = VK_IMAGE_TILING_OPTIMAL; image_create_info.usage = VK_IMAGE_USAGE_SAMPLED_BIT; image_create_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; @@ -509,7 +1018,11 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( if (!ui::vulkan::util::CreateDedicatedAllocationImage( provider, image_create_info, ui::vulkan::util::MemoryPurpose::kDeviceLocal, image, memory)) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create a {}x{} {}xMSAA {} render target " + "image", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), key.GetFormatName()); return nullptr; } @@ -532,7 +1045,12 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( VkImageView view_depth_color; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_depth_color) != VK_SUCCESS) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create a {} view for a {}x{} {}xMSAA {} " + "render target", + key.is_depth ? "depth" : "color", image_create_info.extent.width, + image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), key.GetFormatName()); dfn.vkDestroyImage(device, image, nullptr); dfn.vkFreeMemory(device, memory, nullptr); return nullptr; @@ -546,7 +1064,12 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_depth_stencil) != VK_SUCCESS) { - // TODO(Triang3l): Error message. 
+ XELOGE( + "VulkanRenderTarget: Failed to create a depth / stencil view for a " + "{}x{} {}xMSAA {} render target", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), + xenos::GetDepthRenderTargetFormatName(key.GetDepthFormat())); dfn.vkDestroyImageView(device, view_depth_color, nullptr); dfn.vkDestroyImage(device, image, nullptr); dfn.vkFreeMemory(device, memory, nullptr); @@ -555,7 +1078,12 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( view_create_info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_stencil) != VK_SUCCESS) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create a stencil view for a {}x{} " + "{}xMSAA render target", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), + xenos::GetDepthRenderTargetFormatName(key.GetDepthFormat())); dfn.vkDestroyImageView(device, view_depth_stencil, nullptr); dfn.vkDestroyImageView(device, view_depth_color, nullptr); dfn.vkDestroyImage(device, image, nullptr); @@ -567,7 +1095,12 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( view_create_info.format = VK_FORMAT_R8G8B8A8_SRGB; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_srgb) != VK_SUCCESS) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create an sRGB view for a {}x{} " + "{}xMSAA render target", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), + xenos::GetColorRenderTargetFormatName(key.GetColorFormat())); dfn.vkDestroyImageView(device, view_depth_color, nullptr); dfn.vkDestroyImage(device, image, nullptr); dfn.vkFreeMemory(device, memory, nullptr); @@ -578,7 +1111,11 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( view_create_info.format = transfer_format; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_color_transfer_separate) != VK_SUCCESS) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create a transfer view for a {}x{} " + "{}xMSAA {} render target", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), key.GetFormatName()); if (view_srgb != VK_NULL_HANDLE) { dfn.vkDestroyImageView(device, view_srgb, nullptr); } @@ -590,11 +1127,170 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( } } - VkImageView view_transfer_separate = VK_NULL_HANDLE; + ui::vulkan::SingleLayoutDescriptorSetPool& descriptor_set_pool = + key.is_depth ? *descriptor_set_pool_sampled_image_x2_ + : *descriptor_set_pool_sampled_image_; + size_t descriptor_set_index_transfer_source = descriptor_set_pool.Allocate(); + if (descriptor_set_index_transfer_source == SIZE_MAX) { + XELOGE( + "VulkanRenderTargetCache: Failed to allocate sampled image descriptors " + "for a {} render target", + key.is_depth ? 
"depth/stencil" : "color"); + if (view_color_transfer_separate != VK_NULL_HANDLE) { + dfn.vkDestroyImageView(device, view_color_transfer_separate, nullptr); + } + if (view_srgb != VK_NULL_HANDLE) { + dfn.vkDestroyImageView(device, view_srgb, nullptr); + } + dfn.vkDestroyImageView(device, view_depth_color, nullptr); + dfn.vkDestroyImage(device, image, nullptr); + dfn.vkFreeMemory(device, memory, nullptr); + return nullptr; + } + VkDescriptorSet descriptor_set_transfer_source = + descriptor_set_pool.Get(descriptor_set_index_transfer_source); + VkWriteDescriptorSet descriptor_set_write[2]; + VkDescriptorImageInfo descriptor_set_write_depth_color; + descriptor_set_write_depth_color.sampler = VK_NULL_HANDLE; + descriptor_set_write_depth_color.imageView = + view_color_transfer_separate != VK_NULL_HANDLE + ? view_color_transfer_separate + : view_depth_color; + descriptor_set_write_depth_color.imageLayout = + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + descriptor_set_write[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptor_set_write[0].pNext = nullptr; + descriptor_set_write[0].dstSet = descriptor_set_transfer_source; + descriptor_set_write[0].dstBinding = 0; + descriptor_set_write[0].dstArrayElement = 0; + descriptor_set_write[0].descriptorCount = 1; + descriptor_set_write[0].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + descriptor_set_write[0].pImageInfo = &descriptor_set_write_depth_color; + descriptor_set_write[0].pBufferInfo = nullptr; + descriptor_set_write[0].pTexelBufferView = nullptr; + VkDescriptorImageInfo descriptor_set_write_stencil; + if (key.is_depth) { + descriptor_set_write_stencil.sampler = VK_NULL_HANDLE; + descriptor_set_write_stencil.imageView = view_stencil; + descriptor_set_write_stencil.imageLayout = + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + descriptor_set_write[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptor_set_write[1].pNext = nullptr; + descriptor_set_write[1].dstSet = descriptor_set_transfer_source; + descriptor_set_write[1].dstBinding = 1; + descriptor_set_write[1].dstArrayElement = 0; + descriptor_set_write[1].descriptorCount = 1; + descriptor_set_write[1].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + descriptor_set_write[1].pImageInfo = &descriptor_set_write_stencil; + descriptor_set_write[1].pBufferInfo = nullptr; + descriptor_set_write[1].pTexelBufferView = nullptr; + } + dfn.vkUpdateDescriptorSets(device, key.is_depth ? 
2 : 1, descriptor_set_write, + 0, nullptr); - return new VulkanRenderTarget(key, provider, image, memory, view_depth_color, + return new VulkanRenderTarget(key, *this, image, memory, view_depth_color, view_depth_stencil, view_stencil, view_srgb, - view_color_transfer_separate); + view_color_transfer_separate, + descriptor_set_index_transfer_source); +} + +void VulkanRenderTargetCache::GetEdramBufferUsageMasks( + EdramBufferUsage usage, VkPipelineStageFlags& stage_mask_out, + VkAccessFlags& access_mask_out) { + switch (usage) { + case EdramBufferUsage::kFragmentRead: + stage_mask_out = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + access_mask_out = VK_ACCESS_SHADER_READ_BIT; + break; + case EdramBufferUsage::kFragmentReadWrite: + stage_mask_out = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + access_mask_out = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + break; + case EdramBufferUsage::kComputeRead: + stage_mask_out = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + access_mask_out = VK_ACCESS_SHADER_READ_BIT; + break; + case EdramBufferUsage::kComputeWrite: + stage_mask_out = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + access_mask_out = VK_ACCESS_SHADER_WRITE_BIT; + break; + case EdramBufferUsage::kTransferRead: + stage_mask_out = VK_PIPELINE_STAGE_TRANSFER_BIT; + access_mask_out = VK_ACCESS_TRANSFER_READ_BIT; + break; + case EdramBufferUsage::kTransferWrite: + stage_mask_out = VK_PIPELINE_STAGE_TRANSFER_BIT; + access_mask_out = VK_ACCESS_TRANSFER_WRITE_BIT; + break; + default: + assert_unhandled_case(usage); + } +} + +void VulkanRenderTargetCache::UseEdramBuffer(EdramBufferUsage new_usage) { + if (edram_buffer_usage_ == new_usage) { + return; + } + VkPipelineStageFlags src_stage_mask, dst_stage_mask; + VkAccessFlags src_access_mask, dst_access_mask; + GetEdramBufferUsageMasks(edram_buffer_usage_, src_stage_mask, + src_access_mask); + GetEdramBufferUsageMasks(new_usage, dst_stage_mask, dst_access_mask); + if (command_processor_.PushBufferMemoryBarrier( + edram_buffer_, 0, VK_WHOLE_SIZE, src_stage_mask, dst_stage_mask, + src_access_mask, dst_access_mask)) { + // Resetting edram_buffer_modification_status_ only if the barrier has been + // truly inserted. + edram_buffer_modification_status_ = + EdramBufferModificationStatus::kUnmodified; + } + edram_buffer_usage_ = new_usage; +} + +void VulkanRenderTargetCache::MarkEdramBufferModified( + EdramBufferModificationStatus modification_status) { + assert_true(modification_status != + EdramBufferModificationStatus::kUnmodified); + switch (edram_buffer_usage_) { + case EdramBufferUsage::kFragmentReadWrite: + // max because being modified via unordered access requires stricter + // synchronization than via fragment shader interlocks. + edram_buffer_modification_status_ = + std::max(edram_buffer_modification_status_, modification_status); + break; + case EdramBufferUsage::kComputeWrite: + assert_true(modification_status == + EdramBufferModificationStatus::kViaUnordered); + modification_status = EdramBufferModificationStatus::kViaUnordered; + break; + default: + assert_always( + "While changing the usage of the EDRAM buffer before marking it as " + "modified is handled safely (but will cause spurious marking as " + "modified after the changes have been implicitly committed by the " + "usage switch), normally that shouldn't be done and is an " + "indication of architectural mistakes. 
Alternatively, this may " + "indicate that the usage switch has been forgotten before writing, " + "which is a clearly invalid situation."); + } +} + +void VulkanRenderTargetCache::CommitEdramBufferShaderWrites( + EdramBufferModificationStatus commit_status) { + assert_true(commit_status != EdramBufferModificationStatus::kUnmodified); + if (edram_buffer_modification_status_ < commit_status) { + return; + } + VkPipelineStageFlags stage_mask; + VkAccessFlags access_mask; + GetEdramBufferUsageMasks(edram_buffer_usage_, stage_mask, access_mask); + assert_not_zero(access_mask & VK_ACCESS_SHADER_WRITE_BIT); + command_processor_.PushBufferMemoryBarrier( + edram_buffer_, 0, VK_WHOLE_SIZE, stage_mask, stage_mask, access_mask, + access_mask, VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED, false); + edram_buffer_modification_status_ = + EdramBufferModificationStatus::kUnmodified; + PixelShaderInterlockFullEdramBarrierPlaced(); } const VulkanRenderTargetCache::Framebuffer* @@ -646,8 +1342,15 @@ VulkanRenderTargetCache::GetFramebuffer( depth_and_color_rts_remaining &= ~(uint32_t(1) << rt_index); const auto& vulkan_rt = *static_cast( depth_and_color_render_targets[rt_index]); - attachments[attachment_count++] = rt_index ? vulkan_rt.view_depth_color() - : vulkan_rt.view_depth_stencil(); + VkImageView attachment; + if (rt_index) { + attachment = render_pass_key.color_rts_use_transfer_formats + ? vulkan_rt.view_color_transfer() + : vulkan_rt.view_depth_color(); + } else { + attachment = vulkan_rt.view_depth_stencil(); + } + attachments[attachment_count++] = attachment; } VkFramebufferCreateInfo framebuffer_create_info; @@ -684,6 +1387,3491 @@ VulkanRenderTargetCache::GetFramebuffer( .first->second; } +VkShaderModule VulkanRenderTargetCache::GetTransferShader( + TransferShaderKey key) { + auto shader_it = transfer_shaders_.find(key); + if (shader_it != transfer_shaders_.end()) { + return shader_it->second; + } + + const ui::vulkan::VulkanProvider& provider = + command_processor_.GetVulkanProvider(); + const VkPhysicalDeviceFeatures& device_features = provider.device_features(); + + std::vector id_vector_temp; + std::vector uint_vector_temp; + + spv::Builder builder(spv::Spv_1_0, + (SpirvShaderTranslator::kSpirvMagicToolId << 16) | 1, + nullptr); + spv::Id ext_inst_glsl_std_450 = builder.import("GLSL.std.450"); + builder.addCapability(spv::CapabilityShader); + builder.setMemoryModel(spv::AddressingModelLogical, spv::MemoryModelGLSL450); + builder.setSource(spv::SourceLanguageUnknown, 0); + + spv::Id type_void = builder.makeVoidType(); + spv::Id type_bool = builder.makeBoolType(); + spv::Id type_int = builder.makeIntType(32); + spv::Id type_int2 = builder.makeVectorType(type_int, 2); + spv::Id type_uint = builder.makeUintType(32); + spv::Id type_uint2 = builder.makeVectorType(type_uint, 2); + spv::Id type_uint4 = builder.makeVectorType(type_uint, 4); + spv::Id type_float = builder.makeFloatType(32); + spv::Id type_float2 = builder.makeVectorType(type_float, 2); + spv::Id type_float4 = builder.makeVectorType(type_float, 4); + + const TransferModeInfo& mode = kTransferModes[size_t(key.mode)]; + const TransferPipelineLayoutInfo& pipeline_layout_info = + kTransferPipelineLayoutInfos[size_t(mode.pipeline_layout)]; + + // If not dest_is_color, it's depth, or stencil bit - 40-sample columns are + // swapped as opposed to color source. 
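The 40-sample-column swap mentioned above reflects how the guest addresses EDRAM: each tile holds 80x16 samples, and depth/stencil tiles store their left and right 40-sample-wide halves exchanged relative to color tiles, so transfers between depth/stencil and color data have to account for the swapped halves when computing addresses. A conceptual sketch of the swap within one tile, assuming 1x resolution scaling (the helper name is hypothetical; the generated SPIR-V expresses the equivalent with builder instructions):

    // Sketch: exchanging the two 40-sample-wide halves of an 80x16 EDRAM tile.
    uint32_t SwapTileHalvesSampleX(uint32_t sample_x_in_tile) {  // 0...79
      return (sample_x_in_tile + 40) % 80;
    }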
+ bool dest_is_color = (mode.output == TransferOutput::kColor); + xenos::ColorRenderTargetFormat dest_color_format = + xenos::ColorRenderTargetFormat(key.dest_resource_format); + xenos::DepthRenderTargetFormat dest_depth_format = + xenos::DepthRenderTargetFormat(key.dest_resource_format); + bool dest_is_64bpp = + dest_is_color && xenos::IsColorRenderTargetFormat64bpp(dest_color_format); + + xenos::ColorRenderTargetFormat source_color_format = + xenos::ColorRenderTargetFormat(key.source_resource_format); + xenos::DepthRenderTargetFormat source_depth_format = + xenos::DepthRenderTargetFormat(key.source_resource_format); + // If not source_is_color, it's depth / stencil - 40-sample columns are + // swapped as opposed to color destination. + bool source_is_color = (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetColorTextureBit) != 0; + bool source_is_64bpp; + uint32_t source_color_format_component_count; + uint32_t source_color_texture_component_mask; + bool source_color_is_uint; + spv::Id source_color_component_type; + if (source_is_color) { + assert_zero(pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetDepthStencilTexturesBit); + source_is_64bpp = + xenos::IsColorRenderTargetFormat64bpp(source_color_format); + source_color_format_component_count = + xenos::GetColorRenderTargetFormatComponentCount(source_color_format); + if (mode.output == TransferOutput::kStencilBit) { + if (source_is_64bpp && !dest_is_64bpp) { + // Need one component, but choosing from the two 32bpp halves of the + // 64bpp sample. + source_color_texture_component_mask = + 0b1 | (0b1 << (source_color_format_component_count >> 1)); + } else { + // Red is at least 8 bits per component in all formats. + source_color_texture_component_mask = 0b1; + } + } else { + source_color_texture_component_mask = + (uint32_t(1) << source_color_format_component_count) - 1; + } + GetColorOwnershipTransferVulkanFormat(source_color_format, + &source_color_is_uint); + source_color_component_type = source_color_is_uint ? type_uint : type_float; + } else { + source_is_64bpp = false; + source_color_format_component_count = 0; + source_color_texture_component_mask = 0; + source_color_is_uint = false; + source_color_component_type = spv::NoType; + } + + std::vector main_interface; + + // Outputs. + bool shader_uses_stencil_reference_output = + mode.output == TransferOutput::kDepth && + provider.device_extensions().ext_shader_stencil_export; + bool dest_color_is_uint = false; + uint32_t dest_color_component_count = 0; + spv::Id type_fragment_data_component = spv::NoResult; + spv::Id type_fragment_data = spv::NoResult; + spv::Id output_fragment_data = spv::NoResult; + spv::Id output_fragment_depth = spv::NoResult; + spv::Id output_fragment_stencil_ref = spv::NoResult; + switch (mode.output) { + case TransferOutput::kColor: + GetColorOwnershipTransferVulkanFormat(dest_color_format, + &dest_color_is_uint); + dest_color_component_count = + xenos::GetColorRenderTargetFormatComponentCount(dest_color_format); + type_fragment_data_component = + dest_color_is_uint ? type_uint : type_float; + type_fragment_data = + dest_color_component_count > 1 + ? 
builder.makeVectorType(type_fragment_data_component, + dest_color_component_count) + : type_fragment_data_component; + output_fragment_data = builder.createVariable( + spv::NoPrecision, spv::StorageClassOutput, type_fragment_data, + "xe_transfer_fragment_data"); + builder.addDecoration(output_fragment_data, spv::DecorationLocation, + key.dest_color_rt_index); + main_interface.push_back(output_fragment_data); + break; + case TransferOutput::kDepth: + output_fragment_depth = + builder.createVariable(spv::NoPrecision, spv::StorageClassOutput, + type_float, "gl_FragDepth"); + builder.addDecoration(output_fragment_depth, spv::DecorationBuiltIn, + spv::BuiltInFragDepth); + main_interface.push_back(output_fragment_depth); + if (shader_uses_stencil_reference_output) { + builder.addExtension("SPV_EXT_shader_stencil_export"); + builder.addCapability(spv::CapabilityStencilExportEXT); + output_fragment_stencil_ref = + builder.createVariable(spv::NoPrecision, spv::StorageClassOutput, + type_int, "gl_FragStencilRefARB"); + builder.addDecoration(output_fragment_stencil_ref, + spv::DecorationBuiltIn, + spv::BuiltInFragStencilRefEXT); + main_interface.push_back(output_fragment_stencil_ref); + } + break; + default: + break; + } + + // Bindings. + // Generating SPIR-V 1.0, no need to add bindings to the entry point's + // interface until SPIR-V 1.4. + // Color source. + bool source_is_multisampled = + key.source_msaa_samples != xenos::MsaaSamples::k1X; + spv::Id source_color_texture = spv::NoResult; + if (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetColorTextureBit) { + source_color_texture = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniformConstant, + builder.makeImageType(source_color_component_type, spv::Dim2D, false, + false, source_is_multisampled, 1, + spv::ImageFormatUnknown), + "xe_transfer_color"); + builder.addDecoration( + source_color_texture, spv::DecorationDescriptorSet, + xe::bit_count(pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetColorTextureBit - 1))); + builder.addDecoration(source_color_texture, spv::DecorationBinding, 0); + } + // Depth / stencil source. + spv::Id source_depth_texture = spv::NoResult; + spv::Id source_stencil_texture = spv::NoResult; + if (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetDepthStencilTexturesBit) { + uint32_t source_depth_stencil_descriptor_set = + xe::bit_count(pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetDepthStencilTexturesBit - 1)); + // Using `depth == false` in makeImageType because comparisons are not + // required, and other values of `depth` are causing issues in drivers. 
+ // https://github.com/microsoft/DirectXShaderCompiler/issues/1107 + if (mode.output != TransferOutput::kStencilBit) { + source_depth_texture = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniformConstant, + builder.makeImageType(type_float, spv::Dim2D, false, false, + source_is_multisampled, 1, + spv::ImageFormatUnknown), + "xe_transfer_depth"); + builder.addDecoration(source_depth_texture, spv::DecorationDescriptorSet, + source_depth_stencil_descriptor_set); + builder.addDecoration(source_depth_texture, spv::DecorationBinding, 0); + } + if (mode.output != TransferOutput::kDepth || + shader_uses_stencil_reference_output) { + source_stencil_texture = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniformConstant, + builder.makeImageType(type_uint, spv::Dim2D, false, false, + source_is_multisampled, 1, + spv::ImageFormatUnknown), + "xe_transfer_stencil"); + builder.addDecoration(source_stencil_texture, + spv::DecorationDescriptorSet, + source_depth_stencil_descriptor_set); + builder.addDecoration(source_stencil_texture, spv::DecorationBinding, 1); + } + } + // Host depth source buffer. + spv::Id host_depth_source_buffer = spv::NoResult; + if (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetHostDepthBufferBit) { + id_vector_temp.clear(); + id_vector_temp.push_back(builder.makeRuntimeArray(type_uint)); + // Storage buffers have std430 packing, no padding to 4-component vectors. + builder.addDecoration(id_vector_temp.back(), spv::DecorationArrayStride, + sizeof(float)); + spv::Id type_host_depth_source_buffer = + builder.makeStructType(id_vector_temp, "XeTransferHostDepthBuffer"); + builder.addMemberName(type_host_depth_source_buffer, 0, "host_depth"); + builder.addMemberDecoration(type_host_depth_source_buffer, 0, + spv::DecorationNonWritable); + builder.addMemberDecoration(type_host_depth_source_buffer, 0, + spv::DecorationOffset, 0); + // Block since SPIR-V 1.3, but since SPIR-V 1.0 is generated, it's + // BufferBlock. + builder.addDecoration(type_host_depth_source_buffer, + spv::DecorationBufferBlock); + // StorageBuffer since SPIR-V 1.3, but since SPIR-V 1.0 is generated, it's + // Uniform. + host_depth_source_buffer = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniform, + type_host_depth_source_buffer, "xe_transfer_host_depth_buffer"); + builder.addDecoration( + host_depth_source_buffer, spv::DecorationDescriptorSet, + xe::bit_count(pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetHostDepthBufferBit - 1))); + builder.addDecoration(host_depth_source_buffer, spv::DecorationBinding, 0); + } + // Host depth source texture (the depth / stencil descriptor set is reused, + // but stencil is not needed). + spv::Id host_depth_source_texture = spv::NoResult; + if (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetHostDepthStencilTexturesBit) { + host_depth_source_texture = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniformConstant, + builder.makeImageType( + type_float, spv::Dim2D, false, false, + key.host_depth_source_msaa_samples != xenos::MsaaSamples::k1X, 1, + spv::ImageFormatUnknown), + "xe_transfer_host_depth"); + builder.addDecoration( + host_depth_source_texture, spv::DecorationDescriptorSet, + xe::bit_count( + pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetHostDepthStencilTexturesBit - 1))); + builder.addDecoration(host_depth_source_texture, spv::DecorationBinding, 0); + } + // Push constants. 
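The push constant block assembled below is tightly packed: a member is present only when its bit is set in used_push_constant_dwords, and its byte offset is the count of lower set bits times the dword size, the same popcount rule used above for descriptor set indices. A minimal sketch of that rule, with a hypothetical helper name:

    // Sketch: byte offset of a push constant dword within the packed block.
    uint32_t GetTransferPushConstantOffset(uint32_t used_push_constant_dwords,
                                           uint32_t member_dword_bit) {
      return uint32_t(sizeof(uint32_t)) *
             xe::bit_count(used_push_constant_dwords & (member_dword_bit - 1));
    }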
+ id_vector_temp.clear(); + uint32_t push_constants_member_host_depth_address = UINT32_MAX; + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordHostDepthAddressBit) { + push_constants_member_host_depth_address = uint32_t(id_vector_temp.size()); + id_vector_temp.push_back(type_uint); + } + uint32_t push_constants_member_address = UINT32_MAX; + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordAddressBit) { + push_constants_member_address = uint32_t(id_vector_temp.size()); + id_vector_temp.push_back(type_uint); + } + uint32_t push_constants_member_stencil_mask = UINT32_MAX; + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordStencilMaskBit) { + push_constants_member_stencil_mask = uint32_t(id_vector_temp.size()); + id_vector_temp.push_back(type_uint); + } + spv::Id push_constants = spv::NoResult; + if (!id_vector_temp.empty()) { + spv::Id type_push_constants = + builder.makeStructType(id_vector_temp, "XeTransferPushConstants"); + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordHostDepthAddressBit) { + assert_true(push_constants_member_host_depth_address != UINT32_MAX); + builder.addMemberName(type_push_constants, + push_constants_member_host_depth_address, + "host_depth_address"); + builder.addMemberDecoration( + type_push_constants, push_constants_member_host_depth_address, + spv::DecorationOffset, + sizeof(uint32_t) * + xe::bit_count( + pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordHostDepthAddressBit - 1))); + } + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordAddressBit) { + assert_true(push_constants_member_address != UINT32_MAX); + builder.addMemberName(type_push_constants, push_constants_member_address, + "address"); + builder.addMemberDecoration( + type_push_constants, push_constants_member_address, + spv::DecorationOffset, + sizeof(uint32_t) * + xe::bit_count(pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordAddressBit - 1))); + } + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordStencilMaskBit) { + assert_true(push_constants_member_stencil_mask != UINT32_MAX); + builder.addMemberName(type_push_constants, + push_constants_member_stencil_mask, "stencil_mask"); + builder.addMemberDecoration( + type_push_constants, push_constants_member_stencil_mask, + spv::DecorationOffset, + sizeof(uint32_t) * + xe::bit_count( + pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordStencilMaskBit - 1))); + } + builder.addDecoration(type_push_constants, spv::DecorationBlock); + push_constants = builder.createVariable( + spv::NoPrecision, spv::StorageClassPushConstant, type_push_constants, + "xe_transfer_push_constants"); + } + + // Coordinate inputs. + spv::Id input_fragment_coord = builder.createVariable( + spv::NoPrecision, spv::StorageClassInput, type_float4, "gl_FragCoord"); + builder.addDecoration(input_fragment_coord, spv::DecorationBuiltIn, + spv::BuiltInFragCoord); + main_interface.push_back(input_fragment_coord); + spv::Id input_sample_id = spv::NoResult; + spv::Id spec_const_sample_id = spv::NoResult; + if (key.dest_msaa_samples != xenos::MsaaSamples::k1X) { + if (device_features.sampleRateShading) { + // One draw for all samples. 
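The branch opened just below sets up one of two ways of covering every destination sample: with sampleRateShading, a single draw runs the fragment shader once per sample and reads gl_SampleID; without it, one draw is issued per sample, the sample is selected through the pipeline's sample mask, and the shader learns its index from specialization constant 0 (xe_transfer_sample_id). A minimal sketch of filling that specialization constant on the host side, with hypothetical local names:

    // Sketch: specializing xe_transfer_sample_id (constant ID 0) for the
    // one-draw-per-sample fallback.
    uint32_t sample_id = 2;  // The sample this draw covers.
    VkSpecializationMapEntry sample_id_map_entry = {};
    sample_id_map_entry.constantID = 0;
    sample_id_map_entry.offset = 0;
    sample_id_map_entry.size = sizeof(uint32_t);
    VkSpecializationInfo sample_id_specialization = {};
    sample_id_specialization.mapEntryCount = 1;
    sample_id_specialization.pMapEntries = &sample_id_map_entry;
    sample_id_specialization.dataSize = sizeof(sample_id);
    sample_id_specialization.pData = &sample_id;
    // Passed via VkPipelineShaderStageCreateInfo::pSpecializationInfo when
    // creating the transfer pipeline for that sample.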
+ builder.addCapability(spv::CapabilitySampleRateShading); + input_sample_id = builder.createVariable( + spv::NoPrecision, spv::StorageClassInput, type_int, "gl_SampleID"); + builder.addDecoration(input_sample_id, spv::DecorationFlat); + builder.addDecoration(input_sample_id, spv::DecorationBuiltIn, + spv::BuiltInSampleId); + main_interface.push_back(input_sample_id); + } else { + // One sample per draw, with different sample masks. + spec_const_sample_id = builder.makeUintConstant(0, true); + builder.addName(spec_const_sample_id, "xe_transfer_sample_id"); + builder.addDecoration(spec_const_sample_id, spv::DecorationSpecId, 0); + } + } + + // Begin the main function. + std::vector main_param_types; + std::vector> main_precisions; + spv::Block* main_entry; + spv::Function* main_function = + builder.makeFunctionEntry(spv::NoPrecision, type_void, "main", + main_param_types, main_precisions, &main_entry); + + // Working with unsigned numbers for simplicity now, bitcasting to signed will + // be done at texture fetch. + + uint32_t tile_width_samples_scaled = + xenos::kEdramTileWidthSamples * resolution_scale_x_; + uint32_t tile_height_samples_scaled = + xenos::kEdramTileHeightSamples * resolution_scale_y_; + + // Convert the fragment coordinates to uint2. + uint_vector_temp.clear(); + uint_vector_temp.reserve(2); + uint_vector_temp.push_back(0); + uint_vector_temp.push_back(1); + spv::Id dest_pixel_coord = builder.createUnaryOp( + spv::OpConvertFToU, type_uint2, + builder.createRvalueSwizzle( + spv::NoPrecision, type_float2, + builder.createLoad(input_fragment_coord, spv::NoPrecision), + uint_vector_temp)); + + // Prove to the AMD compiler that 24*24 multiplication can be done. 16 bits + // are more than enough for coordinates even with 3x resolution scaling (and + // Direct3D 11 hardware has 16.8 fixed-point coordinates). + // TODO(Triang3l): OpUnreachable if the coordinates have upper bits set. + + // Split the destination pixel coordinate into scalars. + spv::Id dest_pixel_x = + builder.createCompositeExtract(dest_pixel_coord, type_uint, 0); + spv::Id dest_pixel_y = + builder.createCompositeExtract(dest_pixel_coord, type_uint, 1); + + // Split the destination pixel index into 32bpp tile and 32bpp-tile-relative + // pixel index. + uint32_t dest_sample_width_log2 = + uint32_t(dest_is_64bpp) + + uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X); + uint32_t dest_sample_height_log2 = + uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X); + uint32_t dest_tile_width_divide_scale, dest_tile_width_divide_shift; + draw_util::GetEdramTileWidthDivideScaleAndUpperShift( + resolution_scale_x_, dest_tile_width_divide_scale, + dest_tile_width_divide_shift); + // Doing 16*16=32 multiplication, not 32*32=64. + // TODO(Triang3l): Abstract this away, don't do 32*32 on Direct3D 12 too. + dest_tile_width_divide_scale &= UINT16_MAX; + dest_tile_width_divide_shift += 16; + // Need the host tile size in pixels, not samples. 
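The scale and shift requested above implement division by the scaled tile width (80 samples at 1x scaling) as a 16x16-bit multiply followed by a shift, matching the comment about avoiding a 32x32=64 multiplication; the adjustment from samples to pixels continues right below. A worked instance under the assumption of 1x resolution scaling, where the divisor is 80 (the exact scale and shift come from GetEdramTileWidthDivideScaleAndUpperShift, so treat these constants as illustrative):

    // Sketch: floor(x / 80) as a multiply and shift, exact for x below 2^18
    // (80 * 52429 == 2^22 + 16, so the error term never reaches one).
    uint32_t DivideByTileWidth80(uint32_t x) {
      return (x * 52429u) >> 22;
    }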
+ dest_tile_width_divide_shift -= dest_sample_width_log2; + spv::Id dest_tile_index_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, dest_pixel_x, + builder.makeUintConstant(dest_tile_width_divide_scale)), + builder.makeUintConstant(dest_tile_width_divide_shift)); + spv::Id dest_tile_pixel_x = builder.createBinOp( + spv::OpISub, type_uint, dest_pixel_x, + builder.createBinOp(spv::OpIMul, type_uint, dest_tile_index_x, + builder.makeUintConstant(tile_width_samples_scaled >> + dest_sample_width_log2))); + spv::Id dest_tile_index_y, dest_tile_pixel_y; + if (resolution_scale_y_ == 3) { + dest_tile_index_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, dest_pixel_y, + builder.makeUintConstant(draw_util::kDivideScale3 & UINT16_MAX)), + builder.makeUintConstant(draw_util::kDivideUpperShift3 + 16 + 4 - + dest_sample_height_log2)); + dest_tile_pixel_y = builder.createBinOp( + spv::OpISub, type_uint, dest_pixel_y, + builder.createBinOp( + spv::OpIMul, type_uint, dest_tile_index_y, + builder.makeUintConstant(tile_height_samples_scaled >> + dest_sample_height_log2))); + } else { + assert_true(resolution_scale_y_ <= 2); + uint32_t dest_tile_height_pixels_log2 = + (resolution_scale_y_ == 2 ? 5 : 4) - dest_sample_height_log2; + dest_tile_index_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_pixel_y, + builder.makeUintConstant(dest_tile_height_pixels_log2)); + dest_tile_pixel_y = builder.createBinOp( + spv::OpBitwiseAnd, type_uint, dest_pixel_y, + builder.makeUintConstant((uint32_t(1) << dest_tile_height_pixels_log2) - + 1)); + } + + assert_true(push_constants_member_address != UINT32_MAX); + id_vector_temp.clear(); + id_vector_temp.push_back( + builder.makeIntConstant(int32_t(push_constants_member_address))); + spv::Id address_constant = builder.createLoad( + builder.createAccessChain(spv::StorageClassPushConstant, push_constants, + id_vector_temp), + spv::NoPrecision); + + // Calculate the 32bpp tile index from its X and Y parts. + spv::Id dest_tile_index = builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, address_constant, + builder.makeUintConstant(0), + builder.makeUintConstant(xenos::kEdramPitchTilesBits)), + dest_tile_index_y), + dest_tile_index_x); + + // Load the destination sample index. + spv::Id dest_sample_id = spv::NoResult; + if (key.dest_msaa_samples != xenos::MsaaSamples::k1X) { + if (device_features.sampleRateShading) { + assert_true(input_sample_id != spv::NoResult); + dest_sample_id = builder.createUnaryOp( + spv::OpBitcast, type_uint, + builder.createLoad(input_sample_id, spv::NoPrecision)); + } else { + assert_true(spec_const_sample_id != spv::NoResult); + // Already uint. + dest_sample_id = spec_const_sample_id; + } + } + + // Transform the destination framebuffer pixel and sample coordinates into the + // source texture pixel and sample coordinates. + + // First sample bit at 4x with Vulkan standard locations - horizontal sample. + // Second sample bit at 4x with Vulkan standard locations - vertical sample. + // At 2x: + // - Native 2x: top is 1 in Vulkan, bottom is 0. + // - 2x as 4x: top is 0, bottom is 3. 
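Spelled out, the 2x sample placement described above gives the following mapping from the top/bottom guest sample positions to host sample indices, which the remapping logic below relies on; a sketch with a hypothetical helper name:

    // Sketch: host sample index for the top / bottom 2x sample positions.
    uint32_t HostSampleIndex2x(bool is_bottom, bool native_2x_supported) {
      // Native 2x (Vulkan standard locations): top -> 1, bottom -> 0.
      // 2x emulated as 4x: top -> 0, bottom -> 3.
      return native_2x_supported ? (is_bottom ? 0u : 1u) : (is_bottom ? 3u : 0u);
    }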
+ + spv::Id source_sample_id = dest_sample_id; + spv::Id source_tile_pixel_x = dest_tile_pixel_x; + spv::Id source_tile_pixel_y = dest_tile_pixel_y; + spv::Id source_color_half = spv::NoResult; + if (!source_is_64bpp && dest_is_64bpp) { + // 32bpp -> 64bpp, need two samples of the source. + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + // 32bpp -> 64bpp, 4x ->. + // Source has 32bpp halves in two adjacent samples. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 32bpp -> 64bpp, 4x -> 4x. + // 1 destination horizontal sample = 2 source horizontal samples. + // D p0,0 s0,0 = S p0,0 s0,0 | S p0,0 s1,0 + // D p0,0 s1,0 = S p1,0 s0,0 | S p1,0 s1,0 + // D p0,0 s0,1 = S p0,0 s0,1 | S p0,0 s1,1 + // D p0,0 s1,1 = S p1,0 s0,1 | S p1,0 s1,1 + // Thus destination horizontal sample -> source horizontal pixel, + // vertical samples are 1:1. + source_sample_id = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_sample_id, + builder.makeUintConstant(1 << 1)); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_x = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + } else if (key.dest_msaa_samples == xenos::MsaaSamples::k2X) { + // 32bpp -> 64bpp, 4x -> 2x. + // 1 destination horizontal pixel = 2 source horizontal samples. + // D p0,0 s0 = S p0,0 s0,0 | S p0,0 s1,0 + // D p0,0 s1 = S p0,0 s0,1 | S p0,0 s1,1 + // D p1,0 s0 = S p1,0 s0,0 | S p1,0 s1,0 + // D p1,0 s1 = S p1,0 s0,1 | S p1,0 s1,1 + // Pixel index can be reused. Sample 1 (for native 2x) or 0 (for 2x as + // 4x) should become samples 01, sample 0 or 3 should become samples 23. + if (msaa_2x_attachments_supported_) { + source_sample_id = builder.createBinOp( + spv::OpShiftLeftLogical, type_uint, + builder.createBinOp(spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1)), + builder.makeUintConstant(1)); + } else { + source_sample_id = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_sample_id, + builder.makeUintConstant(1 << 1)); + } + } else { + // 32bpp -> 64bpp, 4x -> 1x. + // 1 destination horizontal pixel = 2 source horizontal samples. + // D p0,0 = S p0,0 s0,0 | S p0,0 s1,0 + // D p0,1 = S p0,0 s0,1 | S p0,0 s1,1 + // Horizontal pixel index can be reused. Vertical pixel 1 should + // become sample 2. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + source_tile_pixel_y = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_y, builder.makeUintConstant(1)); + } + } else { + // 32bpp -> 64bpp, 1x/2x ->. + // Source has 32bpp halves in two adjacent pixels. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 32bpp -> 64bpp, 1x/2x -> 4x. + // The X part. + // 1 destination horizontal sample = 2 source horizontal pixels. 
+ id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpShiftLeftLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(2))); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_tile_pixel_x = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + // Y is handled by common code. + } else { + // 32bpp -> 64bpp, 1x/2x -> 1x/2x. + // The X part. + // 1 destination horizontal pixel = 2 source horizontal pixels. + source_tile_pixel_x = + builder.createBinOp(spv::OpShiftLeftLogical, type_uint, + dest_tile_pixel_x, builder.makeUintConstant(1)); + // Y is handled by common code. + } + } + } else if (source_is_64bpp && !dest_is_64bpp) { + // 64bpp -> 32bpp, also the half to load. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 64bpp -> 32bpp, -> 4x. + // The needed half is in the destination horizontal sample index. + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + // 64bpp -> 32bpp, 4x -> 4x. + // D p0,0 s0,0 = S s0,0 low + // D p0,0 s1,0 = S s0,0 high + // D p1,0 s0,0 = S s1,0 low + // D p1,0 s1,0 = S s1,0 high + // Vertical pixel and sample (second bit) addressing is the same. + // However, 1 horizontal destination pixel = 1 horizontal source sample. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + // 2 destination horizontal samples = 1 source horizontal sample, thus + // 2 destination horizontal pixels = 1 source horizontal pixel. + source_tile_pixel_x = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_x, builder.makeUintConstant(1)); + } else { + // 64bpp -> 32bpp, 1x/2x -> 4x. + // 2 destination horizontal samples = 1 source horizontal pixel, thus + // 1 destination horizontal pixel = 1 source horizontal pixel. Can reuse + // horizontal pixel index. + // Y is handled by common code. + } + // Half from the destination horizontal sample index. + source_color_half = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_sample_id, + builder.makeUintConstant(1)); + } else { + // 64bpp -> 32bpp, -> 1x/2x. + // The needed half is in the destination horizontal pixel index. + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + // 64bpp -> 32bpp, 4x -> 1x/2x. + // (Destination horizontal pixel >> 1) & 1 = source horizontal sample + // (first bit). + source_sample_id = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1), builder.makeUintConstant(1)); + if (key.dest_msaa_samples == xenos::MsaaSamples::k2X) { + // 64bpp -> 32bpp, 4x -> 2x. + // Destination vertical samples (1/0 in the first bit for native 2x or + // 0/1 in the second bit for 2x as 4x) = source vertical samples + // (second bit). 
+ if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + } else { + // 64bpp -> 32bpp, 4x -> 1x. + // 1 destination vertical pixel = 1 source vertical sample. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(source_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + source_tile_pixel_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + } + // 2 destination horizontal pixels = 1 source horizontal sample. + // 4 destination horizontal pixels = 1 source horizontal pixel. + source_tile_pixel_x = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_x, builder.makeUintConstant(2)); + } else { + // 64bpp -> 32bpp, 1x/2x -> 1x/2x. + // The X part. + // 2 destination horizontal pixels = 1 destination source pixel. + source_tile_pixel_x = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_x, builder.makeUintConstant(1)); + // Y is handled by common code. + } + // Half from the destination horizontal pixel index. + source_color_half = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + } + assert_true(source_color_half != spv::NoResult); + } else { + // Same bit count. + if (key.source_msaa_samples != key.dest_msaa_samples) { + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + // Same BPP, 4x -> 1x/2x. + if (key.dest_msaa_samples == xenos::MsaaSamples::k2X) { + // Same BPP, 4x -> 2x. + // Horizontal pixels to samples. Vertical sample (1/0 in the first bit + // for native 2x or 0/1 in the second bit for 2x as 4x) to second + // sample bit. 
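A worked sketch (illustrative, not part of the patch) of the complete 64bpp 4x -> 32bpp 1x addressing above: eight destination pixels (4 wide, 2 tall) map onto the four samples and the two 32bpp halves of one 64bpp source pixel:

#include <cstdint>

struct Source64bppAddress {
  uint32_t pixel_x, pixel_y;  // Source pixel within the tile.
  uint32_t sample;            // Source 4x sample index.
  uint32_t half;              // 0 = first, 1 = second 32bpp half of the texel.
};

Source64bppAddress Map64bpp4xTo32bpp1x(uint32_t dest_pixel_x,
                                       uint32_t dest_pixel_y) {
  Source64bppAddress src;
  src.half = dest_pixel_x & 1u;
  src.sample = ((dest_pixel_x >> 1) & 1u) | ((dest_pixel_y & 1u) << 1);
  src.pixel_x = dest_pixel_x >> 2;
  src.pixel_y = dest_pixel_y >> 1;
  return src;
}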
+ if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + source_tile_pixel_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + } else { + // Same BPP, 4x -> 1x. + // Pixels to samples. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseAnd, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + source_tile_pixel_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + source_tile_pixel_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + } + } else { + // Same BPP, 1x/2x -> 1x/2x/4x (as long as they're different). + // Only the X part - Y is handled by common code. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // Horizontal samples to pixels. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_x = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + } + } + } + // Common source Y and sample index for 1x/2x AA sources, independent of bits + // per sample. + if (key.source_msaa_samples < xenos::MsaaSamples::k4X && + key.source_msaa_samples != key.dest_msaa_samples) { + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 1x/2x -> 4x. + if (key.source_msaa_samples == xenos::MsaaSamples::k2X) { + // 2x -> 4x. + // Vertical samples (second bit) of 4x destination to vertical sample + // (1, 0 for native 2x, or 0, 3 for 2x as 4x) of 2x source. + source_sample_id = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_sample_id, builder.makeUintConstant(1)); + if (msaa_2x_attachments_supported_) { + source_sample_id = builder.createBinOp(spv::OpBitwiseXor, type_uint, + source_sample_id, + builder.makeUintConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } else { + // 1x -> 4x. 
+ // Vertical samples (second bit) to Y pixels. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back( + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_sample_id, builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_y = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } else { + // 1x/2x -> different 1x/2x. + if (key.source_msaa_samples == xenos::MsaaSamples::k2X) { + // 2x -> 1x. + // Vertical pixels of 2x destination to vertical samples (1, 0 for + // native 2x, or 0, 3 for 2x as 4x) of 1x source. + source_sample_id = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + if (msaa_2x_attachments_supported_) { + source_sample_id = builder.createBinOp(spv::OpBitwiseXor, type_uint, + source_sample_id, + builder.makeUintConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + source_tile_pixel_y = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_y, builder.makeUintConstant(1)); + } else { + // 1x -> 2x. + // Vertical samples (1/0 in the first bit for native 2x or 0/1 in the + // second bit for 2x as 4x) of 2x destination to vertical pixels of 1x + // source. + if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back( + builder.createBinOp(spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_y = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back( + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_sample_id, builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_y = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + } + } + } + + uint32_t source_pixel_width_dwords_log2 = + uint32_t(key.source_msaa_samples >= xenos::MsaaSamples::k4X) + + uint32_t(source_is_64bpp); + + if (source_is_color != dest_is_color) { + // Copying between color and depth / stencil - swap 40-32bpp-sample columns + // in the pixel index within the source 32bpp tile. 
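These mappings keep recurring because of the two ways 2x MSAA is represented on the host: with native 2x attachments the guest's two samples live in host samples 1 and 0, and with 2x emulated as 4x they live in host samples 0 and 3. A small helper equivalent to the repeated XOR / bit-field-insert patterns above (a sketch only; msaa_2x_attachments_supported_ is the only identifier taken from the patch):

#include <cstdint>

// vertical_half is 0 for the first and 1 for the second guest 2x sample row.
uint32_t HostSampleForGuest2x(uint32_t vertical_half,
                              bool msaa_2x_attachments_supported) {
  if (msaa_2x_attachments_supported) {
    return vertical_half ^ 1u;                  // 1, 0 with native 2x.
  }
  return vertical_half | (vertical_half << 1);  // 0, 3 with 2x as 4x.
}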
+ uint32_t source_32bpp_tile_half_pixels = + tile_width_samples_scaled >> (1 + source_pixel_width_dwords_log2); + source_tile_pixel_x = builder.createUnaryOp( + spv::OpBitcast, type_uint, + builder.createBinOp( + spv::OpIAdd, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, + source_tile_pixel_x), + builder.createTriOp( + spv::OpSelect, type_int, + builder.createBinOp( + spv::OpULessThan, builder.makeBoolType(), + source_tile_pixel_x, + builder.makeUintConstant(source_32bpp_tile_half_pixels)), + builder.makeIntConstant(int32_t(source_32bpp_tile_half_pixels)), + builder.makeIntConstant( + -int32_t(source_32bpp_tile_half_pixels))))); + } + + // Transform the destination 32bpp tile index into the source. + spv::Id source_tile_index = builder.createUnaryOp( + spv::OpBitcast, type_uint, + builder.createBinOp( + spv::OpIAdd, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, dest_tile_index), + builder.createTriOp( + spv::OpBitFieldSExtract, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, address_constant), + builder.makeUintConstant(xenos::kEdramPitchTilesBits * 2), + builder.makeUintConstant(xenos::kEdramBaseTilesBits)))); + // Split the source 32bpp tile index into X and Y tile index within the source + // image. + spv::Id source_pitch_tiles = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, address_constant, + builder.makeUintConstant(xenos::kEdramPitchTilesBits), + builder.makeUintConstant(xenos::kEdramPitchTilesBits)); + spv::Id source_tile_index_y = builder.createBinOp( + spv::OpUDiv, type_uint, source_tile_index, source_pitch_tiles); + spv::Id source_tile_index_x = builder.createBinOp( + spv::OpUMod, type_uint, source_tile_index, source_pitch_tiles); + // Finally calculate the source texture coordinates. + spv::Id source_pixel_x_int = builder.createUnaryOp( + spv::OpBitcast, type_int, + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.makeUintConstant(tile_width_samples_scaled >> + source_pixel_width_dwords_log2), + source_tile_index_x), + source_tile_pixel_x)); + spv::Id source_pixel_y_int = builder.createUnaryOp( + spv::OpBitcast, type_int, + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.makeUintConstant( + tile_height_samples_scaled >> + uint32_t(key.source_msaa_samples >= xenos::MsaaSamples::k2X)), + source_tile_index_y), + source_tile_pixel_y)); + + // Load the source. + + spv::Builder::TextureParameters source_texture_parameters = {}; + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(source_pixel_x_int); + id_vector_temp.push_back(source_pixel_y_int); + spv::Id source_coordinates[2] = { + builder.createCompositeConstruct(type_int2, id_vector_temp), + }; + spv::Id source_sample_ids_int[2] = {}; + if (key.source_msaa_samples != xenos::MsaaSamples::k1X) { + source_sample_ids_int[0] = + builder.createUnaryOp(spv::OpBitcast, type_int, source_sample_id); + } else { + source_texture_parameters.lod = builder.makeIntConstant(0); + } + // Go to the next sample or pixel along X if need to load two dwords. 
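As a reading aid, an equivalent host-side computation of the source texel coordinates derived above (a sketch; the field widths correspond to xenos::kEdramPitchTilesBits and kEdramBaseTilesBits, passed as parameters here to keep the snippet self-contained):

#include <cstdint>

struct SourceTexel {
  uint32_t x, y;
};

// address_constant packs the source pitch in tiles and the signed source base
// tile offset relative to the destination base. tile_width_pixels and
// tile_height_pixels are the tile size in source pixels, already adjusted for
// MSAA and 64bpp.
SourceTexel SourceTexelFromDestTile(uint32_t dest_tile_index,
                                    uint32_t address_constant,
                                    uint32_t pitch_tiles_bits,
                                    uint32_t base_tiles_bits,
                                    uint32_t tile_pixel_x,
                                    uint32_t tile_pixel_y,
                                    uint32_t tile_width_pixels,
                                    uint32_t tile_height_pixels) {
  // OpBitFieldSExtract equivalent: extract the field, then sign-extend it.
  uint32_t delta_field = (address_constant >> (2u * pitch_tiles_bits)) &
                         ((1u << base_tiles_bits) - 1u);
  uint32_t delta_sign = 1u << (base_tiles_bits - 1u);
  int32_t base_delta = int32_t(delta_field ^ delta_sign) - int32_t(delta_sign);
  uint32_t source_tile_index = uint32_t(int32_t(dest_tile_index) + base_delta);
  uint32_t source_pitch_tiles =
      (address_constant >> pitch_tiles_bits) & ((1u << pitch_tiles_bits) - 1u);
  SourceTexel texel;
  texel.x = (source_tile_index % source_pitch_tiles) * tile_width_pixels +
            tile_pixel_x;
  texel.y = (source_tile_index / source_pitch_tiles) * tile_height_pixels +
            tile_pixel_y;
  return texel;
}

The color <-> depth column swap just above happens before this step, simply by moving tile_pixel_x into the other half of the source 32bpp tile.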
+ bool source_load_is_two_32bpp_samples = !source_is_64bpp && dest_is_64bpp; + if (source_load_is_two_32bpp_samples) { + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + source_coordinates[1] = source_coordinates[0]; + source_sample_ids_int[1] = builder.createBinOp( + spv::OpBitwiseOr, type_int, source_sample_ids_int[0], + builder.makeIntConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(builder.createBinOp(spv::OpBitwiseOr, type_int, + source_pixel_x_int, + builder.makeIntConstant(1))); + id_vector_temp.push_back(source_pixel_y_int); + source_coordinates[1] = + builder.createCompositeConstruct(type_int2, id_vector_temp); + source_sample_ids_int[1] = source_sample_ids_int[0]; + } + } + spv::Id source_color[2][4] = {}; + if (source_color_texture != spv::NoResult) { + source_texture_parameters.sampler = + builder.createLoad(source_color_texture, spv::NoPrecision); + assert_true(source_color_component_type != spv::NoType); + spv::Id source_color_vec4_type = + builder.makeVectorType(source_color_component_type, 4); + for (uint32_t i = 0; i <= uint32_t(source_load_is_two_32bpp_samples); ++i) { + source_texture_parameters.coords = source_coordinates[i]; + source_texture_parameters.sample = source_sample_ids_int[i]; + spv::Id source_color_vec4 = builder.createTextureCall( + spv::NoPrecision, source_color_vec4_type, false, true, false, false, + false, source_texture_parameters, spv::ImageOperandsMaskNone); + uint32_t source_color_components_remaining = + source_color_texture_component_mask; + uint32_t source_color_component_index; + while (xe::bit_scan_forward(source_color_components_remaining, + &source_color_component_index)) { + source_color_components_remaining &= + ~(uint32_t(1) << source_color_component_index); + source_color[i][source_color_component_index] = + builder.createCompositeExtract(source_color_vec4, + source_color_component_type, + source_color_component_index); + } + } + } + spv::Id source_depth_float[2] = {}; + if (source_depth_texture != spv::NoResult) { + source_texture_parameters.sampler = + builder.createLoad(source_depth_texture, spv::NoPrecision); + for (uint32_t i = 0; i <= uint32_t(source_load_is_two_32bpp_samples); ++i) { + source_texture_parameters.coords = source_coordinates[i]; + source_texture_parameters.sample = source_sample_ids_int[i]; + source_depth_float[i] = builder.createCompositeExtract( + builder.createTextureCall( + spv::NoPrecision, type_float4, false, true, false, false, false, + source_texture_parameters, spv::ImageOperandsMaskNone), + type_float, 0); + } + } + spv::Id source_stencil[2] = {}; + if (source_stencil_texture != spv::NoResult) { + source_texture_parameters.sampler = + builder.createLoad(source_stencil_texture, spv::NoPrecision); + for (uint32_t i = 0; i <= uint32_t(source_load_is_two_32bpp_samples); ++i) { + source_texture_parameters.coords = source_coordinates[i]; + source_texture_parameters.sample = source_sample_ids_int[i]; + source_stencil[i] = builder.createCompositeExtract( + builder.createTextureCall( + spv::NoPrecision, type_uint4, false, true, false, false, false, + source_texture_parameters, spv::ImageOperandsMaskNone), + type_uint, 0); + } + } + + // Pick the needed 32bpp half of the 64bpp color. 
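A brief sketch (illustrative only) of the second-dword addressing above for the two-value loads; the first value's corresponding LSB is zero after the earlier remapping, so OR-ing in 1 addresses the adjacent sample or pixel:

#include <cstdint>

void SecondDwordAddress(uint32_t first_pixel_x, uint32_t first_sample,
                        bool source_is_4x, uint32_t* second_pixel_x,
                        uint32_t* second_sample) {
  // 4x source: the pair is two adjacent samples of one pixel.
  // 1x/2x source: the pair is two horizontally adjacent pixels.
  *second_pixel_x = source_is_4x ? first_pixel_x : (first_pixel_x | 1u);
  *second_sample = source_is_4x ? (first_sample | 1u) : first_sample;
}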
+ if (source_is_64bpp && !dest_is_64bpp) { + uint32_t source_color_half_component_count = + source_color_format_component_count >> 1; + assert_true(source_color_half != spv::NoResult); + spv::Id source_color_is_second_half = + builder.createBinOp(spv::OpINotEqual, type_bool, source_color_half, + builder.makeUintConstant(0)); + if (mode.output == TransferOutput::kStencilBit) { + source_color[0][0] = builder.createTriOp( + spv::OpSelect, source_color_component_type, + source_color_is_second_half, + source_color[0][source_color_half_component_count], + source_color[0][0]); + } else { + for (uint32_t i = 0; i < source_color_half_component_count; ++i) { + source_color[0][i] = builder.createTriOp( + spv::OpSelect, source_color_component_type, + source_color_is_second_half, + source_color[0][source_color_half_component_count + i], + source_color[0][i]); + } + } + } + + if (output_fragment_stencil_ref != spv::NoResult && + source_stencil[0] != spv::NoResult) { + // For the depth -> depth case, write the stencil directly to the output. + assert_true(mode.output == TransferOutput::kDepth); + builder.createStore(source_stencil[0], output_fragment_stencil_ref); + } + + if (dest_is_64bpp) { + // Construct the 64bpp color from two 32-bit samples or one 64-bit sample. + // If `packed` (two uints) are created, use the generic path involving + // unpacking. + // Otherwise, the fragment data output must be written to directly by the + // reached control flow path. + spv::Id packed[2] = {}; + if (source_is_color) { + switch (source_color_format) { + case xenos::ColorRenderTargetFormat::k_8_8_8_8: + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale = builder.makeFloatConstant(255.0f); + spv::Id component_width = builder.makeUintConstant(8); + for (uint32_t i = 0; i < 2; ++i) { + packed[i] = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[i][0], unorm_scale), + unorm_round_offset)); + for (uint32_t j = 1; j < 4; ++j) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed[i]); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[i][j], unorm_scale), + unorm_round_offset))); + id_vector_temp.push_back(builder.makeUintConstant(8 * j)); + id_vector_temp.push_back(component_width); + packed[i] = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale_rgb = builder.makeFloatConstant(1023.0f); + spv::Id width_rgb = builder.makeUintConstant(10); + spv::Id unorm_scale_a = builder.makeFloatConstant(3.0f); + spv::Id width_a = builder.makeUintConstant(2); + for (uint32_t i = 0; i < 2; ++i) { + packed[i] = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[i][0], unorm_scale_rgb), + unorm_round_offset)); + for (uint32_t j = 1; j < 4; ++j) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed[i]); + id_vector_temp.push_back(builder.createUnaryOp( + 
spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp( + spv::OpFMul, type_float, source_color[i][j], + j == 3 ? unorm_scale_a : unorm_scale_rgb), + unorm_round_offset))); + id_vector_temp.push_back(builder.makeUintConstant(10 * j)); + id_vector_temp.push_back(j == 3 ? width_a : width_rgb); + packed[i] = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16: { + spv::Id width_rgb = builder.makeUintConstant(10); + spv::Id float_0 = builder.makeFloatConstant(0.0f); + spv::Id float_1 = builder.makeFloatConstant(1.0f); + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale_a = builder.makeFloatConstant(3.0f); + spv::Id offset_a = builder.makeUintConstant(30); + spv::Id width_a = builder.makeUintConstant(2); + for (uint32_t i = 0; i < 2; ++i) { + // Float16 has a wider range for both color and alpha, also NaNs - + // clamp and convert. + packed[i] = SpirvShaderTranslator::UnclampedFloat32To7e3( + builder, source_color[i][0], ext_inst_glsl_std_450); + for (uint32_t j = 1; j < 3; ++j) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed[i]); + id_vector_temp.push_back( + SpirvShaderTranslator::UnclampedFloat32To7e3( + builder, source_color[i][j], ext_inst_glsl_std_450)); + id_vector_temp.push_back(builder.makeUintConstant(10 * j)); + id_vector_temp.push_back(width_rgb); + packed[i] = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + // Saturate and convert the alpha. + id_vector_temp.clear(); + id_vector_temp.reserve(3); + id_vector_temp.push_back(source_color[i][3]); + id_vector_temp.push_back(float_0); + id_vector_temp.push_back(float_1); + spv::Id alpha_saturated = + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450NClamp, id_vector_temp); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed[i]); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + alpha_saturated, unorm_scale_a), + unorm_round_offset))); + id_vector_temp.push_back(offset_a); + id_vector_temp.push_back(width_a); + packed[i] = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } break; + // All 64bpp formats, and all 16 bits per component formats, are + // represented as integers in ownership transfer for safe handling of + // NaN encodings and -32768 / -32767. + // TODO(Triang3l): Handle the case when that's not true (no multisampled + // sampled images, no 16-bit UNORM, no cross-packing 32bpp aliasing on a + // portability subset device or a 64bpp format where that wouldn't help + // anyway). 
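The UNORM packing emitted above reduces to plain scale-and-shift arithmetic; a hedged C++ equivalent (each converted component fits its bit field, so ORs match the OpBitFieldInsert sequence, and the inputs are assumed already in range, which holds for UNORM render target sources):

#include <cstdint>

uint32_t PackUnorm8888(const float rgba[4]) {
  uint32_t packed = 0;
  for (uint32_t i = 0; i < 4; ++i) {
    packed |= uint32_t(rgba[i] * 255.0f + 0.5f) << (8 * i);
  }
  return packed;
}

uint32_t PackUnorm2_10_10_10(const float rgba[4]) {
  uint32_t packed = 0;
  for (uint32_t i = 0; i < 3; ++i) {
    packed |= uint32_t(rgba[i] * 1023.0f + 0.5f) << (10 * i);
  }
  packed |= uint32_t(rgba[3] * 3.0f + 0.5f) << 30;  // 2-bit alpha.
  return packed;
}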
+ case xenos::ColorRenderTargetFormat::k_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: { + if (dest_color_format == + xenos::ColorRenderTargetFormat::k_32_32_FLOAT) { + spv::Id component_offset_width = builder.makeUintConstant(16); + spv::Id color_16_in_32[2]; + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_color[i][0]); + id_vector_temp.push_back(source_color[i][1]); + id_vector_temp.push_back(component_offset_width); + id_vector_temp.push_back(component_offset_width); + color_16_in_32[i] = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(color_16_in_32[0]); + id_vector_temp.push_back(color_16_in_32[1]); + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[i >> 1][i & 1]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } + } break; + case xenos::ColorRenderTargetFormat::k_16_16_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT: { + if (dest_color_format == + xenos::ColorRenderTargetFormat::k_32_32_FLOAT) { + spv::Id component_offset_width = builder.makeUintConstant(16); + spv::Id color_16_in_32[2]; + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_color[0][i << 1]); + id_vector_temp.push_back(source_color[0][(i << 1) + 1]); + id_vector_temp.push_back(component_offset_width); + id_vector_temp.push_back(component_offset_width); + color_16_in_32[i] = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(color_16_in_32[0]); + id_vector_temp.push_back(color_16_in_32[1]); + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } + } break; + // Float32 is transferred as uint32 to preserve NaN encodings. However, + // multisampled sampled image support is optional in Vulkan. + case xenos::ColorRenderTargetFormat::k_32_FLOAT: { + for (uint32_t i = 0; i < 2; ++i) { + packed[i] = source_color[i][0]; + if (!source_color_is_uint) { + packed[i] = + builder.createUnaryOp(spv::OpBitcast, type_uint, packed[i]); + } + } + } break; + case xenos::ColorRenderTargetFormat::k_32_32_FLOAT: { + for (uint32_t i = 0; i < 2; ++i) { + packed[i] = source_color[0][i]; + if (!source_color_is_uint) { + packed[i] = + builder.createUnaryOp(spv::OpBitcast, type_uint, packed[i]); + } + } + } break; + } + } else { + assert_true(source_depth_texture != spv::NoResult); + assert_true(source_stencil_texture != spv::NoResult); + spv::Id depth_offset = builder.makeUintConstant(8); + spv::Id depth_width = builder.makeUintConstant(24); + for (uint32_t i = 0; i < 2; ++i) { + spv::Id depth24 = spv::NoResult; + switch (source_depth_format) { + case xenos::DepthRenderTargetFormat::kD24S8: { + // Round to the nearest even integer. 
This seems to be the + // correct, adding +0.5 and rounding towards zero results in red + // instead of black in the 4D5307E6 clear shader. + id_vector_temp.clear(); + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, source_depth_float[i], + builder.makeFloatConstant(float(0xFFFFFF)))); + depth24 = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450RoundEven, id_vector_temp)); + } break; + case xenos::DepthRenderTargetFormat::kD24FS8: { + depth24 = SpirvShaderTranslator::PreClampedDepthTo20e4( + builder, source_depth_float[i], true, ext_inst_glsl_std_450); + } break; + } + // Merge depth and stencil. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_stencil[i]); + id_vector_temp.push_back(depth24); + id_vector_temp.push_back(depth_offset); + id_vector_temp.push_back(depth_width); + packed[i] = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } + // Common path unless there was a specialized one - unpack two packed 32-bit + // parts. + if (packed[0] != spv::NoResult) { + assert_true(packed[1] != spv::NoResult); + if (dest_color_format == xenos::ColorRenderTargetFormat::k_32_32_FLOAT) { + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(packed[0]); + id_vector_temp.push_back(packed[1]); + // Multisampled sampled images are optional in Vulkan, and image views + // of different formats can't be created separately for sampled image + // and color attachment usages, so no multisampled integer sampled image + // support implies no multisampled integer framebuffer attachment + // support in Xenia. + if (!dest_color_is_uint) { + for (spv::Id& float32 : id_vector_temp) { + float32 = + builder.createUnaryOp(spv::OpBitcast, type_float, float32); + } + } + builder.createStore(builder.createCompositeConstruct(type_fragment_data, + id_vector_temp), + output_fragment_data); + } else { + spv::Id const_uint_0 = builder.makeUintConstant(0); + spv::Id const_uint_16 = builder.makeUintConstant(16); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, packed[i >> 1], + (i & 1) ? const_uint_16 : const_uint_0, const_uint_16)); + } + // TODO(Triang3l): Handle the case when that's not true (no multisampled + // sampled images, no 16-bit UNORM, no cross-packing 32bpp aliasing on a + // portability subset device or a 64bpp format where that wouldn't help + // anyway). + builder.createStore(builder.createCompositeConstruct(type_fragment_data, + id_vector_temp), + output_fragment_data); + } + } + } else { + // If `packed` is created, use the generic path involving unpacking. + // - For a color destination, the packed 32bpp color. + // - For a depth / stencil destination, stencil in 0:7, depth in 8:31 + // normally, or depth in 0:23 and zeros in 24:31 with packed_only_depth. + // - For a stencil bit, stencil in 0:7. + // Otherwise, the fragment data or fragment depth / stencil output must be + // written to directly by the reached control flow path. 
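For the D24S8 path above, the depth/stencil merge is equivalent to the following sketch (std::nearbyint rounds to nearest even under the default rounding mode, matching GLSLstd450RoundEven; names are illustrative):

#include <cmath>
#include <cstdint>

uint32_t PackD24S8(float depth_0_to_1, uint32_t stencil) {
  // 24-bit depth in bits 8..31, 8-bit stencil in bits 0..7, as in the guest
  // EDRAM layout.
  uint32_t depth24 = uint32_t(std::nearbyint(depth_0_to_1 * float(0xFFFFFF)));
  return (stencil & 0xFFu) | (depth24 << 8);
}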
+ spv::Id packed = spv::NoResult; + bool packed_only_depth = false; + if (source_is_color) { + switch (source_color_format) { + case xenos::ColorRenderTargetFormat::k_8_8_8_8: + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { + if (dest_is_color && + (dest_color_format == xenos::ColorRenderTargetFormat::k_8_8_8_8 || + dest_color_format == + xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA)) { + // Same format - passthrough. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale = builder.makeFloatConstant(255.0f); + uint32_t packed_component_offset = 0; + if (mode.output == TransferOutput::kDepth) { + // When need only depth, not stencil, skip the red component, and + // put the depth from GBA directly in the lower bits. + packed_component_offset = 1; + packed_only_depth = true; + if (output_fragment_stencil_ref != spv::NoResult) { + builder.createStore( + builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[0][0], + unorm_scale), + unorm_round_offset)), + output_fragment_stencil_ref); + } + } + packed = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp( + spv::OpFMul, type_float, + source_color[0][packed_component_offset], unorm_scale), + unorm_round_offset)); + if (mode.output != TransferOutput::kStencilBit) { + spv::Id component_width = builder.makeUintConstant(8); + for (uint32_t i = 1; i < 4 - packed_component_offset; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp( + spv::OpFMul, type_float, + source_color[0][packed_component_offset + i], + unorm_scale), + unorm_round_offset))); + id_vector_temp.push_back(builder.makeUintConstant(8 * i)); + id_vector_temp.push_back(component_width); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { + if (dest_is_color && + (dest_color_format == + xenos::ColorRenderTargetFormat::k_2_10_10_10 || + dest_color_format == xenos::ColorRenderTargetFormat:: + k_2_10_10_10_AS_10_10_10_10)) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale_rgb = builder.makeFloatConstant(1023.0f); + packed = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[0][0], unorm_scale_rgb), + unorm_round_offset)); + if (mode.output != TransferOutput::kStencilBit) { + spv::Id width_rgb = builder.makeUintConstant(10); + spv::Id unorm_scale_a = builder.makeFloatConstant(3.0f); + spv::Id width_a = 
builder.makeUintConstant(2); + for (uint32_t i = 1; i < 4; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp( + spv::OpFMul, type_float, source_color[0][i], + i == 3 ? unorm_scale_a : unorm_scale_rgb), + unorm_round_offset))); + id_vector_temp.push_back(builder.makeUintConstant(10 * i)); + id_vector_temp.push_back(i == 3 ? width_a : width_rgb); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16: { + if (dest_is_color && + (dest_color_format == + xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT || + dest_color_format == xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16)) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + // Float16 has a wider range for both color and alpha, also NaNs - + // clamp and convert. + packed = SpirvShaderTranslator::UnclampedFloat32To7e3( + builder, source_color[0][0], ext_inst_glsl_std_450); + if (mode.output != TransferOutput::kStencilBit) { + spv::Id width_rgb = builder.makeUintConstant(10); + for (uint32_t i = 1; i < 3; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back( + SpirvShaderTranslator::UnclampedFloat32To7e3( + builder, source_color[0][i], ext_inst_glsl_std_450)); + id_vector_temp.push_back(builder.makeUintConstant(10 * i)); + id_vector_temp.push_back(width_rgb); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + // Saturate and convert the alpha. + id_vector_temp.clear(); + id_vector_temp.reserve(3); + id_vector_temp.push_back(source_color[0][3]); + id_vector_temp.push_back(builder.makeFloatConstant(0.0f)); + id_vector_temp.push_back(builder.makeFloatConstant(1.0f)); + spv::Id alpha_saturated = + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450NClamp, id_vector_temp); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + alpha_saturated, + builder.makeFloatConstant(3.0f)), + builder.makeFloatConstant(0.5f)))); + id_vector_temp.push_back(builder.makeUintConstant(30)); + id_vector_temp.push_back(builder.makeUintConstant(2)); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } break; + case xenos::ColorRenderTargetFormat::k_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: + case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT: { + // All 64bpp formats, and all 16 bits per component formats, are + // represented as integers in ownership transfer for safe handling of + // NaN encodings and -32768 / -32767. 
+ // TODO(Triang3l): Handle the case when that's not true (no + // multisampled sampled images, no 16-bit UNORM, no cross-packing + // 32bpp aliasing on a portability subset device or a 64bpp format + // where that wouldn't help anyway). + if (dest_is_color && + (dest_color_format == xenos::ColorRenderTargetFormat::k_16_16 || + dest_color_format == + xenos::ColorRenderTargetFormat::k_16_16_FLOAT)) { + id_vector_temp.clear(); + id_vector_temp.reserve(2); + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + packed = source_color[0][0]; + if (mode.output != TransferOutput::kStencilBit) { + spv::Id component_offset_width = builder.makeUintConstant(16); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(source_color[0][1]); + id_vector_temp.push_back(component_offset_width); + id_vector_temp.push_back(component_offset_width); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } break; + // Float32 is transferred as uint32 to preserve NaN encodings. However, + // multisampled sampled image support is optional in Vulkan. + case xenos::ColorRenderTargetFormat::k_32_FLOAT: + case xenos::ColorRenderTargetFormat::k_32_32_FLOAT: { + packed = source_color[0][0]; + if (!source_color_is_uint) { + packed = builder.createUnaryOp(spv::OpBitcast, type_uint, packed); + } + } break; + } + } else if (source_depth_float[0] != spv::NoResult) { + if (mode.output == TransferOutput::kDepth && + dest_depth_format == source_depth_format) { + builder.createStore(source_depth_float[0], output_fragment_depth); + } else { + switch (source_depth_format) { + case xenos::DepthRenderTargetFormat::kD24S8: { + // Round to the nearest even integer. This seems to be the correct, + // adding +0.5 and rounding towards zero results in red instead of + // black in the 4D5307E6 clear shader. + id_vector_temp.clear(); + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, source_depth_float[0], + builder.makeFloatConstant(float(0xFFFFFF)))); + packed = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450RoundEven, id_vector_temp)); + } break; + case xenos::DepthRenderTargetFormat::kD24FS8: { + packed = SpirvShaderTranslator::PreClampedDepthTo20e4( + builder, source_depth_float[0], true, ext_inst_glsl_std_450); + } break; + } + if (mode.output == TransferOutput::kDepth) { + packed_only_depth = true; + } else { + // Merge depth and stencil. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_stencil[0]); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(builder.makeUintConstant(8)); + id_vector_temp.push_back(builder.makeUintConstant(24)); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } + switch (mode.output) { + case TransferOutput::kColor: { + // Unless a special path was taken, unpack the raw 32bpp value into the + // 32bpp color output. 
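The reverse of the earlier packing is used below when expanding a packed dword into a 32bpp color output; an illustrative plain-C++ equivalent for the 8_8_8_8 case:

#include <cstdint>

void UnpackUnorm8888(uint32_t packed, float rgba[4]) {
  for (uint32_t i = 0; i < 4; ++i) {
    rgba[i] = float((packed >> (8 * i)) & 0xFFu) * (1.0f / 255.0f);
  }
}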
+ if (packed != spv::NoResult) { + switch (dest_color_format) { + case xenos::ColorRenderTargetFormat::k_8_8_8_8: + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { + spv::Id component_width = builder.makeUintConstant(8); + spv::Id unorm_scale = builder.makeFloatConstant(1.0f / 255.0f); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, + builder.createUnaryOp( + spv::OpConvertUToF, type_float, + builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, packed, + builder.makeUintConstant(8 * i), component_width)), + unorm_scale)); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { + spv::Id width_rgb = builder.makeUintConstant(10); + spv::Id unorm_scale_rgb = + builder.makeFloatConstant(1.0f / 1023.0f); + spv::Id width_a = builder.makeUintConstant(2); + spv::Id unorm_scale_a = builder.makeFloatConstant(1.0f / 3.0f); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, + builder.createUnaryOp( + spv::OpConvertUToF, type_float, + builder.createTriOp(spv::OpBitFieldUExtract, type_uint, + packed, + builder.makeUintConstant(10 * i), + i == 3 ? width_a : width_rgb)), + i == 3 ? unorm_scale_a : unorm_scale_rgb)); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16: { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + // Color. + spv::Id width_rgb = builder.makeUintConstant(10); + for (uint32_t i = 0; i < 3; ++i) { + id_vector_temp.push_back(SpirvShaderTranslator::Float7e3To32( + builder, packed, 10 * i, false, ext_inst_glsl_std_450)); + } + // Alpha. + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, + builder.createUnaryOp( + spv::OpConvertUToF, type_float, + builder.createTriOp(spv::OpBitFieldUExtract, type_uint, + packed, builder.makeUintConstant(30), + builder.makeUintConstant(2))), + builder.makeFloatConstant(1.0f / 3.0f))); + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } break; + case xenos::ColorRenderTargetFormat::k_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: { + // All 16 bits per component formats are represented as integers + // in ownership transfer for safe handling of NaN encodings and + // -32768 / -32767. + // TODO(Triang3l): Handle the case when that's not true (no + // multisampled sampled images, no 16-bit UNORM, no cross-packing + // 32bpp aliasing on a portability subset device or a 64bpp format + // where that wouldn't help anyway). + spv::Id component_offset_width = builder.makeUintConstant(16); + id_vector_temp.clear(); + id_vector_temp.reserve(2); + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp.push_back(builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, packed, + i ? 
component_offset_width : builder.makeUintConstant(0), + component_offset_width)); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } break; + case xenos::ColorRenderTargetFormat::k_32_FLOAT: { + // Float32 is transferred as uint32 to preserve NaN encodings. + // However, multisampled sampled images are optional in Vulkan, + // and image views of different formats can't be created + // separately for sampled image and color attachment usages, so no + // multisampled integer sampled image support implies no + // multisampled integer framebuffer attachment support in Xenia. + spv::Id float32 = packed; + if (!dest_color_is_uint) { + float32 = + builder.createUnaryOp(spv::OpBitcast, type_float, float32); + } + builder.createStore(float32, output_fragment_data); + } break; + default: + // A 64bpp format (handled separately) or an invalid one. + assert_unhandled_case(dest_color_format); + } + } + } break; + case TransferOutput::kDepth: { + if (packed) { + spv::Id guest_depth24 = packed; + if (!packed_only_depth) { + // Extract the depth bits. + guest_depth24 = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + guest_depth24, builder.makeUintConstant(8)); + } + // Load the host float32 depth, check if, when converted to the guest + // format, it's the same as the guest source, thus up to date, and if + // it is, write host float32 depth, otherwise do the guest -> host + // conversion. + spv::Id host_depth32 = spv::NoResult; + if (host_depth_source_texture != spv::NoResult) { + // Convert position and sample index from within the destination + // tile to within the host depth source tile, like for the guest + // render target, but for 32bpp -> 32bpp only. + spv::Id host_depth_source_sample_id = dest_sample_id; + spv::Id host_depth_source_tile_pixel_x = dest_tile_pixel_x; + spv::Id host_depth_source_tile_pixel_y = dest_tile_pixel_y; + if (key.host_depth_source_msaa_samples != key.dest_msaa_samples) { + if (key.host_depth_source_msaa_samples >= + xenos::MsaaSamples::k4X) { + // 4x -> 1x/2x. + if (key.dest_msaa_samples == xenos::MsaaSamples::k2X) { + // 4x -> 2x. + // Horizontal pixels to samples. Vertical sample (1/0 in the + // first bit for native 2x or 0/1 in the second bit for 2x as + // 4x) to second sample bit. + if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + host_depth_source_tile_pixel_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + } else { + // 4x -> 1x. + // Pixels to samples. 
+ id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseAnd, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + host_depth_source_tile_pixel_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + host_depth_source_tile_pixel_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + } + } else { + // 1x/2x -> 1x/2x/4x (as long as they're different). + // Only the X part - Y is handled by common code. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // Horizontal samples to pixels. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_tile_pixel_x = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } + // Host depth source Y and sample index for 1x/2x AA sources. + if (key.host_depth_source_msaa_samples < + xenos::MsaaSamples::k4X) { + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 1x/2x -> 4x. + if (key.host_depth_source_msaa_samples == + xenos::MsaaSamples::k2X) { + // 2x -> 4x. + // Vertical samples (second bit) of 4x destination to + // vertical sample (1, 0 for native 2x, or 0, 3 for 2x as + // 4x) of 2x source. + host_depth_source_sample_id = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_sample_id, + builder.makeUintConstant(1)); + if (msaa_2x_attachments_supported_) { + host_depth_source_sample_id = + builder.createBinOp(spv::OpBitwiseXor, type_uint, + host_depth_source_sample_id, + builder.makeUintConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(host_depth_source_sample_id); + id_vector_temp.push_back(host_depth_source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } else { + // 1x -> 4x. + // Vertical samples (second bit) to Y pixels. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_tile_pixel_y = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } else { + // 1x/2x -> different 1x/2x. + if (key.host_depth_source_msaa_samples == + xenos::MsaaSamples::k2X) { + // 2x -> 1x. + // Vertical pixels of 2x destination to vertical samples (1, + // 0 for native 2x, or 0, 3 for 2x as 4x) of 1x source. 
+ host_depth_source_sample_id = builder.createBinOp( + spv::OpBitwiseAnd, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + if (msaa_2x_attachments_supported_) { + host_depth_source_sample_id = + builder.createBinOp(spv::OpBitwiseXor, type_uint, + host_depth_source_sample_id, + builder.makeUintConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(host_depth_source_sample_id); + id_vector_temp.push_back(host_depth_source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + host_depth_source_tile_pixel_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + } else { + // 1x -> 2x. + // Vertical samples (1/0 in the first bit for native 2x or + // 0/1 in the second bit for 2x as 4x) of 2x destination to + // vertical pixels of 1x source. + if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_tile_pixel_y = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_tile_pixel_y = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } + } + } + } + assert_true(push_constants_member_host_depth_address != UINT32_MAX); + id_vector_temp.clear(); + id_vector_temp.push_back(builder.makeIntConstant( + int32_t(push_constants_member_host_depth_address))); + spv::Id host_depth_address_constant = builder.createLoad( + builder.createAccessChain(spv::StorageClassPushConstant, + push_constants, id_vector_temp), + spv::NoPrecision); + // Transform the destination tile index into the host depth source. + spv::Id host_depth_source_tile_index = builder.createUnaryOp( + spv::OpBitcast, type_uint, + builder.createBinOp( + spv::OpIAdd, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, + dest_tile_index), + builder.createTriOp( + spv::OpBitFieldSExtract, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, + host_depth_address_constant), + builder.makeUintConstant(xenos::kEdramPitchTilesBits * + 2), + builder.makeUintConstant(xenos::kEdramBaseTilesBits)))); + // Split the host depth source tile index into X and Y tile index + // within the source image. 
+ spv::Id host_depth_source_pitch_tiles = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, host_depth_address_constant, + builder.makeUintConstant(xenos::kEdramPitchTilesBits), + builder.makeUintConstant(xenos::kEdramPitchTilesBits)); + spv::Id host_depth_source_tile_index_y = builder.createBinOp( + spv::OpUDiv, type_uint, host_depth_source_tile_index, + host_depth_source_pitch_tiles); + spv::Id host_depth_source_tile_index_x = builder.createBinOp( + spv::OpUMod, type_uint, host_depth_source_tile_index, + host_depth_source_pitch_tiles); + // Finally calculate the host depth source texture coordinates. + spv::Id host_depth_source_pixel_x_int = builder.createUnaryOp( + spv::OpBitcast, type_int, + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp(spv::OpIMul, type_uint, + builder.makeUintConstant( + tile_width_samples_scaled >> + uint32_t(key.source_msaa_samples >= + xenos::MsaaSamples::k4X)), + host_depth_source_tile_index_x), + host_depth_source_tile_pixel_x)); + spv::Id host_depth_source_pixel_y_int = builder.createUnaryOp( + spv::OpBitcast, type_int, + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp(spv::OpIMul, type_uint, + builder.makeUintConstant( + tile_height_samples_scaled >> + uint32_t(key.source_msaa_samples >= + xenos::MsaaSamples::k2X)), + host_depth_source_tile_index_y), + host_depth_source_tile_pixel_y)); + // Load the host depth source. + spv::Builder::TextureParameters + host_depth_source_texture_parameters = {}; + host_depth_source_texture_parameters.sampler = + builder.createLoad(host_depth_source_texture, spv::NoPrecision); + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(host_depth_source_pixel_x_int); + id_vector_temp.push_back(host_depth_source_pixel_y_int); + host_depth_source_texture_parameters.coords = + builder.createCompositeConstruct(type_int2, id_vector_temp); + if (key.host_depth_source_msaa_samples != xenos::MsaaSamples::k1X) { + host_depth_source_texture_parameters.sample = + builder.createUnaryOp(spv::OpBitcast, type_int, + host_depth_source_sample_id); + } else { + host_depth_source_texture_parameters.lod = + builder.makeIntConstant(0); + } + host_depth32 = builder.createCompositeExtract( + builder.createTextureCall(spv::NoPrecision, type_float4, false, + true, false, false, false, + host_depth_source_texture_parameters, + spv::ImageOperandsMaskNone), + type_float, 0); + } else if (host_depth_source_buffer != spv::NoResult) { + // Get the address in the EDRAM scratch buffer and load from there. + // The beginning of the buffer is (0, 0) of the destination. + // 40-sample columns are not swapped for addressing simplicity + // (because this is used for depth -> depth transfers, where + // swapping isn't needed). + // Convert samples to pixels. + assert_true(key.host_depth_source_msaa_samples == + xenos::MsaaSamples::k1X); + spv::Id dest_tile_sample_x = dest_tile_pixel_x; + spv::Id dest_tile_sample_y = dest_tile_pixel_y; + if (key.dest_msaa_samples >= xenos::MsaaSamples::k2X) { + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // Horizontal sample index in bit 0. 
+ id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + dest_tile_sample_x = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + // Vertical sample index as 1 or 0 in bit 0 for true 2x or as 0 + // or 1 in bit 1 for 4x or for 2x emulated as 4x. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + (key.dest_msaa_samples == xenos::MsaaSamples::k2X && + msaa_2x_attachments_supported_) + ? spv::OpBitwiseXor + : spv::OpShiftRightLogical, + type_uint, dest_sample_id, builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + dest_tile_sample_y = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + // Combine the tile sample index and the tile index. + spv::Id host_depth_offset = builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.makeUintConstant(tile_width_samples_scaled * + tile_height_samples_scaled), + dest_tile_index), + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.makeUintConstant(tile_width_samples_scaled), + dest_tile_sample_y), + dest_tile_sample_x)); + id_vector_temp.clear(); + id_vector_temp.reserve(2); + // The only SSBO structure member. + id_vector_temp.push_back(builder.makeIntConstant(0)); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpBitcast, type_int, host_depth_offset)); + // StorageBuffer since SPIR-V 1.3, but since SPIR-V 1.0 is + // generated, it's Uniform. + host_depth32 = builder.createUnaryOp( + spv::OpBitcast, type_float, + builder.createLoad( + builder.createAccessChain(spv::StorageClassUniform, + host_depth_source_buffer, + id_vector_temp), + spv::NoPrecision)); + } + spv::Block* depth24_to_depth32_header = builder.getBuildPoint(); + spv::Id depth24_to_depth32_convert_id = spv::NoResult; + spv::Block* depth24_to_depth32_merge = nullptr; + spv::Id host_depth24 = spv::NoResult; + if (host_depth32 != spv::NoResult) { + // Convert the host depth value to the guest format and check if it + // matches the value in the currently owning guest render target. + switch (dest_depth_format) { + case xenos::DepthRenderTargetFormat::kD24S8: { + // Round to the nearest even integer. This seems to be the + // correct, adding +0.5 and rounding towards zero results in red + // instead of black in the 4D5307E6 clear shader. + id_vector_temp.clear(); + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, host_depth32, + builder.makeFloatConstant(float(0xFFFFFF)))); + host_depth24 = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450RoundEven, + id_vector_temp)); + } break; + case xenos::DepthRenderTargetFormat::kD24FS8: { + host_depth24 = SpirvShaderTranslator::PreClampedDepthTo20e4( + builder, host_depth32, true, ext_inst_glsl_std_450); + } break; + } + assert_true(host_depth24 != spv::NoResult); + // Update the header block pointer after the conversion (to avoid + // assuming that the conversion doesn't branch). 
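The EDRAM scratch buffer addressing above follows a simple linear layout, one 32-bit value per sample with whole tiles stored contiguously and no 40-sample column swap; as a sketch (illustrative names):

#include <cstdint>

uint32_t HostDepthScratchOffset(uint32_t dest_tile_index,
                                uint32_t tile_sample_x,
                                uint32_t tile_sample_y,
                                uint32_t tile_width_samples,
                                uint32_t tile_height_samples) {
  return dest_tile_index * (tile_width_samples * tile_height_samples) +
         tile_sample_y * tile_width_samples + tile_sample_x;
}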
+ depth24_to_depth32_header = builder.getBuildPoint(); + spv::Id host_depth_outdated = builder.createBinOp( + spv::OpINotEqual, type_bool, guest_depth24, host_depth24); + spv::Block& depth24_to_depth32_convert_entry = + builder.makeNewBlock(); + { + spv::Block& depth24_to_depth32_merge_block = + builder.makeNewBlock(); + depth24_to_depth32_merge = &depth24_to_depth32_merge_block; + } + { + std::unique_ptr depth24_to_depth32_merge_op = + std::make_unique(spv::OpSelectionMerge); + depth24_to_depth32_merge_op->addIdOperand( + depth24_to_depth32_merge->getId()); + depth24_to_depth32_merge_op->addImmediateOperand( + spv::SelectionControlMaskNone); + builder.getBuildPoint()->addInstruction( + std::move(depth24_to_depth32_merge_op)); + } + builder.createConditionalBranch(host_depth_outdated, + &depth24_to_depth32_convert_entry, + depth24_to_depth32_merge); + builder.setBuildPoint(&depth24_to_depth32_convert_entry); + } + // Convert the guest 24-bit depth to float32 (in an open conditional + // if the host depth is also loaded). + spv::Id guest_depth32 = spv::NoResult; + switch (dest_depth_format) { + case xenos::DepthRenderTargetFormat::kD24S8: { + // Multiplying by 1.0 / 0xFFFFFF produces an incorrect result (for + // 0xC00000, for instance - which is 2_10_10_10 clear to 0001) - + // rescale from 0...0xFFFFFF to 0...0x1000000 doing what true + // float division followed by multiplication does (on x86-64 MSVC + // with default SSE rounding) - values starting from 0x800000 + // become bigger by 1; then accurately bias the result's exponent. + guest_depth32 = builder.createBinOp( + spv::OpFMul, type_float, + builder.createUnaryOp( + spv::OpConvertUToF, type_float, + builder.createBinOp( + spv::OpIAdd, type_uint, guest_depth24, + builder.createBinOp(spv::OpShiftRightLogical, + type_uint, guest_depth24, + builder.makeUintConstant(23)))), + builder.makeFloatConstant(1.0f / float(1 << 24))); + } break; + case xenos::DepthRenderTargetFormat::kD24FS8: { + guest_depth32 = SpirvShaderTranslator::Depth20e4To32( + builder, guest_depth24, 0, true, false, + ext_inst_glsl_std_450); + } break; + } + assert_true(guest_depth32 != spv::NoResult); + spv::Id fragment_depth32 = guest_depth32; + if (host_depth32 != spv::NoResult) { + assert_not_null(depth24_to_depth32_merge); + spv::Id depth24_to_depth32_result_block_id = + builder.getBuildPoint()->getId(); + builder.createBranch(depth24_to_depth32_merge); + builder.setBuildPoint(depth24_to_depth32_merge); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(guest_depth32); + id_vector_temp.push_back(depth24_to_depth32_result_block_id); + id_vector_temp.push_back(host_depth32); + id_vector_temp.push_back(depth24_to_depth32_header->getId()); + fragment_depth32 = + builder.createOp(spv::OpPhi, type_float, id_vector_temp); + } + builder.createStore(fragment_depth32, output_fragment_depth); + } + } break; + case TransferOutput::kStencilBit: { + if (packed) { + // Kill the sample if the needed stencil bit is not set. 
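+ // Without shader stencil export, stencil is transferred one bit per
+ // draw: the dynamic stencil write mask selects the bit, the stencil op
+ // is REPLACE with a reference of 0xFF, and discarding the fragment here
+ // leaves the destination bit at its pre-cleared 0 when the source
+ // doesn't have that bit set.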
+ assert_true(push_constants_member_stencil_mask != UINT32_MAX); + id_vector_temp.clear(); + id_vector_temp.push_back(builder.makeIntConstant( + int32_t(push_constants_member_stencil_mask))); + spv::Id stencil_mask_constant = builder.createLoad( + builder.createAccessChain(spv::StorageClassPushConstant, + push_constants, id_vector_temp), + spv::NoPrecision); + spv::Id stencil_sample_passed = builder.createBinOp( + spv::OpINotEqual, type_bool, + builder.createBinOp(spv::OpBitwiseAnd, type_uint, packed, + stencil_mask_constant), + builder.makeUintConstant(0)); + spv::Block& stencil_bit_kill_block = builder.makeNewBlock(); + spv::Block& stencil_bit_merge_block = builder.makeNewBlock(); + { + std::unique_ptr stencil_bit_merge_op = + std::make_unique(spv::OpSelectionMerge); + stencil_bit_merge_op->addIdOperand(stencil_bit_merge_block.getId()); + stencil_bit_merge_op->addImmediateOperand( + spv::SelectionControlMaskNone); + builder.getBuildPoint()->addInstruction( + std::move(stencil_bit_merge_op)); + } + builder.createConditionalBranch(stencil_sample_passed, + &stencil_bit_merge_block, + &stencil_bit_kill_block); + builder.setBuildPoint(&stencil_bit_kill_block); + builder.createNoResultOp(spv::OpKill); + builder.setBuildPoint(&stencil_bit_merge_block); + } + } break; + } + } + + // End the main function and make it the entry point. + builder.leaveFunction(); + builder.addExecutionMode(main_function, spv::ExecutionModeOriginUpperLeft); + if (output_fragment_depth != spv::NoResult) { + builder.addExecutionMode(main_function, spv::ExecutionModeDepthReplacing); + } + if (output_fragment_stencil_ref != spv::NoResult) { + builder.addExecutionMode(main_function, + spv::ExecutionModeStencilRefReplacingEXT); + } + spv::Instruction* entry_point = + builder.addEntryPoint(spv::ExecutionModelFragment, main_function, "main"); + for (spv::Id interface_id : main_interface) { + entry_point->addIdOperand(interface_id); + } + + // Serialize the shader code. + std::vector shader_code; + builder.dump(shader_code); + + // Create the shader module, and store the handle even if creation fails not + // to try to create it again later. + VkShaderModule shader_module = ui::vulkan::util::CreateShaderModule( + provider, reinterpret_cast(shader_code.data()), + sizeof(uint32_t) * shader_code.size()); + if (shader_module == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target ownership " + "transfer shader 0x{:08X}", + key.key); + } + transfer_shaders_.emplace(key, shader_module); + return shader_module; +} + +VkPipeline const* VulkanRenderTargetCache::GetTransferPipelines( + TransferPipelineKey key) { + auto pipeline_it = transfer_pipelines_.find(key); + if (pipeline_it != transfer_pipelines_.end()) { + return pipeline_it->second[0] != VK_NULL_HANDLE ? 
pipeline_it->second.data() + : nullptr; + } + + VkRenderPass render_pass = GetRenderPass(key.render_pass_key); + VkShaderModule fragment_shader_module = GetTransferShader(key.shader_key); + if (render_pass == VK_NULL_HANDLE || + fragment_shader_module == VK_NULL_HANDLE) { + transfer_pipelines_.emplace(key, std::array{}); + return nullptr; + } + + const TransferModeInfo& mode = kTransferModes[size_t(key.shader_key.mode)]; + + const ui::vulkan::VulkanProvider& provider = + command_processor_.GetVulkanProvider(); + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + const VkPhysicalDeviceFeatures& device_features = provider.device_features(); + + uint32_t dest_sample_count = uint32_t(1) + << uint32_t(key.shader_key.dest_msaa_samples); + bool dest_is_masked_sample = + dest_sample_count > 1 && !device_features.sampleRateShading; + + VkPipelineShaderStageCreateInfo shader_stages[2]; + shader_stages[0].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shader_stages[0].pNext = nullptr; + shader_stages[0].flags = 0; + shader_stages[0].stage = VK_SHADER_STAGE_VERTEX_BIT; + shader_stages[0].module = transfer_passthrough_vertex_shader_; + shader_stages[0].pName = "main"; + shader_stages[0].pSpecializationInfo = nullptr; + shader_stages[1].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shader_stages[1].pNext = nullptr; + shader_stages[1].flags = 0; + shader_stages[1].stage = VK_SHADER_STAGE_FRAGMENT_BIT; + shader_stages[1].module = fragment_shader_module; + shader_stages[1].pName = "main"; + shader_stages[1].pSpecializationInfo = nullptr; + VkSpecializationMapEntry sample_id_specialization_map_entry; + uint32_t sample_id_specialization_constant; + VkSpecializationInfo sample_id_specialization_info; + if (dest_is_masked_sample) { + sample_id_specialization_map_entry.constantID = 0; + sample_id_specialization_map_entry.offset = 0; + sample_id_specialization_map_entry.size = sizeof(uint32_t); + sample_id_specialization_constant = 0; + sample_id_specialization_info.mapEntryCount = 1; + sample_id_specialization_info.pMapEntries = + &sample_id_specialization_map_entry; + sample_id_specialization_info.dataSize = + sizeof(sample_id_specialization_constant); + sample_id_specialization_info.pData = &sample_id_specialization_constant; + shader_stages[1].pSpecializationInfo = &sample_id_specialization_info; + } + + VkVertexInputBindingDescription vertex_input_binding; + vertex_input_binding.binding = 0; + vertex_input_binding.stride = sizeof(float) * 2; + vertex_input_binding.inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + VkVertexInputAttributeDescription vertex_input_attribute; + vertex_input_attribute.location = 0; + vertex_input_attribute.binding = 0; + vertex_input_attribute.format = VK_FORMAT_R32G32_SFLOAT; + vertex_input_attribute.offset = 0; + VkPipelineVertexInputStateCreateInfo vertex_input_state; + vertex_input_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + vertex_input_state.pNext = nullptr; + vertex_input_state.flags = 0; + vertex_input_state.vertexBindingDescriptionCount = 1; + vertex_input_state.pVertexBindingDescriptions = &vertex_input_binding; + vertex_input_state.vertexAttributeDescriptionCount = 1; + vertex_input_state.pVertexAttributeDescriptions = &vertex_input_attribute; + + VkPipelineInputAssemblyStateCreateInfo input_assembly_state; + input_assembly_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + input_assembly_state.pNext = nullptr; + 
input_assembly_state.flags = 0; + input_assembly_state.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + input_assembly_state.primitiveRestartEnable = VK_FALSE; + + // Dynamic, to stay within maxViewportDimensions while preferring a + // power-of-two factor for converting from pixel coordinates to NDC for exact + // precision. + VkPipelineViewportStateCreateInfo viewport_state; + viewport_state.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewport_state.pNext = nullptr; + viewport_state.flags = 0; + viewport_state.viewportCount = 1; + viewport_state.pViewports = nullptr; + viewport_state.scissorCount = 1; + viewport_state.pScissors = nullptr; + + VkPipelineRasterizationStateCreateInfo rasterization_state = {}; + rasterization_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rasterization_state.polygonMode = VK_POLYGON_MODE_FILL; + rasterization_state.cullMode = VK_CULL_MODE_NONE; + rasterization_state.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE; + rasterization_state.lineWidth = 1.0f; + + // For samples other than the first, will be changed for the pipelines for + // other samples. + VkSampleMask sample_mask = UINT32_MAX; + VkPipelineMultisampleStateCreateInfo multisample_state = {}; + multisample_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + multisample_state.rasterizationSamples = + (dest_sample_count == 2 && !msaa_2x_attachments_supported_) + ? VK_SAMPLE_COUNT_4_BIT + : VkSampleCountFlagBits(dest_sample_count); + if (dest_sample_count > 1) { + if (device_features.sampleRateShading) { + multisample_state.sampleShadingEnable = VK_TRUE; + multisample_state.minSampleShading = 1.0f; + if (dest_sample_count == 2 && !msaa_2x_attachments_supported_) { + // Emulating 2x MSAA as samples 0 and 3 of 4x MSAA when 2x is not + // supported. + sample_mask = 0b1001; + } + } else { + sample_mask = 0b1; + } + if (sample_mask != UINT32_MAX) { + multisample_state.pSampleMask = &sample_mask; + } + } + + // Whether the depth / stencil state is used depends on the presence of a + // depth attachment in the render pass - but not making assumptions about + // whether the render pass contains any specific attachments, so setting up + // valid depth / stencil state unconditionally. + VkPipelineDepthStencilStateCreateInfo depth_stencil_state = {}; + depth_stencil_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + if (mode.output == TransferOutput::kDepth) { + depth_stencil_state.depthTestEnable = VK_TRUE; + depth_stencil_state.depthWriteEnable = VK_TRUE; + depth_stencil_state.depthCompareOp = cvars::depth_transfer_not_equal_test + ? VK_COMPARE_OP_NOT_EQUAL + : VK_COMPARE_OP_ALWAYS; + } + if ((mode.output == TransferOutput::kDepth && + provider.device_extensions().ext_shader_stencil_export) || + mode.output == TransferOutput::kStencilBit) { + depth_stencil_state.stencilTestEnable = VK_TRUE; + depth_stencil_state.front.failOp = VK_STENCIL_OP_KEEP; + depth_stencil_state.front.passOp = VK_STENCIL_OP_REPLACE; + depth_stencil_state.front.depthFailOp = VK_STENCIL_OP_REPLACE; + // Using ALWAYS, not NOT_EQUAL, so depth writing is unaffected by stencil + // being different. + depth_stencil_state.front.compareOp = VK_COMPARE_OP_ALWAYS; + // Will be dynamic for stencil bit output. 
+ depth_stencil_state.front.writeMask = UINT8_MAX; + depth_stencil_state.front.reference = UINT8_MAX; + depth_stencil_state.back = depth_stencil_state.front; + } + + // Whether the color blend state is used depends on the presence of color + // attachments in the render pass - but not making assumptions about whether + // the render pass contains any specific attachments, so setting up valid + // color blend state unconditionally. + VkPipelineColorBlendAttachmentState + color_blend_attachments[xenos::kMaxColorRenderTargets] = {}; + VkPipelineColorBlendStateCreateInfo color_blend_state = {}; + color_blend_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + color_blend_state.attachmentCount = + 32 - xe::lzcnt(key.render_pass_key.depth_and_color_used >> 1); + color_blend_state.pAttachments = color_blend_attachments; + if (mode.output == TransferOutput::kColor) { + if (device_features.independentBlend) { + // State the intention more explicitly. + color_blend_attachments[key.shader_key.dest_color_rt_index] + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + } else { + // The blend state for all attachments must be identical, but other render + // targets are not written to by the shader. + for (uint32_t i = 0; i < color_blend_state.attachmentCount; ++i) { + color_blend_attachments[i].colorWriteMask = + VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + } + } + } + + std::array dynamic_states; + VkPipelineDynamicStateCreateInfo dynamic_state; + dynamic_state.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; + dynamic_state.pNext = nullptr; + dynamic_state.flags = 0; + dynamic_state.dynamicStateCount = 0; + dynamic_state.pDynamicStates = dynamic_states.data(); + dynamic_states[dynamic_state.dynamicStateCount++] = VK_DYNAMIC_STATE_VIEWPORT; + dynamic_states[dynamic_state.dynamicStateCount++] = VK_DYNAMIC_STATE_SCISSOR; + if (mode.output == TransferOutput::kStencilBit) { + dynamic_states[dynamic_state.dynamicStateCount++] = + VK_DYNAMIC_STATE_STENCIL_WRITE_MASK; + } + + std::array pipelines{}; + VkGraphicsPipelineCreateInfo pipeline_create_info; + pipeline_create_info.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pipeline_create_info.pNext = nullptr; + pipeline_create_info.flags = 0; + if (dest_is_masked_sample) { + pipeline_create_info.flags |= VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT; + } + pipeline_create_info.stageCount = uint32_t(xe::countof(shader_stages)); + pipeline_create_info.pStages = shader_stages; + pipeline_create_info.pVertexInputState = &vertex_input_state; + pipeline_create_info.pInputAssemblyState = &input_assembly_state; + pipeline_create_info.pTessellationState = nullptr; + pipeline_create_info.pViewportState = &viewport_state; + pipeline_create_info.pRasterizationState = &rasterization_state; + pipeline_create_info.pMultisampleState = &multisample_state; + pipeline_create_info.pDepthStencilState = &depth_stencil_state; + pipeline_create_info.pColorBlendState = &color_blend_state; + pipeline_create_info.pDynamicState = &dynamic_state; + pipeline_create_info.layout = + transfer_pipeline_layouts_[size_t(mode.pipeline_layout)]; + pipeline_create_info.renderPass = render_pass; + pipeline_create_info.subpass = 0; + pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE; + pipeline_create_info.basePipelineIndex = -1; + if (dfn.vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, + 
&pipeline_create_info, nullptr, + &pipelines[0]) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target ownership " + "transfer pipeline for render pass 0x{:08X}, shader 0x{:08X}", + key.render_pass_key.key, key.shader_key.key); + transfer_pipelines_.emplace(key, std::array{}); + return nullptr; + } + if (dest_is_masked_sample) { + assert_true(multisample_state.pSampleMask == &sample_mask); + pipeline_create_info.flags = (pipeline_create_info.flags & + ~VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT) | + VK_PIPELINE_CREATE_DERIVATIVE_BIT; + pipeline_create_info.basePipelineHandle = pipelines[0]; + for (uint32_t i = 1; i < dest_sample_count; ++i) { + // Emulating 2x MSAA as samples 0 and 3 of 4x MSAA when 2x is not + // supported. + uint32_t host_sample_index = + (dest_sample_count == 2 && !msaa_2x_attachments_supported_ && i == 1) + ? 3 + : i; + sample_id_specialization_constant = host_sample_index; + sample_mask = uint32_t(1) << host_sample_index; + if (dfn.vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, + &pipeline_create_info, nullptr, + &pipelines[i]) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target " + "ownership transfer pipeline for render pass 0x{:08X}, shader " + "0x{:08X}, sample {}", + key.render_pass_key.key, key.shader_key.key, i); + for (uint32_t j = 0; j < i; ++j) { + dfn.vkDestroyPipeline(device, pipelines[j], nullptr); + } + transfer_pipelines_.emplace(key, std::array{}); + return nullptr; + } + } + } + return transfer_pipelines_.emplace(key, pipelines).first->second.data(); +} + +void VulkanRenderTargetCache::PerformTransfersAndResolveClears( + uint32_t render_target_count, RenderTarget* const* render_targets, + const std::vector* render_target_transfers, + const uint64_t* render_target_resolve_clear_values, + const Transfer::Rectangle* resolve_clear_rectangle) { + assert_true(GetPath() == Path::kHostRenderTargets); + + const ui::vulkan::VulkanProvider& provider = + command_processor_.GetVulkanProvider(); + const VkPhysicalDeviceLimits& device_limits = + provider.device_properties().limits; + const VkPhysicalDeviceFeatures& device_features = provider.device_features(); + bool shader_stencil_export = + provider.device_extensions().ext_shader_stencil_export; + uint64_t current_submission = command_processor_.GetCurrentSubmission(); + DeferredCommandBuffer& command_buffer = + command_processor_.deferred_command_buffer(); + + bool resolve_clear_needed = + render_target_resolve_clear_values && resolve_clear_rectangle; + VkClearRect resolve_clear_rect; + if (resolve_clear_needed) { + // Assuming the rectangle is already clamped by the setup function from the + // common render target cache. + resolve_clear_rect.rect.offset.x = + int32_t(resolve_clear_rectangle->x_pixels * resolution_scale_x_); + resolve_clear_rect.rect.offset.y = + int32_t(resolve_clear_rectangle->y_pixels * resolution_scale_y_); + resolve_clear_rect.rect.extent.width = + resolve_clear_rectangle->width_pixels * resolution_scale_x_; + resolve_clear_rect.rect.extent.height = + resolve_clear_rectangle->height_pixels * resolution_scale_y_; + resolve_clear_rect.baseArrayLayer = 0; + resolve_clear_rect.layerCount = 1; + } + + // Do host depth storing for the depth destination (assuming there can be only + // one depth destination) where depth destination == host depth source. 
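+ // In this case the destination render target is also the most recent
+ // owner of the host-precision depth, so before its contents are
+ // overwritten by the transfer, its depth is copied by a compute shader
+ // into the scratch area of the EDRAM buffer, from which the transfer
+ // fragment shader can later read it to compare against the guest depth
+ // being transferred.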
+ bool host_depth_store_set_up = false; + for (uint32_t i = 0; i < render_target_count; ++i) { + RenderTarget* dest_rt = render_targets[i]; + if (!dest_rt) { + continue; + } + auto& dest_vulkan_rt = *static_cast(dest_rt); + RenderTargetKey dest_rt_key = dest_vulkan_rt.key(); + if (!dest_rt_key.is_depth) { + continue; + } + const std::vector& depth_transfers = render_target_transfers[i]; + for (const Transfer& transfer : depth_transfers) { + if (transfer.host_depth_source != dest_rt) { + continue; + } + if (!host_depth_store_set_up) { + // Pipeline. + command_processor_.BindExternalComputePipeline( + host_depth_store_pipelines_[size_t(dest_rt_key.msaa_samples)]); + // Descriptor set bindings. + VkDescriptorSet host_depth_store_descriptor_sets[] = { + edram_storage_buffer_descriptor_set_, + dest_vulkan_rt.GetDescriptorSetTransferSource(), + }; + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_COMPUTE, host_depth_store_pipeline_layout_, + 0, uint32_t(xe::countof(host_depth_store_descriptor_sets)), + host_depth_store_descriptor_sets, 0, nullptr); + // Render target constant. + HostDepthStoreRenderTargetConstant + host_depth_store_render_target_constant = + GetHostDepthStoreRenderTargetConstant( + dest_rt_key.pitch_tiles_at_32bpp, + msaa_2x_attachments_supported_); + command_buffer.CmdVkPushConstants( + host_depth_store_pipeline_layout_, VK_SHADER_STAGE_COMPUTE_BIT, + uint32_t(offsetof(HostDepthStoreConstants, render_target)), + sizeof(host_depth_store_render_target_constant), + &host_depth_store_render_target_constant); + // Barriers - don't need to try to combine them with the rest of + // render target transfer barriers now - if this happens, after host + // depth storing, SHADER_READ -> DEPTH_STENCIL_ATTACHMENT_WRITE will be + // done anyway even in the best case, so it's not possible to have all + // the barriers in one place here. + UseEdramBuffer(EdramBufferUsage::kComputeWrite); + // Always transitioning both depth and stencil, not storing separate + // usage flags for depth and stencil. 
+ command_processor_.PushImageMemoryBarrier( + dest_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT), + dest_vulkan_rt.current_stage_mask(), + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + dest_vulkan_rt.current_access_mask(), VK_ACCESS_SHADER_READ_BIT, + dest_vulkan_rt.current_layout(), + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + dest_vulkan_rt.SetUsage(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + host_depth_store_set_up = true; + } + Transfer::Rectangle + transfer_rectangles[Transfer::kMaxRectanglesWithCutout]; + uint32_t transfer_rectangle_count = transfer.GetRectangles( + dest_rt_key.base_tiles, dest_rt_key.pitch_tiles_at_32bpp, + dest_rt_key.msaa_samples, false, transfer_rectangles, + resolve_clear_rectangle); + assert_not_zero(transfer_rectangle_count); + HostDepthStoreRectangleConstant host_depth_store_rectangle_constant; + for (uint32_t j = 0; j < transfer_rectangle_count; ++j) { + uint32_t group_count_x, group_count_y; + GetHostDepthStoreRectangleInfo( + transfer_rectangles[j], dest_rt_key.msaa_samples, + host_depth_store_rectangle_constant, group_count_x, group_count_y); + command_buffer.CmdVkPushConstants( + host_depth_store_pipeline_layout_, VK_SHADER_STAGE_COMPUTE_BIT, + uint32_t(offsetof(HostDepthStoreConstants, rectangle)), + sizeof(host_depth_store_rectangle_constant), + &host_depth_store_rectangle_constant); + command_processor_.SubmitBarriers(true); + command_buffer.CmdVkDispatch(group_count_x, group_count_y, 1); + MarkEdramBufferModified(); + } + } + break; + } + + constexpr VkPipelineStageFlags kSourceStageMask = + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + constexpr VkAccessFlags kSourceAccessMask = VK_ACCESS_SHADER_READ_BIT; + constexpr VkImageLayout kSourceLayout = + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + + // Try to insert as many barriers as possible in one place, hoping that in the + // best case (no cross-copying between current render targets), barriers will + // need to be only inserted here, not between transfers. In case of + // cross-copying, if the destination use is going to happen before the source + // use, choose the destination state, otherwise the source state - to match + // the order in which transfers will actually happen (otherwise there will be + // just a useless switch back and forth). + for (uint32_t i = 0; i < render_target_count; ++i) { + RenderTarget* dest_rt = render_targets[i]; + if (!dest_rt) { + continue; + } + const std::vector& dest_transfers = render_target_transfers[i]; + if (!resolve_clear_needed && dest_transfers.empty()) { + continue; + } + // Transition the destination, only if not going to be used as a source + // earlier. + bool dest_used_previously_as_source = false; + for (uint32_t j = 0; j < i; ++j) { + for (const Transfer& previous_transfer : render_target_transfers[j]) { + if (previous_transfer.source == dest_rt || + previous_transfer.host_depth_source == dest_rt) { + dest_used_previously_as_source = true; + break; + } + } + } + if (!dest_used_previously_as_source) { + auto& dest_vulkan_rt = *static_cast(dest_rt); + VkPipelineStageFlags dest_dst_stage_mask; + VkAccessFlags dest_dst_access_mask; + VkImageLayout dest_new_layout; + dest_vulkan_rt.GetDrawUsage(&dest_dst_stage_mask, &dest_dst_access_mask, + &dest_new_layout); + command_processor_.PushImageMemoryBarrier( + dest_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + dest_vulkan_rt.key().is_depth + ? 
(VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) + : VK_IMAGE_ASPECT_COLOR_BIT), + dest_vulkan_rt.current_stage_mask(), dest_dst_stage_mask, + dest_vulkan_rt.current_access_mask(), dest_dst_access_mask, + dest_vulkan_rt.current_layout(), dest_new_layout); + dest_vulkan_rt.SetUsage(dest_dst_stage_mask, dest_dst_access_mask, + dest_new_layout); + } + // Transition the sources, only if not going to be used as destinations + // earlier. + for (const Transfer& transfer : dest_transfers) { + bool source_previously_used_as_dest = false; + bool host_depth_source_previously_used_as_dest = false; + for (uint32_t j = 0; j < i; ++j) { + if (render_target_transfers[j].empty()) { + continue; + } + const RenderTarget* previous_rt = render_targets[j]; + if (transfer.source == previous_rt) { + source_previously_used_as_dest = true; + } + if (transfer.host_depth_source == previous_rt) { + host_depth_source_previously_used_as_dest = true; + } + } + if (!source_previously_used_as_dest) { + auto& source_vulkan_rt = + *static_cast(transfer.source); + command_processor_.PushImageMemoryBarrier( + source_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + source_vulkan_rt.key().is_depth + ? (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) + : VK_IMAGE_ASPECT_COLOR_BIT), + source_vulkan_rt.current_stage_mask(), kSourceStageMask, + source_vulkan_rt.current_access_mask(), kSourceAccessMask, + source_vulkan_rt.current_layout(), kSourceLayout); + source_vulkan_rt.SetUsage(kSourceStageMask, kSourceAccessMask, + kSourceLayout); + } + // transfer.host_depth_source == dest_rt means the EDRAM buffer will be + // used instead, no need to transition. + if (transfer.host_depth_source && transfer.host_depth_source != dest_rt && + !host_depth_source_previously_used_as_dest) { + auto& host_depth_source_vulkan_rt = + *static_cast(transfer.host_depth_source); + command_processor_.PushImageMemoryBarrier( + host_depth_source_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT), + host_depth_source_vulkan_rt.current_stage_mask(), kSourceStageMask, + host_depth_source_vulkan_rt.current_access_mask(), + kSourceAccessMask, host_depth_source_vulkan_rt.current_layout(), + kSourceLayout); + host_depth_source_vulkan_rt.SetUsage(kSourceStageMask, + kSourceAccessMask, kSourceLayout); + } + } + } + if (host_depth_store_set_up) { + // Will be reading copied host depth from the EDRAM buffer. + UseEdramBuffer(EdramBufferUsage::kFragmentRead); + } + + // Perform the transfers and clears. 
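+ // For each destination, this roughly means: enter a transfer render
+ // pass targeting only that render target, draw the gathered rectangles
+ // with a fragment shader that reads the source as a sampled image and
+ // reinterprets the data, and, if requested, clear the resolve rectangle
+ // with vkCmdClearAttachments at the end.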
+ + TransferPipelineLayoutIndex last_transfer_pipeline_layout_index = + TransferPipelineLayoutIndex::kCount; + uint32_t transfer_descriptor_sets_bound = 0; + uint32_t transfer_push_constants_set = 0; + VkDescriptorSet last_descriptor_set_host_depth_stencil_textures = + VK_NULL_HANDLE; + VkDescriptorSet last_descriptor_set_depth_stencil_textures = VK_NULL_HANDLE; + VkDescriptorSet last_descriptor_set_color_texture = VK_NULL_HANDLE; + TransferAddressConstant last_host_depth_address_constant; + TransferAddressConstant last_address_constant; + + for (uint32_t i = 0; i < render_target_count; ++i) { + RenderTarget* dest_rt = render_targets[i]; + if (!dest_rt) { + continue; + } + + const std::vector& current_transfers = render_target_transfers[i]; + if (current_transfers.empty() && !resolve_clear_needed) { + continue; + } + + auto& dest_vulkan_rt = *static_cast(dest_rt); + RenderTargetKey dest_rt_key = dest_vulkan_rt.key(); + + // Late barriers in case there was cross-copying that prevented merging of + // barriers. + { + VkPipelineStageFlags dest_dst_stage_mask; + VkAccessFlags dest_dst_access_mask; + VkImageLayout dest_new_layout; + dest_vulkan_rt.GetDrawUsage(&dest_dst_stage_mask, &dest_dst_access_mask, + &dest_new_layout); + command_processor_.PushImageMemoryBarrier( + dest_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + dest_rt_key.is_depth + ? (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) + : VK_IMAGE_ASPECT_COLOR_BIT), + dest_vulkan_rt.current_stage_mask(), dest_dst_stage_mask, + dest_vulkan_rt.current_access_mask(), dest_dst_access_mask, + dest_vulkan_rt.current_layout(), dest_new_layout); + dest_vulkan_rt.SetUsage(dest_dst_stage_mask, dest_dst_access_mask, + dest_new_layout); + } + + // Get the objects needed for transfers to the destination. + // TODO(Triang3l): Reuse the guest render pass for transfers where possible + // (if the Vulkan format used for drawing is also usable for transfers - for + // instance, R8G8B8A8_UNORM can be used for both, so the guest pass can be + // reused, but R16G16B16A16_SFLOAT render targets use R16G16B16A16_UINT for + // transfers, so the transfer pass has to be separate) to avoid stores and + // loads on tile-based devices to make this actually applicable. Also + // overall perform all non-cross-copying transfers for the current + // framebuffer configuration in a single pass, to load / store only once. + RenderPassKey transfer_render_pass_key; + transfer_render_pass_key.msaa_samples = dest_rt_key.msaa_samples; + if (dest_rt_key.is_depth) { + transfer_render_pass_key.depth_and_color_used = 0b1; + transfer_render_pass_key.depth_format = dest_rt_key.GetDepthFormat(); + } else { + transfer_render_pass_key.depth_and_color_used = 0b1 << 1; + transfer_render_pass_key.color_0_view_format = + dest_rt_key.GetColorFormat(); + transfer_render_pass_key.color_rts_use_transfer_formats = 1; + } + VkRenderPass transfer_render_pass = GetRenderPass(transfer_render_pass_key); + if (transfer_render_pass == VK_NULL_HANDLE) { + continue; + } + const RenderTarget* + transfer_framebuffer_render_targets[1 + xenos::kMaxColorRenderTargets] = + {}; + transfer_framebuffer_render_targets[dest_rt_key.is_depth ? 0 : 1] = dest_rt; + const Framebuffer* transfer_framebuffer = GetFramebuffer( + transfer_render_pass_key, dest_rt_key.pitch_tiles_at_32bpp, + transfer_framebuffer_render_targets); + if (!transfer_framebuffer) { + continue; + } + // Don't enter the render pass immediately - may still insert source + // barriers later. 
+ + if (!current_transfers.empty()) { + uint32_t dest_pitch_tiles = dest_rt_key.GetPitchTiles(); + bool dest_is_64bpp = dest_rt_key.Is64bpp(); + + // Gather shader keys and sort to reduce pipeline state and binding + // switches. Also gather stencil rectangles to clear if needed. + bool need_stencil_bit_draws = + dest_rt_key.is_depth && !shader_stencil_export; + current_transfer_invocations_.clear(); + current_transfer_invocations_.reserve( + current_transfers.size() << uint32_t(need_stencil_bit_draws)); + uint32_t rt_sort_index = 0; + TransferShaderKey new_transfer_shader_key; + new_transfer_shader_key.dest_msaa_samples = dest_rt_key.msaa_samples; + new_transfer_shader_key.dest_resource_format = + dest_rt_key.resource_format; + uint32_t stencil_clear_rectangle_count = 0; + for (uint32_t j = 0; j <= uint32_t(need_stencil_bit_draws); ++j) { + // j == 0 - color or depth. + // j == 1 - stencil bits. + // Stencil bit writing always requires a different root signature, + // handle these separately. Stencil never has a host depth source. + // Clear previously set sort indices. + for (const Transfer& transfer : current_transfers) { + auto host_depth_source_vulkan_rt = + static_cast(transfer.host_depth_source); + if (host_depth_source_vulkan_rt) { + host_depth_source_vulkan_rt->SetTemporarySortIndex(UINT32_MAX); + } + assert_not_null(transfer.source); + auto& source_vulkan_rt = + *static_cast(transfer.source); + source_vulkan_rt.SetTemporarySortIndex(UINT32_MAX); + } + for (const Transfer& transfer : current_transfers) { + assert_not_null(transfer.source); + auto& source_vulkan_rt = + *static_cast(transfer.source); + VulkanRenderTarget* host_depth_source_vulkan_rt = + j ? nullptr + : static_cast(transfer.host_depth_source); + if (host_depth_source_vulkan_rt && + host_depth_source_vulkan_rt->temporary_sort_index() == + UINT32_MAX) { + host_depth_source_vulkan_rt->SetTemporarySortIndex(rt_sort_index++); + } + if (source_vulkan_rt.temporary_sort_index() == UINT32_MAX) { + source_vulkan_rt.SetTemporarySortIndex(rt_sort_index++); + } + RenderTargetKey source_rt_key = source_vulkan_rt.key(); + new_transfer_shader_key.source_msaa_samples = + source_rt_key.msaa_samples; + new_transfer_shader_key.source_resource_format = + source_rt_key.resource_format; + bool host_depth_source_is_copy = + host_depth_source_vulkan_rt == &dest_vulkan_rt; + // The host depth copy buffer has only raw samples. + new_transfer_shader_key.host_depth_source_msaa_samples = + (host_depth_source_vulkan_rt && !host_depth_source_is_copy) + ? host_depth_source_vulkan_rt->key().msaa_samples + : xenos::MsaaSamples::k1X; + if (j) { + new_transfer_shader_key.mode = + source_rt_key.is_depth ? TransferMode::kDepthToStencilBit + : TransferMode::kColorToStencilBit; + stencil_clear_rectangle_count += + transfer.GetRectangles(dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, + nullptr, resolve_clear_rectangle); + } else { + if (dest_rt_key.is_depth) { + if (host_depth_source_vulkan_rt) { + if (host_depth_source_is_copy) { + new_transfer_shader_key.mode = + source_rt_key.is_depth + ? TransferMode::kDepthAndHostDepthCopyToDepth + : TransferMode::kColorAndHostDepthCopyToDepth; + } else { + new_transfer_shader_key.mode = + source_rt_key.is_depth + ? TransferMode::kDepthAndHostDepthToDepth + : TransferMode::kColorAndHostDepthToDepth; + } + } else { + new_transfer_shader_key.mode = + source_rt_key.is_depth ? 
TransferMode::kDepthToDepth + : TransferMode::kColorToDepth; + } + } else { + new_transfer_shader_key.mode = source_rt_key.is_depth + ? TransferMode::kDepthToColor + : TransferMode::kColorToColor; + } + } + current_transfer_invocations_.emplace_back(transfer, + new_transfer_shader_key); + if (j) { + current_transfer_invocations_.back().transfer.host_depth_source = + nullptr; + } + } + } + std::sort(current_transfer_invocations_.begin(), + current_transfer_invocations_.end()); + + for (auto it = current_transfer_invocations_.cbegin(); + it != current_transfer_invocations_.cend(); ++it) { + assert_not_null(it->transfer.source); + auto& source_vulkan_rt = + *static_cast(it->transfer.source); + command_processor_.PushImageMemoryBarrier( + source_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + source_vulkan_rt.key().is_depth + ? (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) + : VK_IMAGE_ASPECT_COLOR_BIT), + source_vulkan_rt.current_stage_mask(), kSourceStageMask, + source_vulkan_rt.current_access_mask(), kSourceAccessMask, + source_vulkan_rt.current_layout(), kSourceLayout); + source_vulkan_rt.SetUsage(kSourceStageMask, kSourceAccessMask, + kSourceLayout); + auto host_depth_source_vulkan_rt = + static_cast(it->transfer.host_depth_source); + if (host_depth_source_vulkan_rt) { + TransferShaderKey transfer_shader_key = it->shader_key; + if (transfer_shader_key.mode == + TransferMode::kDepthAndHostDepthCopyToDepth || + transfer_shader_key.mode == + TransferMode::kColorAndHostDepthCopyToDepth) { + // Reading copied host depth from the EDRAM buffer. + UseEdramBuffer(EdramBufferUsage::kFragmentRead); + } else { + // Reading host depth from the texture. + command_processor_.PushImageMemoryBarrier( + host_depth_source_vulkan_rt->image(), + ui::vulkan::util::InitializeSubresourceRange( + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT), + host_depth_source_vulkan_rt->current_stage_mask(), + kSourceStageMask, + host_depth_source_vulkan_rt->current_access_mask(), + kSourceAccessMask, + host_depth_source_vulkan_rt->current_layout(), kSourceLayout); + host_depth_source_vulkan_rt->SetUsage( + kSourceStageMask, kSourceAccessMask, kSourceLayout); + } + } + } + + // Perform the transfers for the render target. 
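+ // For stencil bit transfers, the affected regions are first cleared to
+ // 0 with vkCmdClearAttachments, so the per-bit draws afterwards only
+ // have to set the bits that are present in the source.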
+ + command_processor_.SubmitBarriersAndEnterRenderTargetCacheRenderPass( + transfer_render_pass, transfer_framebuffer); + + if (stencil_clear_rectangle_count) { + VkClearAttachment* stencil_clear_attachment; + VkClearRect* stencil_clear_rect_write_ptr; + command_buffer.CmdClearAttachmentsEmplace(1, stencil_clear_attachment, + stencil_clear_rectangle_count, + stencil_clear_rect_write_ptr); + stencil_clear_attachment->aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT; + stencil_clear_attachment->colorAttachment = 0; + stencil_clear_attachment->clearValue.depthStencil.depth = 0.0f; + stencil_clear_attachment->clearValue.depthStencil.stencil = 0; + for (const Transfer& transfer : current_transfers) { + Transfer::Rectangle transfer_stencil_clear_rectangles + [Transfer::kMaxRectanglesWithCutout]; + uint32_t transfer_stencil_clear_rectangle_count = + transfer.GetRectangles(dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, + transfer_stencil_clear_rectangles, + resolve_clear_rectangle); + for (uint32_t j = 0; j < transfer_stencil_clear_rectangle_count; + ++j) { + const Transfer::Rectangle& stencil_clear_rectangle = + transfer_stencil_clear_rectangles[j]; + stencil_clear_rect_write_ptr->rect.offset.x = + int32_t(stencil_clear_rectangle.x_pixels * resolution_scale_x_); + stencil_clear_rect_write_ptr->rect.offset.y = + int32_t(stencil_clear_rectangle.y_pixels * resolution_scale_y_); + stencil_clear_rect_write_ptr->rect.extent.width = + stencil_clear_rectangle.width_pixels * resolution_scale_x_; + stencil_clear_rect_write_ptr->rect.extent.height = + stencil_clear_rectangle.height_pixels * resolution_scale_y_; + stencil_clear_rect_write_ptr->baseArrayLayer = 0; + stencil_clear_rect_write_ptr->layerCount = 1; + ++stencil_clear_rect_write_ptr; + } + } + } + + // Prefer power of two viewports for exact division by simply biasing the + // exponent. + VkViewport transfer_viewport; + transfer_viewport.x = 0.0f; + transfer_viewport.y = 0.0f; + transfer_viewport.width = + float(std::min(xe::next_pow2(transfer_framebuffer->host_extent.width), + device_limits.maxViewportDimensions[0])); + transfer_viewport.height = float( + std::min(xe::next_pow2(transfer_framebuffer->host_extent.height), + device_limits.maxViewportDimensions[1])); + transfer_viewport.minDepth = 0.0f; + transfer_viewport.maxDepth = 1.0f; + command_processor_.SetViewport(transfer_viewport); + float pixels_to_ndc_x = 2.0f / transfer_viewport.width; + float pixels_to_ndc_y = 2.0f / transfer_viewport.height; + VkRect2D transfer_scissor; + transfer_scissor.offset.x = 0; + transfer_scissor.offset.y = 0; + transfer_scissor.extent = transfer_framebuffer->host_extent; + command_processor_.SetScissor(transfer_scissor); + + for (auto it = current_transfer_invocations_.cbegin(); + it != current_transfer_invocations_.cend(); ++it) { + const TransferInvocation& transfer_invocation_first = *it; + // Will be merging transfers from the same source into one mesh. 
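+ // After sorting, consecutive invocations that CanBeMergedIntoOneDraw
+ // reports as compatible are combined: their rectangles are appended to
+ // a single vertex buffer range and drawn together for each sample /
+ // stencil bit pipeline.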
+ auto it_merged_first = it, it_merged_last = it; + uint32_t transfer_rectangle_count = + transfer_invocation_first.transfer.GetRectangles( + dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, nullptr, + resolve_clear_rectangle); + for (auto it_merge = std::next(it_merged_first); + it_merge != current_transfer_invocations_.cend(); ++it_merge) { + if (!transfer_invocation_first.CanBeMergedIntoOneDraw(*it_merge)) { + break; + } + transfer_rectangle_count += it_merge->transfer.GetRectangles( + dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, nullptr, + resolve_clear_rectangle); + it_merged_last = it_merge; + } + assert_not_zero(transfer_rectangle_count); + // Skip the merged transfers in the subsequent iterations. + it = it_merged_last; + + assert_not_null(it->transfer.source); + auto& source_vulkan_rt = + *static_cast(it->transfer.source); + auto host_depth_source_vulkan_rt = + static_cast(it->transfer.host_depth_source); + TransferShaderKey transfer_shader_key = it->shader_key; + const TransferModeInfo& transfer_mode_info = + kTransferModes[size_t(transfer_shader_key.mode)]; + TransferPipelineLayoutIndex transfer_pipeline_layout_index = + transfer_mode_info.pipeline_layout; + const TransferPipelineLayoutInfo& transfer_pipeline_layout_info = + kTransferPipelineLayoutInfos[size_t( + transfer_pipeline_layout_index)]; + uint32_t transfer_sample_pipeline_count = + device_features.sampleRateShading + ? 1 + : uint32_t(1) << uint32_t(dest_rt_key.msaa_samples); + bool transfer_is_stencil_bit = + (transfer_pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordStencilMaskBit) != 0; + + uint32_t transfer_vertex_count = 6 * transfer_rectangle_count; + VkBuffer transfer_vertex_buffer; + VkDeviceSize transfer_vertex_buffer_offset; + float* transfer_rectangle_write_ptr = + reinterpret_cast(transfer_vertex_buffer_pool_->Request( + current_submission, sizeof(float) * 2 * transfer_vertex_count, + sizeof(float), transfer_vertex_buffer, + transfer_vertex_buffer_offset)); + if (!transfer_rectangle_write_ptr) { + continue; + } + for (auto it_merged = it_merged_first; it_merged <= it_merged_last; + ++it_merged) { + Transfer::Rectangle transfer_invocation_rectangles + [Transfer::kMaxRectanglesWithCutout]; + uint32_t transfer_invocation_rectangle_count = + it_merged->transfer.GetRectangles( + dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, + transfer_invocation_rectangles, resolve_clear_rectangle); + assert_not_zero(transfer_invocation_rectangle_count); + for (uint32_t j = 0; j < transfer_invocation_rectangle_count; ++j) { + const Transfer::Rectangle& transfer_rectangle = + transfer_invocation_rectangles[j]; + float transfer_rectangle_x0 = + -1.0f + transfer_rectangle.x_pixels * pixels_to_ndc_x; + float transfer_rectangle_y0 = + -1.0f + transfer_rectangle.y_pixels * pixels_to_ndc_y; + float transfer_rectangle_x1 = + transfer_rectangle_x0 + + transfer_rectangle.width_pixels * pixels_to_ndc_x; + float transfer_rectangle_y1 = + transfer_rectangle_y0 + + transfer_rectangle.height_pixels * pixels_to_ndc_y; + // O-* + // |/ + // * + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x0; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y0; + // *-* + // |/ + // O + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x0; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y1; + // *-O + // |/ + // * + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x1; + 
*(transfer_rectangle_write_ptr++) = transfer_rectangle_y0; + // O + // /| + // *-* + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x1; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y0; + // * + // /| + // O-* + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x0; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y1; + // * + // /| + // *-O + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x1; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y1; + } + } + command_buffer.CmdVkBindVertexBuffers(0, 1, &transfer_vertex_buffer, + &transfer_vertex_buffer_offset); + + const VkPipeline* transfer_pipelines = GetTransferPipelines( + TransferPipelineKey(transfer_render_pass_key, transfer_shader_key)); + if (!transfer_pipelines) { + continue; + } + command_processor_.BindExternalGraphicsPipeline(transfer_pipelines[0]); + if (last_transfer_pipeline_layout_index != + transfer_pipeline_layout_index) { + last_transfer_pipeline_layout_index = transfer_pipeline_layout_index; + transfer_descriptor_sets_bound = 0; + transfer_push_constants_set = 0; + } + + // Invalidate outdated bindings. + if (transfer_pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetHostDepthStencilTexturesBit) { + assert_not_null(host_depth_source_vulkan_rt); + VkDescriptorSet descriptor_set_host_depth_stencil_textures = + host_depth_source_vulkan_rt->GetDescriptorSetTransferSource(); + if (last_descriptor_set_host_depth_stencil_textures != + descriptor_set_host_depth_stencil_textures) { + last_descriptor_set_host_depth_stencil_textures = + descriptor_set_host_depth_stencil_textures; + transfer_descriptor_sets_bound &= + ~kTransferUsedDescriptorSetHostDepthStencilTexturesBit; + } + } + if (transfer_pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetDepthStencilTexturesBit) { + VkDescriptorSet descriptor_set_depth_stencil_textures = + source_vulkan_rt.GetDescriptorSetTransferSource(); + if (last_descriptor_set_depth_stencil_textures != + descriptor_set_depth_stencil_textures) { + last_descriptor_set_depth_stencil_textures = + descriptor_set_depth_stencil_textures; + transfer_descriptor_sets_bound &= + ~kTransferUsedDescriptorSetDepthStencilTexturesBit; + } + } + if (transfer_pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetColorTextureBit) { + VkDescriptorSet descriptor_set_color_texture = + source_vulkan_rt.GetDescriptorSetTransferSource(); + if (last_descriptor_set_color_texture != + descriptor_set_color_texture) { + last_descriptor_set_color_texture = descriptor_set_color_texture; + transfer_descriptor_sets_bound &= + ~kTransferUsedDescriptorSetColorTextureBit; + } + } + if (transfer_pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordHostDepthAddressBit) { + assert_not_null(host_depth_source_vulkan_rt); + RenderTargetKey host_depth_source_rt_key = + host_depth_source_vulkan_rt->key(); + TransferAddressConstant host_depth_address_constant; + host_depth_address_constant.dest_pitch = dest_pitch_tiles; + host_depth_address_constant.source_pitch = + host_depth_source_rt_key.GetPitchTiles(); + host_depth_address_constant.source_to_dest = + int32_t(dest_rt_key.base_tiles) - + int32_t(host_depth_source_rt_key.base_tiles); + if (last_host_depth_address_constant != host_depth_address_constant) { + last_host_depth_address_constant = host_depth_address_constant; + transfer_push_constants_set &= + ~kTransferUsedPushConstantDwordHostDepthAddressBit; + } + } + if 
(transfer_pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordAddressBit) { + RenderTargetKey source_rt_key = source_vulkan_rt.key(); + TransferAddressConstant address_constant; + address_constant.dest_pitch = dest_pitch_tiles; + address_constant.source_pitch = source_rt_key.GetPitchTiles(); + address_constant.source_to_dest = int32_t(dest_rt_key.base_tiles) - + int32_t(source_rt_key.base_tiles); + if (last_address_constant != address_constant) { + last_address_constant = address_constant; + transfer_push_constants_set &= + ~kTransferUsedPushConstantDwordAddressBit; + } + } + + // Apply the new bindings. + // TODO(Triang3l): Merge binding updates into spans. + VkPipelineLayout transfer_pipeline_layout = + transfer_pipeline_layouts_[size_t(transfer_pipeline_layout_index)]; + uint32_t transfer_descriptor_sets_unbound = + transfer_pipeline_layout_info.used_descriptor_sets & + ~transfer_descriptor_sets_bound; + if (transfer_descriptor_sets_unbound & + kTransferUsedDescriptorSetHostDepthBufferBit) { + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_GRAPHICS, transfer_pipeline_layout, + xe::bit_count(transfer_pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetHostDepthBufferBit - 1)), + 1, &edram_storage_buffer_descriptor_set_, 0, nullptr); + transfer_descriptor_sets_bound |= + kTransferUsedDescriptorSetHostDepthBufferBit; + } + if (transfer_descriptor_sets_unbound & + kTransferUsedDescriptorSetHostDepthStencilTexturesBit) { + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_GRAPHICS, transfer_pipeline_layout, + xe::bit_count( + transfer_pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetHostDepthStencilTexturesBit - 1)), + 1, &last_descriptor_set_host_depth_stencil_textures, 0, nullptr); + transfer_descriptor_sets_bound |= + kTransferUsedDescriptorSetHostDepthStencilTexturesBit; + } + if (transfer_descriptor_sets_unbound & + kTransferUsedDescriptorSetDepthStencilTexturesBit) { + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_GRAPHICS, transfer_pipeline_layout, + xe::bit_count( + transfer_pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetDepthStencilTexturesBit - 1)), + 1, &last_descriptor_set_depth_stencil_textures, 0, nullptr); + transfer_descriptor_sets_bound |= + kTransferUsedDescriptorSetDepthStencilTexturesBit; + } + if (transfer_descriptor_sets_unbound & + kTransferUsedDescriptorSetColorTextureBit) { + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_GRAPHICS, transfer_pipeline_layout, + xe::bit_count(transfer_pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetColorTextureBit - 1)), + 1, &last_descriptor_set_color_texture, 0, nullptr); + transfer_descriptor_sets_bound |= + kTransferUsedDescriptorSetColorTextureBit; + } + uint32_t transfer_push_constants_unset = + transfer_pipeline_layout_info.used_push_constant_dwords & + ~transfer_push_constants_set; + if (transfer_push_constants_unset & + kTransferUsedPushConstantDwordHostDepthAddressBit) { + command_buffer.CmdVkPushConstants( + transfer_pipeline_layout, VK_SHADER_STAGE_FRAGMENT_BIT, + sizeof(uint32_t) * + xe::bit_count( + transfer_pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordHostDepthAddressBit - 1)), + sizeof(uint32_t), &last_host_depth_address_constant); + transfer_push_constants_set |= + kTransferUsedPushConstantDwordHostDepthAddressBit; + } + if (transfer_push_constants_unset & + 
kTransferUsedPushConstantDwordAddressBit) { + command_buffer.CmdVkPushConstants( + transfer_pipeline_layout, VK_SHADER_STAGE_FRAGMENT_BIT, + sizeof(uint32_t) * + xe::bit_count( + transfer_pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordAddressBit - 1)), + sizeof(uint32_t), &last_address_constant); + transfer_push_constants_set |= + kTransferUsedPushConstantDwordAddressBit; + } + + for (uint32_t j = 0; j < transfer_sample_pipeline_count; ++j) { + if (j) { + command_processor_.BindExternalGraphicsPipeline( + transfer_pipelines[j]); + } + for (uint32_t k = 0; k < uint32_t(transfer_is_stencil_bit ? 8 : 1); + ++k) { + if (transfer_is_stencil_bit) { + uint32_t transfer_stencil_bit = uint32_t(1) << k; + command_buffer.CmdVkPushConstants( + transfer_pipeline_layout, VK_SHADER_STAGE_FRAGMENT_BIT, + sizeof(uint32_t) * + xe::bit_count( + transfer_pipeline_layout_info + .used_push_constant_dwords & + (kTransferUsedPushConstantDwordStencilMaskBit - 1)), + sizeof(uint32_t), &transfer_stencil_bit); + command_buffer.CmdVkSetStencilWriteMask( + VK_STENCIL_FACE_FRONT_AND_BACK, transfer_stencil_bit); + } + command_buffer.CmdVkDraw(transfer_vertex_count, 1, 0, 0); + } + } + } + } + + // Perform the clear. + if (resolve_clear_needed) { + command_processor_.SubmitBarriersAndEnterRenderTargetCacheRenderPass( + transfer_render_pass, transfer_framebuffer); + VkClearAttachment resolve_clear_attachment; + resolve_clear_attachment.colorAttachment = 0; + std::memset(&resolve_clear_attachment.clearValue, 0, + sizeof(resolve_clear_attachment.clearValue)); + uint64_t clear_value = render_target_resolve_clear_values[i]; + if (dest_rt_key.is_depth) { + resolve_clear_attachment.aspectMask = + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + uint32_t depth_guest_clear_value = + (uint32_t(clear_value) >> 8) & 0xFFFFFF; + switch (dest_rt_key.GetDepthFormat()) { + case xenos::DepthRenderTargetFormat::kD24S8: + resolve_clear_attachment.clearValue.depthStencil.depth = + xenos::UNorm24To32(depth_guest_clear_value); + break; + case xenos::DepthRenderTargetFormat::kD24FS8: + // Taking [0, 2) -> [0, 1) remapping into account. 
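+ // The guest 20e4 depth encodes [0, 2) while the host depth attachment
+ // stores [0, 1], so the converted value is halved - for example, the
+ // largest encodable guest clear value (just under 2.0) becomes a host
+ // clear depth just under 1.0.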
+ resolve_clear_attachment.clearValue.depthStencil.depth = + xenos::Float20e4To32(depth_guest_clear_value) * 0.5f; + break; + } + resolve_clear_attachment.clearValue.depthStencil.stencil = + uint32_t(clear_value) & 0xFF; + } else { + resolve_clear_attachment.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + switch (dest_rt_key.GetColorFormat()) { + case xenos::ColorRenderTargetFormat::k_8_8_8_8: + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { + for (uint32_t j = 0; j < 4; ++j) { + resolve_clear_attachment.clearValue.color.float32[j] = + ((clear_value >> (j * 8)) & 0xFF) * (1.0f / 0xFF); + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { + for (uint32_t j = 0; j < 3; ++j) { + resolve_clear_attachment.clearValue.color.float32[j] = + ((clear_value >> (j * 10)) & 0x3FF) * (1.0f / 0x3FF); + } + resolve_clear_attachment.clearValue.color.float32[3] = + ((clear_value >> 30) & 0x3) * (1.0f / 0x3); + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16: { + for (uint32_t j = 0; j < 3; ++j) { + resolve_clear_attachment.clearValue.color.float32[j] = + xenos::Float7e3To32((clear_value >> (j * 10)) & 0x3FF); + } + resolve_clear_attachment.clearValue.color.float32[3] = + ((clear_value >> 30) & 0x3) * (1.0f / 0x3); + } break; + case xenos::ColorRenderTargetFormat::k_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: { + // Using uint for transfers and clears of both. Disregarding the + // current -32...32 vs. -1...1 settings for consistency with color + // clear via depth aliasing. + // TODO(Triang3l): Handle cases of unsupported multisampled 16_UINT + // and completely unsupported 16_UNORM. + for (uint32_t j = 0; j < 2; ++j) { + resolve_clear_attachment.clearValue.color.uint32[j] = + uint32_t(clear_value >> (j * 16)) & 0xFFFF; + } + } break; + case xenos::ColorRenderTargetFormat::k_16_16_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT: { + // Using uint for transfers and clears of both. Disregarding the + // current -32...32 vs. -1...1 settings for consistency with color + // clear via depth aliasing. + // TODO(Triang3l): Handle cases of unsupported multisampled 16_UINT + // and completely unsupported 16_UNORM. + for (uint32_t j = 0; j < 4; ++j) { + resolve_clear_attachment.clearValue.color.uint32[j] = + uint32_t(clear_value >> (j * 16)) & 0xFFFF; + } + } break; + case xenos::ColorRenderTargetFormat::k_32_FLOAT: { + // Using uint for proper denormal and NaN handling. + resolve_clear_attachment.clearValue.color.uint32[0] = + uint32_t(clear_value); + } break; + case xenos::ColorRenderTargetFormat::k_32_32_FLOAT: { + // Using uint for proper denormal and NaN handling. 
+ resolve_clear_attachment.clearValue.color.uint32[0] = + uint32_t(clear_value); + resolve_clear_attachment.clearValue.color.uint32[1] = + uint32_t(clear_value >> 32); + } break; + } + } + command_buffer.CmdVkClearAttachments(1, &resolve_clear_attachment, 1, + &resolve_clear_rect); + } + } +} + } // namespace vulkan } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.h b/src/xenia/gpu/vulkan/vulkan_render_target_cache.h index 97bb690af..c98da4974 100644 --- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.h +++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.h @@ -10,13 +10,20 @@ #ifndef XENIA_GPU_VULKAN_VULKAN_RENDER_TARGET_CACHE_H_ #define XENIA_GPU_VULKAN_VULKAN_RENDER_TARGET_CACHE_H_ +#include #include #include +#include +#include #include #include "xenia/base/hash.h" +#include "xenia/base/xxhash.h" #include "xenia/gpu/render_target_cache.h" +#include "xenia/gpu/xenos.h" +#include "xenia/ui/vulkan/single_layout_descriptor_set_pool.h" #include "xenia/ui/vulkan/vulkan_provider.h" +#include "xenia/ui/vulkan/vulkan_upload_buffer_pool.h" namespace xe { namespace gpu { @@ -28,8 +35,12 @@ class VulkanRenderTargetCache final : public RenderTargetCache { public: union RenderPassKey { struct { - // If emulating 2x as 4x, set this to 4x for 2x not to create unnecessary - // render pass objects. + // If emulating 2x as 4x, this is still 2x for simplicity of using this + // field to make guest-related decisions. Render pass objects are not very + // expensive, and their dependencies can't be shared between 2x-as-4x and + // true 4x MSAA passes (framebuffers because render target cache render + // targets are different for 2x and 4x guest MSAA, pipelines because the + // sample mask will have 2 samples excluded for 2x-as-4x). xenos::MsaaSamples msaa_samples : xenos::kMsaaSamplesBits; // 2 // << 0 is depth, << 1...4 is color. uint32_t depth_and_color_used : 1 + xenos::kMaxColorRenderTargets; // 7 @@ -46,7 +57,8 @@ class VulkanRenderTargetCache final : public RenderTargetCache { xenos::ColorRenderTargetFormat color_2_view_format : xenos::kColorRenderTargetFormatBits; // 20 xenos::ColorRenderTargetFormat color_3_view_format - : xenos::kColorRenderTargetFormatBits; // 24 + : xenos::kColorRenderTargetFormatBits; // 24 + uint32_t color_rts_use_transfer_formats : 1; // 25 }; uint32_t key = 0; struct Hasher { @@ -60,6 +72,9 @@ class VulkanRenderTargetCache final : public RenderTargetCache { bool operator!=(const RenderPassKey& other_key) const { return !(*this == other_key); } + bool operator<(const RenderPassKey& other_key) const { + return key < other_key.key; + } }; static_assert_size(RenderPassKey, sizeof(uint32_t)); @@ -78,12 +93,14 @@ class VulkanRenderTargetCache final : public RenderTargetCache { void Shutdown(bool from_destructor = false); void ClearCache() override; - // TOOD(Triang3l): Fragment shader interlock. + void CompletedSubmissionUpdated(); + void EndSubmission(); + + // TODO(Triang3l): Fragment shader interlock. Path GetPath() const override { return Path::kHostRenderTargets; } - // TODO(Triang3l): Resolution scaling. 
- uint32_t GetResolutionScaleX() const override { return 1; } - uint32_t GetResolutionScaleY() const override { return 1; } + uint32_t GetResolutionScaleX() const override { return resolution_scale_x_; } + uint32_t GetResolutionScaleY() const override { return resolution_scale_y_; } bool Update(bool is_rasterization_done, uint32_t shader_writes_color_targets) override; @@ -98,6 +115,17 @@ class VulkanRenderTargetCache final : public RenderTargetCache { return last_update_framebuffer_; } + bool msaa_2x_attachments_supported() const { + return msaa_2x_attachments_supported_; + } + bool msaa_2x_no_attachments_supported() const { + return msaa_2x_no_attachments_supported_; + } + bool IsMsaa2xSupported(bool subpass_has_attachments) const { + return subpass_has_attachments ? msaa_2x_attachments_supported_ + : msaa_2x_no_attachments_supported_; + } + // Returns the render pass object, or VK_NULL_HANDLE if failed to create. // A render pass managed by the render target cache may be ended and resumed // at any time (to allow for things like copying and texture loading). @@ -110,6 +138,99 @@ class VulkanRenderTargetCache final : public RenderTargetCache { bool* is_integer_out = nullptr) const; protected: + uint32_t GetMaxRenderTargetWidth() const override; + uint32_t GetMaxRenderTargetHeight() const override; + + RenderTarget* CreateRenderTarget(RenderTargetKey key) override; + + // TODO(Triang3l): Check actual unorm24 support. + bool IsHostDepthEncodingDifferent( + xenos::DepthRenderTargetFormat format) const override { + return true; + } + + private: + enum class EdramBufferUsage { + // There's no need for combined fragment and compute usages. + // With host render targets, the usual usage sequence is as follows: + // - Optionally compute writes - host depth copy storing for EDRAM range + // ownership transfers. + // - Optionally fragment reads - host depth copy storing for EDRAM range + // ownership transfers. + // - Compute writes - copying from host render targets during resolving. + // - Compute reads - writing to the shared memory during resolving. + // With the render backend implementation based on fragment shader + // interlocks, it's: + // - Fragment reads and writes - depth / stencil and color operations. + // - Compute reads - writing to the shared memory during resolving. + // So, fragment reads and compute reads normally don't follow each other, + // and there's no need to amortize the cost of a read > read barrier in an + // exceptional situation by using a wider barrier in the normal scenario. + + // Host depth copy storing. + kFragmentRead, + // Fragment shader interlock depth / stencil and color operations. + kFragmentReadWrite, + // Resolve - copying to the shared memory. + kComputeRead, + // Resolve - copying from host render targets. + kComputeWrite, + // Trace recording. + kTransferRead, + // Trace playback. + kTransferWrite, + }; + enum class EdramBufferModificationStatus { + // The values are ordered by how strong the barrier conditions are. + // No uncommitted shader writes. + kUnmodified, + // Need to commit before the next fragment shader interlock usage with + // overlap. + kViaFragmentShaderInterlock, + // Need to commit before any next fragment shader interlock usage. 
+ kViaUnordered, + }; + static void GetEdramBufferUsageMasks(EdramBufferUsage usage, + VkPipelineStageFlags& stage_mask_out, + VkAccessFlags& access_mask_out); + void UseEdramBuffer(EdramBufferUsage new_usage); + void MarkEdramBufferModified( + EdramBufferModificationStatus modification_status = + EdramBufferModificationStatus::kViaUnordered); + void CommitEdramBufferShaderWrites( + EdramBufferModificationStatus commit_status = + EdramBufferModificationStatus::kViaFragmentShaderInterlock); + + VulkanCommandProcessor& command_processor_; + + uint32_t resolution_scale_x_ = 1; + uint32_t resolution_scale_y_ = 1; + + // Accessible in fragment and compute shaders. + VkDescriptorSetLayout descriptor_set_layout_storage_buffer_ = VK_NULL_HANDLE; + VkDescriptorSetLayout descriptor_set_layout_sampled_image_ = VK_NULL_HANDLE; + VkDescriptorSetLayout descriptor_set_layout_sampled_image_x2_ = + VK_NULL_HANDLE; + + std::unique_ptr + descriptor_set_pool_sampled_image_; + std::unique_ptr + descriptor_set_pool_sampled_image_x2_; + + VkDeviceMemory edram_buffer_memory_ = VK_NULL_HANDLE; + VkBuffer edram_buffer_ = VK_NULL_HANDLE; + EdramBufferUsage edram_buffer_usage_; + EdramBufferModificationStatus edram_buffer_modification_status_ = + EdramBufferModificationStatus::kUnmodified; + VkDescriptorPool edram_storage_buffer_descriptor_pool_ = VK_NULL_HANDLE; + VkDescriptorSet edram_storage_buffer_descriptor_set_; + + // RenderPassKey::key -> VkRenderPass. + // VK_NULL_HANDLE if failed to create. + std::unordered_map render_passes_; + + // For host render targets. + // Can only be destroyed when framebuffers referencing it are destroyed! class VulkanRenderTarget final : public RenderTarget { public: @@ -131,27 +252,45 @@ class VulkanRenderTargetCache final : public RenderTargetCache { // Takes ownership of the Vulkan objects passed to the constructor. VulkanRenderTarget(RenderTargetKey key, - const ui::vulkan::VulkanProvider& provider, + VulkanRenderTargetCache& render_target_cache, VkImage image, VkDeviceMemory memory, VkImageView view_depth_color, VkImageView view_depth_stencil, VkImageView view_stencil, VkImageView view_srgb, - VkImageView view_color_transfer_separate) + VkImageView view_color_transfer_separate, + size_t descriptor_set_index_transfer_source) : RenderTarget(key), - provider_(provider), + render_target_cache_(render_target_cache), image_(image), memory_(memory), view_depth_color_(view_depth_color), view_depth_stencil_(view_depth_stencil), view_stencil_(view_stencil), view_srgb_(view_srgb), - view_color_transfer_separate_(view_color_transfer_separate) {} + view_color_transfer_separate_(view_color_transfer_separate), + descriptor_set_index_transfer_source_( + descriptor_set_index_transfer_source) {} ~VulkanRenderTarget(); VkImage image() const { return image_; } VkImageView view_depth_color() const { return view_depth_color_; } VkImageView view_depth_stencil() const { return view_depth_stencil_; } + VkImageView view_color_transfer_separate() const { + return view_color_transfer_separate_; + } + VkImageView view_color_transfer() const { + return view_color_transfer_separate_ != VK_NULL_HANDLE + ? view_color_transfer_separate_ + : view_depth_color_; + } + VkDescriptorSet GetDescriptorSetTransferSource() const { + ui::vulkan::SingleLayoutDescriptorSetPool& descriptor_set_pool = + key().is_depth + ? 
*render_target_cache_.descriptor_set_pool_sampled_image_x2_ + : *render_target_cache_.descriptor_set_pool_sampled_image_; + return descriptor_set_pool.Get(descriptor_set_index_transfer_source_); + } static void GetDrawUsage(bool is_depth, VkPipelineStageFlags* stage_mask_out, @@ -185,8 +324,13 @@ class VulkanRenderTargetCache final : public RenderTargetCache { current_layout_ = layout; } + uint32_t temporary_sort_index() const { return temporary_sort_index_; } + void SetTemporarySortIndex(uint32_t index) { + temporary_sort_index_ = index; + } + private: - const ui::vulkan::VulkanProvider& provider_; + VulkanRenderTargetCache& render_target_cache_; VkImage image_; VkDeviceMemory memory_; @@ -200,30 +344,17 @@ class VulkanRenderTargetCache final : public RenderTargetCache { VkImageView view_srgb_; VkImageView view_color_transfer_separate_; + // 2 sampled images for depth / stencil, 1 sampled image for color. + size_t descriptor_set_index_transfer_source_; + VkPipelineStageFlags current_stage_mask_ = 0; VkAccessFlags current_access_mask_ = 0; VkImageLayout current_layout_ = VK_IMAGE_LAYOUT_UNDEFINED; + + // Temporary storage for indices in operations like transfers and dumps. + uint32_t temporary_sort_index_ = 0; }; - uint32_t GetMaxRenderTargetWidth() const override; - uint32_t GetMaxRenderTargetHeight() const override; - - RenderTarget* CreateRenderTarget(RenderTargetKey key) override; - - // TODO(Triang3l): Check actual unorm24 support. - bool IsHostDepthEncodingDifferent( - xenos::DepthRenderTargetFormat format) const override { - return true; - } - - private: - VulkanCommandProcessor& command_processor_; - - // RenderPassKey::key -> VkRenderPass. - std::unordered_map render_passes_; - - // For host render targets. - struct FramebufferKey { RenderPassKey render_pass_key; @@ -254,13 +385,276 @@ class VulkanRenderTargetCache final : public RenderTargetCache { void Reset() { std::memset(this, 0, sizeof(*this)); } }; + enum TransferUsedDescriptorSet : uint32_t { + // Ordered from the least to the most frequently changed. + kTransferUsedDescriptorSetHostDepthBuffer, + kTransferUsedDescriptorSetHostDepthStencilTextures, + kTransferUsedDescriptorSetDepthStencilTextures, + // Mutually exclusive with kTransferUsedDescriptorSetDepthStencilTextures. + kTransferUsedDescriptorSetColorTexture, + + kTransferUsedDescriptorSetCount, + + kTransferUsedDescriptorSetHostDepthBufferBit = + uint32_t(1) << kTransferUsedDescriptorSetHostDepthBuffer, + kTransferUsedDescriptorSetHostDepthStencilTexturesBit = + uint32_t(1) << kTransferUsedDescriptorSetHostDepthStencilTextures, + kTransferUsedDescriptorSetDepthStencilTexturesBit = + uint32_t(1) << kTransferUsedDescriptorSetDepthStencilTextures, + kTransferUsedDescriptorSetColorTextureBit = + uint32_t(1) << kTransferUsedDescriptorSetColorTexture, + }; + + // 32-bit push constants (for simplicity of size calculation and to avoid + // std140 packing issues). + enum TransferUsedPushConstantDword : uint32_t { + kTransferUsedPushConstantDwordHostDepthAddress, + kTransferUsedPushConstantDwordAddress, + // Changed 8 times per transfer. 
+ kTransferUsedPushConstantDwordStencilMask, + + kTransferUsedPushConstantDwordCount, + + kTransferUsedPushConstantDwordHostDepthAddressBit = + uint32_t(1) << kTransferUsedPushConstantDwordHostDepthAddress, + kTransferUsedPushConstantDwordAddressBit = + uint32_t(1) << kTransferUsedPushConstantDwordAddress, + kTransferUsedPushConstantDwordStencilMaskBit = + uint32_t(1) << kTransferUsedPushConstantDwordStencilMask, + }; + + enum class TransferPipelineLayoutIndex { + kColor, + kDepth, + kColorToStencilBit, + kDepthToStencilBit, + kColorAndHostDepthTexture, + kColorAndHostDepthBuffer, + kDepthAndHostDepthTexture, + kDepthAndHostDepthBuffer, + + kCount, + }; + + struct TransferPipelineLayoutInfo { + uint32_t used_descriptor_sets; + uint32_t used_push_constant_dwords; + }; + + static const TransferPipelineLayoutInfo + kTransferPipelineLayoutInfos[size_t(TransferPipelineLayoutIndex::kCount)]; + + enum class TransferMode : uint32_t { + kColorToDepth, + kColorToColor, + + kDepthToDepth, + kDepthToColor, + + kColorToStencilBit, + kDepthToStencilBit, + + // Two-source modes, using the host depth if it, when converted to the guest + // format, matches what's in the owner source (not modified, keep host + // precision), or the guest data otherwise (significantly modified, possibly + // cleared). Stencil for FragStencilRef is always taken from the guest + // source. + + kColorAndHostDepthToDepth, + // When using different source and destination depth formats. + kDepthAndHostDepthToDepth, + + // If host depth is fetched, but it's the same image as the destination, + // it's copied to the EDRAM buffer (but since it's just a scratch buffer, + // with tiles laid out linearly with the same pitch as in the original + // render target; also no swapping of 40-sample columns as opposed to the + // host render target - this is done only for the color source) and fetched + // from there instead of the host depth texture. + kColorAndHostDepthCopyToDepth, + kDepthAndHostDepthCopyToDepth, + + kCount, + }; + + enum class TransferOutput { + kColor, + kDepth, + kStencilBit, + }; + + struct TransferModeInfo { + TransferOutput output; + TransferPipelineLayoutIndex pipeline_layout; + }; + + static const TransferModeInfo kTransferModes[size_t(TransferMode::kCount)]; + + union TransferShaderKey { + uint32_t key; + struct { + xenos::MsaaSamples dest_msaa_samples : xenos::kMsaaSamplesBits; + uint32_t dest_color_rt_index : xenos::kColorRenderTargetIndexBits; + uint32_t dest_resource_format : xenos::kRenderTargetFormatBits; + xenos::MsaaSamples source_msaa_samples : xenos::kMsaaSamplesBits; + // Always 1x when the host depth is a copy from a buffer rather than an + // image, not to create the same pipeline for different MSAA sample counts + // as it doesn't matter in this case. + xenos::MsaaSamples host_depth_source_msaa_samples + : xenos::kMsaaSamplesBits; + uint32_t source_resource_format : xenos::kRenderTargetFormatBits; + + // Last bits because this affects the pipeline layout - after sorting, + // only change it as fewer times as possible. Depth buffers have an + // additional stencil texture. 
+ static_assert(size_t(TransferMode::kCount) <= (size_t(1) << 4)); + TransferMode mode : 4; + }; + + TransferShaderKey() : key(0) { static_assert_size(*this, sizeof(key)); } + + struct Hasher { + size_t operator()(const TransferShaderKey& key) const { + return std::hash{}(key.key); + } + }; + bool operator==(const TransferShaderKey& other_key) const { + return key == other_key.key; + } + bool operator!=(const TransferShaderKey& other_key) const { + return !(*this == other_key); + } + bool operator<(const TransferShaderKey& other_key) const { + return key < other_key.key; + } + }; + + struct TransferPipelineKey { + RenderPassKey render_pass_key; + TransferShaderKey shader_key; + + TransferPipelineKey(RenderPassKey render_pass_key, + TransferShaderKey shader_key) + : render_pass_key(render_pass_key), shader_key(shader_key) {} + + struct Hasher { + size_t operator()(const TransferPipelineKey& key) const { + XXH3_state_t hash_state; + XXH3_64bits_reset(&hash_state); + XXH3_64bits_update(&hash_state, &key.render_pass_key, + sizeof(key.render_pass_key)); + XXH3_64bits_update(&hash_state, &key.shader_key, + sizeof(key.shader_key)); + return static_cast(XXH3_64bits_digest(&hash_state)); + } + }; + bool operator==(const TransferPipelineKey& other_key) const { + return render_pass_key == other_key.render_pass_key && + shader_key == other_key.shader_key; + } + bool operator!=(const TransferPipelineKey& other_key) const { + return !(*this == other_key); + } + bool operator<(const TransferPipelineKey& other_key) const { + if (render_pass_key != other_key.render_pass_key) { + return render_pass_key < other_key.render_pass_key; + } + return shader_key < other_key.shader_key; + } + }; + + union TransferAddressConstant { + uint32_t constant; + struct { + // All in tiles. + uint32_t dest_pitch : xenos::kEdramPitchTilesBits; + uint32_t source_pitch : xenos::kEdramPitchTilesBits; + // Safe to use 12 bits for signed difference - no ownership transfer can + // ever occur between render targets with EDRAM base >= 2048 as this would + // result in 0-length spans. 10 + 10 + 12 is exactly 32, any more bits, + // and more root 32-bit constants will be used. + // Destination base in tiles minus source base in tiles (not vice versa + // because this is a transform of the coordinate system, not addresses + // themselves). + // 0 for host_depth_source_is_copy (ignored in this case anyway as + // destination == source anyway). + int32_t source_to_dest : xenos::kEdramBaseTilesBits; + }; + TransferAddressConstant() : constant(0) { + static_assert_size(*this, sizeof(constant)); + } + bool operator==(const TransferAddressConstant& other_constant) const { + return constant == other_constant.constant; + } + bool operator!=(const TransferAddressConstant& other_constant) const { + return !(*this == other_constant); + } + }; + + struct TransferInvocation { + Transfer transfer; + TransferShaderKey shader_key; + TransferInvocation(const Transfer& transfer, + const TransferShaderKey& shader_key) + : transfer(transfer), shader_key(shader_key) {} + bool operator<(const TransferInvocation& other_invocation) { + // TODO(Triang3l): See if it may be better to sort by the source in the + // first place, especially when reading the same data multiple times (like + // to write the stencil bits after depth) for better read locality. + // Sort by the shader key primarily to reduce pipeline state (context) + // switches. 
+ if (shader_key != other_invocation.shader_key) { + return shader_key < other_invocation.shader_key; + } + // Host depth render targets are changed rarely if they exist, won't save + // many binding changes, ignore them for simplicity (their existence is + // caught by the shader key change). + assert_not_null(transfer.source); + assert_not_null(other_invocation.transfer.source); + uint32_t source_index = + static_cast(transfer.source) + ->temporary_sort_index(); + uint32_t other_source_index = static_cast( + other_invocation.transfer.source) + ->temporary_sort_index(); + if (source_index != other_source_index) { + return source_index < other_source_index; + } + return transfer.start_tiles < other_invocation.transfer.start_tiles; + } + bool CanBeMergedIntoOneDraw( + const TransferInvocation& other_invocation) const { + return shader_key == other_invocation.shader_key && + transfer.AreSourcesSame(other_invocation.transfer); + } + }; + // Returns the framebuffer object, or VK_NULL_HANDLE if failed to create. const Framebuffer* GetFramebuffer( RenderPassKey render_pass_key, uint32_t pitch_tiles_at_32bpp, const RenderTarget* const* depth_and_color_render_targets); + VkShaderModule GetTransferShader(TransferShaderKey key); + // With sample-rate shading, returns a pointer to one pipeline. Without + // sample-rate shading, returns a pointer to as many pipelines as there are + // samples. If there was a failure to create a pipeline, returns nullptr. + VkPipeline const* GetTransferPipelines(TransferPipelineKey key); + + // Do ownership transfers for render targets - each render target / vector may + // be null / empty in case there's nothing to do for them. + // resolve_clear_rectangle is expected to be provided by + // PrepareHostRenderTargetsResolveClear which should do all the needed size + // bound checks. + void PerformTransfersAndResolveClears( + uint32_t render_target_count, RenderTarget* const* render_targets, + const std::vector* render_target_transfers, + const uint64_t* render_target_resolve_clear_values = nullptr, + const Transfer::Rectangle* resolve_clear_rectangle = nullptr); + bool gamma_render_target_as_srgb_ = false; + bool msaa_2x_attachments_supported_ = false; + bool msaa_2x_no_attachments_supported_ = false; + std::unordered_map framebuffers_; @@ -271,6 +665,32 @@ class VulkanRenderTargetCache final : public RenderTargetCache { last_update_framebuffer_attachments_[1 + xenos::kMaxColorRenderTargets] = {}; const Framebuffer* last_update_framebuffer_ = VK_NULL_HANDLE; + + // Set 0 - EDRAM storage buffer, set 1 - source depth sampled image (and + // unused stencil from the transfer descriptor set), HostDepthStoreConstants + // passed via push constants. + VkPipelineLayout host_depth_store_pipeline_layout_ = VK_NULL_HANDLE; + VkPipeline host_depth_store_pipelines_[size_t(xenos::MsaaSamples::k4X) + 1] = + {}; + + std::unique_ptr + transfer_vertex_buffer_pool_; + VkShaderModule transfer_passthrough_vertex_shader_ = VK_NULL_HANDLE; + VkPipelineLayout transfer_pipeline_layouts_[size_t( + TransferPipelineLayoutIndex::kCount)] = {}; + // VK_NULL_HANDLE if failed to create. + std::unordered_map + transfer_shaders_; + // With sample-rate shading, one pipeline per entry. Without sample-rate + // shading, one pipeline per sample per entry. VK_NULL_HANDLE if failed to + // create. + std::unordered_map, + TransferPipelineKey::Hasher> + transfer_pipelines_; + + // Temporary storage for PerformTransfersAndResolveClears. 
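To make the TransferAddressConstant packing above concrete: the two pitches and the signed base difference share a single 32-bit push constant dword. A self-contained sketch, assuming the bit widths implied by the "10 + 10 + 12 is exactly 32" comment (the ExampleTransferAddressConstant type is an illustration, not the class member itself):

    #include <cassert>
    #include <cstdint>

    // Bit widths implied by the comment above (xenos::kEdramPitchTilesBits and
    // xenos::kEdramBaseTilesBits).
    constexpr uint32_t kExamplePitchTilesBits = 10;
    constexpr uint32_t kExampleBaseTilesBits = 12;

    // Standalone copy of the TransferAddressConstant packing for illustration.
    union ExampleTransferAddressConstant {
      uint32_t constant;
      struct {
        uint32_t dest_pitch : kExamplePitchTilesBits;
        uint32_t source_pitch : kExamplePitchTilesBits;
        // Destination base in tiles minus source base in tiles.
        int32_t source_to_dest : kExampleBaseTilesBits;
      };
    };
    static_assert(sizeof(ExampleTransferAddressConstant) == sizeof(uint32_t),
                  "Must stay one push constant dword");

    int main() {
      ExampleTransferAddressConstant address;
      address.constant = 0;
      address.dest_pitch = 80;        // Destination pitch in tiles.
      address.source_pitch = 96;      // Source pitch in tiles.
      address.source_to_dest = -640;  // Signed difference of the tile bases.
      assert(address.dest_pitch == 80);
      assert(address.source_to_dest == -640);
      return 0;
    }

With 12 signed bits the difference spans -2048...2047 tiles, which covers any pair of valid EDRAM tile bases.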
+ std::vector current_transfer_invocations_; }; } // namespace vulkan diff --git a/src/xenia/gpu/vulkan/vulkan_shared_memory.cc b/src/xenia/gpu/vulkan/vulkan_shared_memory.cc index 0d95189da..788b8166a 100644 --- a/src/xenia/gpu/vulkan/vulkan_shared_memory.cc +++ b/src/xenia/gpu/vulkan/vulkan_shared_memory.cc @@ -177,6 +177,10 @@ bool VulkanSharedMemory::Initialize() { } } + // The first usage will likely be uploading. + last_usage_ = Usage::kTransferDestination; + last_written_range_ = std::make_pair(0, 0); + upload_buffer_pool_ = std::make_unique( provider, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, xe::align(ui::vulkan::VulkanUploadBufferPool::kDefaultPageSize, @@ -190,9 +194,6 @@ void VulkanSharedMemory::Shutdown(bool from_destructor) { upload_buffer_pool_.reset(); - last_written_range_ = std::make_pair(0, 0); - last_usage_ = Usage::kTransferDestination; - const ui::vulkan::VulkanProvider& provider = command_processor_.GetVulkanProvider(); const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); @@ -226,8 +227,8 @@ void VulkanSharedMemory::Use(Usage usage, if (last_usage_ != usage || last_written_range_.second) { VkPipelineStageFlags src_stage_mask, dst_stage_mask; VkAccessFlags src_access_mask, dst_access_mask; - GetBarrier(last_usage_, src_stage_mask, src_access_mask); - GetBarrier(usage, dst_stage_mask, dst_access_mask); + GetUsageMasks(last_usage_, src_stage_mask, src_access_mask); + GetUsageMasks(usage, dst_stage_mask, dst_access_mask); VkDeviceSize offset, size; if (last_usage_ == usage) { // Committing the previous write, while not changing the access mask @@ -447,9 +448,9 @@ bool VulkanSharedMemory::UploadRanges( return successful; } -void VulkanSharedMemory::GetBarrier(Usage usage, - VkPipelineStageFlags& stage_mask, - VkAccessFlags& access_mask) const { +void VulkanSharedMemory::GetUsageMasks(Usage usage, + VkPipelineStageFlags& stage_mask, + VkAccessFlags& access_mask) const { switch (usage) { case Usage::kComputeWrite: stage_mask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; diff --git a/src/xenia/gpu/vulkan/vulkan_shared_memory.h b/src/xenia/gpu/vulkan/vulkan_shared_memory.h index 0d8e90813..b37949ec8 100644 --- a/src/xenia/gpu/vulkan/vulkan_shared_memory.h +++ b/src/xenia/gpu/vulkan/vulkan_shared_memory.h @@ -47,8 +47,8 @@ class VulkanSharedMemory : public SharedMemory { kComputeWrite, kTransferDestination, }; - // Places pipeline barrier for the target usage, also ensuring writes of - // adjacent are ordered with writes of each other and reads. + // Inserts a pipeline barrier for the target usage, also ensuring consecutive + // read-write accesses are ordered with each other. void Use(Usage usage, std::pair written_range = {}); VkBuffer buffer() const { return buffer_; } @@ -65,8 +65,8 @@ class VulkanSharedMemory : public SharedMemory { upload_page_ranges) override; private: - void GetBarrier(Usage usage, VkPipelineStageFlags& stage_mask, - VkAccessFlags& access_mask) const; + void GetUsageMasks(Usage usage, VkPipelineStageFlags& stage_mask, + VkAccessFlags& access_mask) const; VulkanCommandProcessor& command_processor_; TraceWriter& trace_writer_; @@ -76,9 +76,8 @@ class VulkanSharedMemory : public SharedMemory { // Single for non-sparse, every allocation so far for sparse. std::vector buffer_memory_; - // First usage will likely be uploading. 
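The Use() / GetUsageMasks() pair in vulkan_shared_memory.cc above reduces to one vkCmdPipelineBarrier over the shared memory buffer, with the source and destination masks looked up from the previous and the new usage. A rough standalone sketch of that pattern; the enum, the functions, and the stage/access pairs below are illustrative stand-ins rather than the exact table the class uses, and the real code records the barrier through the deferred command buffer instead of calling the entry point directly:

    #include <vulkan/vulkan.h>

    // Hypothetical stand-ins for VulkanSharedMemory::Usage and GetUsageMasks.
    enum class ExampleUsage { kComputeRead, kComputeWrite, kTransferDestination };

    void ExampleGetUsageMasks(ExampleUsage usage,
                              VkPipelineStageFlags& stage_mask,
                              VkAccessFlags& access_mask) {
      switch (usage) {
        case ExampleUsage::kComputeRead:
          stage_mask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
          access_mask = VK_ACCESS_SHADER_READ_BIT;
          break;
        case ExampleUsage::kComputeWrite:
          stage_mask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
          access_mask = VK_ACCESS_SHADER_WRITE_BIT;
          break;
        case ExampleUsage::kTransferDestination:
          stage_mask = VK_PIPELINE_STAGE_TRANSFER_BIT;
          access_mask = VK_ACCESS_TRANSFER_WRITE_BIT;
          break;
      }
    }

    // One barrier makes the writes of old_usage available to new_usage, which
    // is what Use() amounts to when the usage changes or a write is committed.
    void ExampleUseBuffer(VkCommandBuffer command_buffer, VkBuffer buffer,
                          ExampleUsage old_usage, ExampleUsage new_usage) {
      VkPipelineStageFlags src_stage_mask, dst_stage_mask;
      VkAccessFlags src_access_mask, dst_access_mask;
      ExampleGetUsageMasks(old_usage, src_stage_mask, src_access_mask);
      ExampleGetUsageMasks(new_usage, dst_stage_mask, dst_access_mask);
      VkBufferMemoryBarrier barrier = {};
      barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
      barrier.srcAccessMask = src_access_mask;
      barrier.dstAccessMask = dst_access_mask;
      barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
      barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
      barrier.buffer = buffer;
      barrier.offset = 0;
      barrier.size = VK_WHOLE_SIZE;
      vkCmdPipelineBarrier(command_buffer, src_stage_mask, dst_stage_mask, 0, 0,
                           nullptr, 1, &barrier, 0, nullptr);
    }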
-  Usage last_usage_ = Usage::kTransferDestination;
-  std::pair<uint32_t, uint32_t> last_written_range_ = {};
+  Usage last_usage_;
+  std::pair<uint32_t, uint32_t> last_written_range_;
 
   std::unique_ptr<ui::vulkan::VulkanUploadBufferPool> upload_buffer_pool_;
   std::vector<VkBufferCopy> upload_regions_;
diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h
index d2279a7b8..2f88bc74c 100644
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@@ -248,6 +248,7 @@ enum class MsaaSamples : uint32_t {
 
 constexpr uint32_t kMsaaSamplesBits = 2;
 
+constexpr uint32_t kColorRenderTargetIndexBits = 2;
 constexpr uint32_t kMaxColorRenderTargets = 4;
 
 enum class ColorRenderTargetFormat : uint32_t {
diff --git a/src/xenia/ui/vulkan/functions/device_1_0.inc b/src/xenia/ui/vulkan/functions/device_1_0.inc
index 2a979f55f..148d6dd52 100644
--- a/src/xenia/ui/vulkan/functions/device_1_0.inc
+++ b/src/xenia/ui/vulkan/functions/device_1_0.inc
@@ -15,6 +15,7 @@ XE_UI_VULKAN_FUNCTION(vkCmdClearColorImage)
 XE_UI_VULKAN_FUNCTION(vkCmdCopyBuffer)
 XE_UI_VULKAN_FUNCTION(vkCmdCopyBufferToImage)
 XE_UI_VULKAN_FUNCTION(vkCmdCopyImageToBuffer)
+XE_UI_VULKAN_FUNCTION(vkCmdDispatch)
 XE_UI_VULKAN_FUNCTION(vkCmdDraw)
 XE_UI_VULKAN_FUNCTION(vkCmdDrawIndexed)
 XE_UI_VULKAN_FUNCTION(vkCmdEndRenderPass)
@@ -29,6 +30,7 @@ XE_UI_VULKAN_FUNCTION(vkCmdSetStencilWriteMask)
 XE_UI_VULKAN_FUNCTION(vkCmdSetViewport)
 XE_UI_VULKAN_FUNCTION(vkCreateBuffer)
 XE_UI_VULKAN_FUNCTION(vkCreateCommandPool)
+XE_UI_VULKAN_FUNCTION(vkCreateComputePipelines)
 XE_UI_VULKAN_FUNCTION(vkCreateDescriptorPool)
 XE_UI_VULKAN_FUNCTION(vkCreateDescriptorSetLayout)
 XE_UI_VULKAN_FUNCTION(vkCreateFence)
diff --git a/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.cc b/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.cc
new file mode 100644
index 000000000..8dfff2a3f
--- /dev/null
+++ b/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.cc
@@ -0,0 +1,120 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project *
+ ******************************************************************************
+ * Copyright 2022 Ben Vanik. All rights reserved. *
+ * Released under the BSD license - see LICENSE in the root for more details.
* + ****************************************************************************** + */ + +#include "xenia/ui/vulkan/single_layout_descriptor_set_pool.h" + +#include "xenia/base/assert.h" +#include "xenia/base/logging.h" + +namespace xe { +namespace ui { +namespace vulkan { + +SingleLayoutDescriptorSetPool::SingleLayoutDescriptorSetPool( + const VulkanProvider& provider, uint32_t pool_set_count, + uint32_t set_layout_descriptor_counts_count, + const VkDescriptorPoolSize* set_layout_descriptor_counts, + VkDescriptorSetLayout set_layout) + : provider_(provider), + pool_set_count_(pool_set_count), + set_layout_(set_layout) { + assert_not_zero(pool_set_count); + pool_descriptor_counts_.resize(set_layout_descriptor_counts_count); + for (uint32_t i = 0; i < set_layout_descriptor_counts_count; ++i) { + VkDescriptorPoolSize& pool_descriptor_type_count = + pool_descriptor_counts_[i]; + const VkDescriptorPoolSize& set_layout_descriptor_type_count = + set_layout_descriptor_counts[i]; + pool_descriptor_type_count.type = set_layout_descriptor_type_count.type; + pool_descriptor_type_count.descriptorCount = + set_layout_descriptor_type_count.descriptorCount * pool_set_count; + } +} + +SingleLayoutDescriptorSetPool::~SingleLayoutDescriptorSetPool() { + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider_.dfn(); + VkDevice device = provider_.device(); + if (current_pool_ != VK_NULL_HANDLE) { + dfn.vkDestroyDescriptorPool(device, current_pool_, nullptr); + } + for (VkDescriptorPool pool : full_pools_) { + dfn.vkDestroyDescriptorPool(device, pool, nullptr); + } +} + +size_t SingleLayoutDescriptorSetPool::Allocate() { + if (!descriptor_sets_free_.empty()) { + size_t free_index = descriptor_sets_free_.back(); + descriptor_sets_free_.pop_back(); + return free_index; + } + + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider_.dfn(); + VkDevice device = provider_.device(); + + // Two iterations so if vkAllocateDescriptorSets fails even with a non-zero + // current_pool_sets_remaining_, another attempt will be made in a new pool. 
+  for (uint32_t i = 0; i < 2; ++i) {
+    if (current_pool_ != VK_NULL_HANDLE && !current_pool_sets_remaining_) {
+      full_pools_.push_back(current_pool_);
+      current_pool_ = VK_NULL_HANDLE;
+    }
+    if (current_pool_ == VK_NULL_HANDLE) {
+      VkDescriptorPoolCreateInfo pool_create_info;
+      pool_create_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
+      pool_create_info.pNext = nullptr;
+      pool_create_info.flags = 0;
+      pool_create_info.maxSets = pool_set_count_;
+      pool_create_info.poolSizeCount = uint32_t(pool_descriptor_counts_.size());
+      pool_create_info.pPoolSizes = pool_descriptor_counts_.data();
+      if (dfn.vkCreateDescriptorPool(device, &pool_create_info, nullptr,
+                                     &current_pool_) != VK_SUCCESS) {
+        XELOGE(
+            "SingleLayoutDescriptorSetPool: Failed to create a descriptor "
+            "pool");
+        return SIZE_MAX;
+      }
+      current_pool_sets_remaining_ = pool_set_count_;
+    }
+
+    VkDescriptorSetAllocateInfo descriptor_set_allocate_info;
+    descriptor_set_allocate_info.sType =
+        VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
+    descriptor_set_allocate_info.pNext = nullptr;
+    descriptor_set_allocate_info.descriptorPool = current_pool_;
+    descriptor_set_allocate_info.descriptorSetCount = 1;
+    descriptor_set_allocate_info.pSetLayouts = &set_layout_;
+    VkDescriptorSet descriptor_set;
+    if (dfn.vkAllocateDescriptorSets(device, &descriptor_set_allocate_info,
+                                     &descriptor_set) != VK_SUCCESS) {
+      XELOGE(
+          "SingleLayoutDescriptorSetPool: Failed to allocate a descriptor "
+          "set");
+      if (current_pool_sets_remaining_ >= pool_set_count_) {
+        // Failed to allocate in a new pool - something completely wrong, don't
+        // store empty pools as full.
+        dfn.vkDestroyDescriptorPool(device, current_pool_, nullptr);
+        current_pool_ = VK_NULL_HANDLE;
+        return SIZE_MAX;
+      }
+      full_pools_.push_back(current_pool_);
+      current_pool_ = VK_NULL_HANDLE;
+      // Retry the allocation in a fresh pool on the second iteration.
+      continue;
+    }
+    --current_pool_sets_remaining_;
+    descriptor_sets_.push_back(descriptor_set);
+    return descriptor_sets_.size() - 1;
+  }
+
+  // Both attempts have failed.
+  return SIZE_MAX;
+}
+
+} // namespace vulkan
+} // namespace ui
+} // namespace xe
diff --git a/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.h b/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.h
new file mode 100644
index 000000000..c3f3eb080
--- /dev/null
+++ b/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.h
@@ -0,0 +1,63 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project *
+ ******************************************************************************
+ * Copyright 2022 Ben Vanik. All rights reserved. *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_UI_VULKAN_SINGLE_DESCRIPTOR_SET_POOL_H_
+#define XENIA_UI_VULKAN_SINGLE_DESCRIPTOR_SET_POOL_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "xenia/base/assert.h"
+#include "xenia/ui/vulkan/vulkan_provider.h"
+
+namespace xe {
+namespace ui {
+namespace vulkan {
+
+class SingleLayoutDescriptorSetPool {
+ public:
+  // set_layout_descriptor_counts must contain the numbers of descriptors of
+  // each type in a single set with the layout (the multiplication by the pool
+  // set count will be done internally). The descriptor set layout must not be
+  // destroyed until this object is also destroyed.
+  SingleLayoutDescriptorSetPool(
+      const VulkanProvider& provider, uint32_t pool_set_count,
+      uint32_t set_layout_descriptor_counts_count,
+      const VkDescriptorPoolSize* set_layout_descriptor_counts,
+      VkDescriptorSetLayout set_layout);
+  ~SingleLayoutDescriptorSetPool();
+
+  // Returns SIZE_MAX in case of a failure.
+  size_t Allocate();
+  void Free(size_t index) {
+    assert_true(index < descriptor_sets_.size());
+    descriptor_sets_free_.push_back(index);
+  }
+  VkDescriptorSet Get(size_t index) const { return descriptor_sets_[index]; }
+
+ private:
+  const VulkanProvider& provider_;
+  uint32_t pool_set_count_;
+  std::vector<VkDescriptorPoolSize> pool_descriptor_counts_;
+  VkDescriptorSetLayout set_layout_;
+
+  std::vector<VkDescriptorPool> full_pools_;
+  VkDescriptorPool current_pool_ = VK_NULL_HANDLE;
+  uint32_t current_pool_sets_remaining_ = 0;
+
+  std::vector<VkDescriptorSet> descriptor_sets_;
+  std::vector<size_t> descriptor_sets_free_;
+};
+
+} // namespace vulkan
+} // namespace ui
+} // namespace xe
+
+#endif // XENIA_UI_VULKAN_SINGLE_DESCRIPTOR_SET_POOL_H_
diff --git a/src/xenia/ui/vulkan/vulkan_provider.cc b/src/xenia/ui/vulkan/vulkan_provider.cc
index 2d93485ff..eb48cfa23 100644
--- a/src/xenia/ui/vulkan/vulkan_provider.cc
+++ b/src/xenia/ui/vulkan/vulkan_provider.cc
@@ -715,6 +715,8 @@ bool VulkanProvider::Initialize() {
   static const std::pair<const char*, size_t> kUsedDeviceExtensions[] = {
       {"VK_EXT_fragment_shader_interlock",
        offsetof(DeviceExtensions, ext_fragment_shader_interlock)},
+      {"VK_EXT_shader_stencil_export",
+       offsetof(DeviceExtensions, ext_shader_stencil_export)},
       {"VK_KHR_dedicated_allocation",
        offsetof(DeviceExtensions, khr_dedicated_allocation)},
       {"VK_KHR_image_format_list",
@@ -946,6 +948,8 @@ bool VulkanProvider::Initialize() {
   XELOGVK("Vulkan device extensions:");
   XELOGVK("* VK_EXT_fragment_shader_interlock: {}",
           device_extensions_.ext_fragment_shader_interlock ? "yes" : "no");
+  XELOGVK("* VK_EXT_shader_stencil_export: {}",
+          device_extensions_.ext_shader_stencil_export ? "yes" : "no");
   XELOGVK("* VK_KHR_dedicated_allocation: {}",
           device_extensions_.khr_dedicated_allocation ? "yes" : "no");
   XELOGVK("* VK_KHR_image_format_list: {}",
diff --git a/src/xenia/ui/vulkan/vulkan_provider.h b/src/xenia/ui/vulkan/vulkan_provider.h
index 0887b88ac..83f4d587f 100644
--- a/src/xenia/ui/vulkan/vulkan_provider.h
+++ b/src/xenia/ui/vulkan/vulkan_provider.h
@@ -132,6 +132,7 @@ class VulkanProvider : public GraphicsProvider {
   }
   struct DeviceExtensions {
     bool ext_fragment_shader_interlock;
+    bool ext_shader_stencil_export;
     // Core since 1.1.0.
     bool khr_dedicated_allocation;
     // Core since 1.2.0.
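As a usage illustration for the SingleLayoutDescriptorSetPool introduced above, a hedged sketch: the pool size of 256 sets, the single sampled-image layout, and the minimal error handling are placeholder choices, not requirements of the class.

    #include <cstddef>
    #include <cstdint>

    #include "xenia/ui/vulkan/single_layout_descriptor_set_pool.h"
    #include "xenia/ui/vulkan/vulkan_provider.h"

    // Grows in chunks of 256 sets; the layout is assumed to contain exactly one
    // sampled image, matching the single entry of pool_sizes.
    bool ExampleUseDescriptorSetPool(
        const xe::ui::vulkan::VulkanProvider& provider,
        VkDescriptorSetLayout sampled_image_layout) {
      const VkDescriptorPoolSize pool_sizes[] = {
          {VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 1},  // Count per set, not per pool.
      };
      xe::ui::vulkan::SingleLayoutDescriptorSetPool pool(
          provider, 256, uint32_t(sizeof(pool_sizes) / sizeof(pool_sizes[0])),
          pool_sizes, sampled_image_layout);
      size_t set_index = pool.Allocate();
      if (set_index == SIZE_MAX) {
        return false;  // Pool creation or set allocation failed.
      }
      // Callers store and recycle indices; the handle is looked up when binding.
      VkDescriptorSet set = pool.Get(set_index);
      (void)set;  // Would be written with vkUpdateDescriptorSets and bound here.
      pool.Free(set_index);
      return true;
    }

Keeping indices rather than raw VkDescriptorSet handles is what lets the render target cache store one small integer per render target for its transfer-source descriptors.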
diff --git a/src/xenia/ui/vulkan/vulkan_util.cc b/src/xenia/ui/vulkan/vulkan_util.cc index f8dd5846e..b4eb02c3f 100644 --- a/src/xenia/ui/vulkan/vulkan_util.cc +++ b/src/xenia/ui/vulkan/vulkan_util.cc @@ -189,6 +189,53 @@ bool CreateDedicatedAllocationImage(const VulkanProvider& provider, return true; } +VkPipeline CreateComputePipeline( + const VulkanProvider& provider, VkPipelineLayout layout, + VkShaderModule shader, const VkSpecializationInfo* specialization_info, + const char* entry_point) { + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + VkComputePipelineCreateInfo pipeline_create_info; + pipeline_create_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + pipeline_create_info.pNext = nullptr; + pipeline_create_info.flags = 0; + pipeline_create_info.stage.sType = + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + pipeline_create_info.stage.pNext = nullptr; + pipeline_create_info.stage.flags = 0; + pipeline_create_info.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT; + pipeline_create_info.stage.module = shader; + pipeline_create_info.stage.pName = entry_point; + pipeline_create_info.stage.pSpecializationInfo = specialization_info; + pipeline_create_info.layout = layout; + pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE; + pipeline_create_info.basePipelineIndex = -1; + VkPipeline pipeline; + if (dfn.vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, + &pipeline_create_info, nullptr, + &pipeline) != VK_SUCCESS) { + return VK_NULL_HANDLE; + } + return pipeline; +} + +VkPipeline CreateComputePipeline( + const VulkanProvider& provider, VkPipelineLayout layout, + const uint32_t* shader_code, size_t shader_code_size_bytes, + const VkSpecializationInfo* specialization_info, const char* entry_point) { + VkShaderModule shader = + CreateShaderModule(provider, shader_code, shader_code_size_bytes); + if (shader == VK_NULL_HANDLE) { + return VK_NULL_HANDLE; + } + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + VkPipeline pipeline = CreateComputePipeline(provider, layout, shader, + specialization_info, entry_point); + dfn.vkDestroyShaderModule(device, shader, nullptr); + return pipeline; +} + } // namespace util } // namespace vulkan } // namespace ui diff --git a/src/xenia/ui/vulkan/vulkan_util.h b/src/xenia/ui/vulkan/vulkan_util.h index fda575305..7af10f65f 100644 --- a/src/xenia/ui/vulkan/vulkan_util.h +++ b/src/xenia/ui/vulkan/vulkan_util.h @@ -164,6 +164,17 @@ inline VkShaderModule CreateShaderModule(const VulkanProvider& provider, : VK_NULL_HANDLE; } +VkPipeline CreateComputePipeline( + const VulkanProvider& provider, VkPipelineLayout layout, + VkShaderModule shader, + const VkSpecializationInfo* specialization_info = nullptr, + const char* entry_point = "main"); +VkPipeline CreateComputePipeline( + const VulkanProvider& provider, VkPipelineLayout layout, + const uint32_t* shader_code, size_t shader_code_size_bytes, + const VkSpecializationInfo* specialization_info = nullptr, + const char* entry_point = "main"); + } // namespace util } // namespace vulkan } // namespace ui
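Finally, a sketch of how the new CreateComputePipeline overloads are meant to be called from backend code; the SPIR-V words and the pipeline layout are assumed to come from elsewhere, and the entry point is left at the default "main".

    #include <cstddef>
    #include <cstdint>

    #include "xenia/ui/vulkan/vulkan_provider.h"
    #include "xenia/ui/vulkan/vulkan_util.h"

    // Builds a compute pipeline straight from SPIR-V words via the overload
    // that creates and destroys the temporary VkShaderModule internally.
    bool ExampleCreateAndDestroyComputePipeline(
        const xe::ui::vulkan::VulkanProvider& provider, VkPipelineLayout layout,
        const uint32_t* shader_code, size_t shader_code_size_bytes) {
      VkPipeline pipeline = xe::ui::vulkan::util::CreateComputePipeline(
          provider, layout, shader_code, shader_code_size_bytes);
      if (pipeline == VK_NULL_HANDLE) {
        return false;  // Shader module or pipeline creation failed.
      }
      // ... bind with vkCmdBindPipeline and dispatch with vkCmdDispatch ...
      const xe::ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn();
      dfn.vkDestroyPipeline(provider.device(), pipeline, nullptr);
      return true;
    }

The shader-module overload is the one to use when the same module is specialized into several pipelines (for example one per sample when sample-rate shading is unavailable), since it leaves module ownership with the caller.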