diff --git a/src/xenia/gpu/render_target_cache.h b/src/xenia/gpu/render_target_cache.h index f0e59fb5f..2bac528bd 100644 --- a/src/xenia/gpu/render_target_cache.h +++ b/src/xenia/gpu/render_target_cache.h @@ -302,6 +302,10 @@ class RenderTargetCache { } return xenos::IsColorRenderTargetFormat64bpp(GetColorFormat()); } + const char* GetFormatName() const { + return is_depth ? xenos::GetDepthRenderTargetFormatName(GetDepthFormat()) + : xenos::GetColorRenderTargetFormatName(GetColorFormat()); + } uint32_t GetPitchTiles() const { return pitch_tiles_at_32bpp << uint32_t(Is64bpp()); @@ -317,11 +321,9 @@ class RenderTargetCache { } std::string GetDebugName() const { - return fmt::format( - "RT @ {}t, <{}t>, {}xMSAA, {}", base_tiles, GetPitchTiles(), - uint32_t(1) << uint32_t(msaa_samples), - is_depth ? xenos::GetDepthRenderTargetFormatName(GetDepthFormat()) - : xenos::GetColorRenderTargetFormatName(GetColorFormat())); + return fmt::format("RT @ {}t, <{}t>, {}xMSAA, {}", base_tiles, + GetPitchTiles(), uint32_t(1) << uint32_t(msaa_samples), + GetFormatName()); } }; diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index ce940da49..bcd140445 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -113,11 +113,9 @@ uint32_t SpirvShaderTranslator::GetModificationRegisterCount() const { } void SpirvShaderTranslator::StartTranslation() { - // Tool ID 26 "Xenia Emulator Microcode Translator". - // https://github.com/KhronosGroup/SPIRV-Headers/blob/c43a43c7cc3af55910b9bec2a71e3e8a622443cf/include/spirv/spir-v.xml#L79 // TODO(Triang3l): Logger. - builder_ = std::make_unique(features_.spirv_version, - (26 << 16) | 1, nullptr); + builder_ = std::make_unique( + features_.spirv_version, (kSpirvMagicToolId << 16) | 1, nullptr); builder_->addCapability(IsSpirvTessEvalShader() ? spv::CapabilityTessellation : spv::CapabilityShader); @@ -1535,20 +1533,20 @@ spv::Id SpirvShaderTranslator::GetUnmodifiedOperandComponents( static_cast(original_operand.GetComponent(scalar_index)) - static_cast(SwizzleSource::kX)); } - id_vector_temp_util_.clear(); - id_vector_temp_util_.reserve(component_count); + uint_vector_temp_util_.clear(); + uint_vector_temp_util_.reserve(component_count); uint32_t components_remaining = components; uint32_t component_index; while (xe::bit_scan_forward(components_remaining, &component_index)) { components_remaining &= ~(uint32_t(1) << component_index); - id_vector_temp_util_.push_back( + uint_vector_temp_util_.push_back( static_cast( original_operand.GetComponent(component_index)) - static_cast(SwizzleSource::kX)); } return builder_->createRvalueSwizzle(spv::NoPrecision, type_float_vectors_[component_count - 1], - operand_storage, id_vector_temp_util_); + operand_storage, uint_vector_temp_util_); } void SpirvShaderTranslator::GetOperandScalarXY( diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 932bd608f..beb478bb6 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -138,6 +138,10 @@ class SpirvShaderTranslator : public ShaderTranslator { kDescriptorSetCount, }; + // "Xenia Emulator Microcode Translator". 
+ // https://github.com/KhronosGroup/SPIRV-Headers/blob/c43a43c7cc3af55910b9bec2a71e3e8a622443cf/include/spirv/spir-v.xml#L79 + static constexpr uint32_t kSpirvMagicToolId = 26; + struct Features { explicit Features(const ui::vulkan::VulkanProvider& provider); explicit Features(bool all = false); @@ -172,6 +176,38 @@ class SpirvShaderTranslator : public ShaderTranslator { features_.max_storage_buffer_range); } + // Common functions useful not only for the translator, but also for EDRAM + // emulation via conventional render targets. + + // Converts the color value externally clamped to [0, 31.875] to 7e3 floating + // point, with zeros in bits 10:31, rounding to the nearest even. + static spv::Id PreClampedFloat32To7e3(spv::Builder& builder, + spv::Id f32_scalar, + spv::Id ext_inst_glsl_std_450); + // Same as PreClampedFloat32To7e3, but clamps the input to [0, 31.875]. + static spv::Id UnclampedFloat32To7e3(spv::Builder& builder, + spv::Id f32_scalar, + spv::Id ext_inst_glsl_std_450); + // Converts the 7e3 number in bits [f10_shift, f10_shift + 10) to a 32-bit + // float. + static spv::Id Float7e3To32(spv::Builder& builder, spv::Id f10_uint_scalar, + uint32_t f10_shift, bool result_as_uint, + spv::Id ext_inst_glsl_std_450); + // Converts the depth value externally clamped to the representable [0, 2) + // range to 20e4 floating point, with zeros in bits 24:31, rounding to the + // nearest even. If remap_from_0_to_0_5 is true, it's assumed that 0...1 is + // pre-remapped to 0...0.5 in the input. + static spv::Id PreClampedDepthTo20e4(spv::Builder& builder, + spv::Id f32_scalar, + bool remap_from_0_to_0_5, + spv::Id ext_inst_glsl_std_450); + // Converts the 20e4 number in bits [f24_shift, f24_shift + 10) to a 32-bit + // float. + static spv::Id Depth20e4To32(spv::Builder& builder, spv::Id f24_uint_scalar, + uint32_t f24_shift, bool remap_to_0_to_0_5, + bool result_as_uint, + spv::Id ext_inst_glsl_std_450); + protected: void Reset() override; diff --git a/src/xenia/gpu/spirv_shader_translator_rb.cc b/src/xenia/gpu/spirv_shader_translator_rb.cc new file mode 100644 index 000000000..4cb260bdd --- /dev/null +++ b/src/xenia/gpu/spirv_shader_translator_rb.cc @@ -0,0 +1,425 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2022 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/spirv_shader_translator.h" + +#include +#include + +#include "third_party/glslang/SPIRV/GLSL.std.450.h" +#include "xenia/base/assert.h" + +namespace xe { +namespace gpu { + +spv::Id SpirvShaderTranslator::PreClampedFloat32To7e3( + spv::Builder& builder, spv::Id f32_scalar, spv::Id ext_inst_glsl_std_450) { + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + // Assuming the value is already clamped to [0, 31.875]. + + spv::Id type_uint = builder.makeUintType(32); + + // Need the source as uint for bit operations. + { + spv::Id source_type = builder.getTypeId(f32_scalar); + assert_true(builder.isScalarType(source_type)); + if (!builder.isUintType(source_type)) { + f32_scalar = builder.createUnaryOp(spv::OpBitcast, type_uint, f32_scalar); + } + } + + // The denormal 7e3 case. 
+ // denormal_biased_f32 = (f32 & 0x7FFFFF) | 0x800000 + spv::Id denormal_biased_f32; + { + spv::Instruction* denormal_insert_instruction = new spv::Instruction( + builder.getUniqueId(), type_uint, spv::OpBitFieldInsert); + denormal_insert_instruction->addIdOperand(f32_scalar); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(1)); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(23)); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(9)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_insert_instruction)); + denormal_biased_f32 = denormal_insert_instruction->getResultId(); + } + // denormal_biased_f32_shift_amount = min(125 - (f32 >> 23), 24) + // Not allowing the shift to overflow as that's undefined in SPIR-V. + spv::Id denormal_biased_f32_shift_amount; + { + spv::Instruction* denormal_shift_amount_instruction = + new spv::Instruction(builder.getUniqueId(), type_uint, spv::OpExtInst); + denormal_shift_amount_instruction->addIdOperand(ext_inst_glsl_std_450); + denormal_shift_amount_instruction->addImmediateOperand(GLSLstd450UMin); + denormal_shift_amount_instruction->addIdOperand(builder.createBinOp( + spv::OpISub, type_uint, builder.makeUintConstant(125), + builder.createBinOp(spv::OpShiftRightLogical, type_uint, f32_scalar, + builder.makeUintConstant(23)))); + denormal_shift_amount_instruction->addIdOperand( + builder.makeUintConstant(24)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_shift_amount_instruction)); + denormal_biased_f32_shift_amount = + denormal_shift_amount_instruction->getResultId(); + } + // denormal_biased_f32 = + // ((f32 & 0x7FFFFF) | 0x800000) >> min(125 - (f32 >> 23), 24) + denormal_biased_f32 = builder.createBinOp(spv::OpShiftRightLogical, type_uint, + denormal_biased_f32, + denormal_biased_f32_shift_amount); + + // The normal 7e3 case. + // Bias the exponent. + // normal_biased_f32 = f32 - (124 << 23) + spv::Id normal_biased_f32 = + builder.createBinOp(spv::OpISub, type_uint, f32_scalar, + builder.makeUintConstant(UINT32_C(124) << 23)); + + // Select the needed conversion depending on whether the number is too small + // to be represented as normalized 7e3. + spv::Id biased_f32 = builder.createTriOp( + spv::OpSelect, type_uint, + builder.createBinOp(spv::OpULessThan, builder.makeBoolType(), f32_scalar, + builder.makeUintConstant(0x3E800000)), + denormal_biased_f32, normal_biased_f32); + + // Build the 7e3 number rounding to the nearest even. + // ((biased_f32 + 0x7FFF + ((biased_f32 >> 16) & 1)) >> 16) & 0x3FF + return builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp(spv::OpIAdd, type_uint, biased_f32, + builder.makeUintConstant(0x7FFF)), + builder.createTriOp(spv::OpBitFieldUExtract, type_uint, biased_f32, + builder.makeUintConstant(16), + builder.makeUintConstant(1))), + builder.makeUintConstant(16), builder.makeUintConstant(10)); +} + +spv::Id SpirvShaderTranslator::UnclampedFloat32To7e3( + spv::Builder& builder, spv::Id f32_scalar, spv::Id ext_inst_glsl_std_450) { + spv::Id type_float = builder.makeFloatType(32); + + // Need the source as float for clamping. 
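// Reference sketch, not part of this patch: a scalar CPU equivalent of the
// float32 -> 7e3 packing that PreClampedFloat32To7e3 above emits, assuming the
// input is already clamped to [0, 31.875]. The constants mirror the SPIR-V
// above; the helper name is hypothetical. Uses <cstring> for std::memcpy and
// <algorithm> for std::min.
uint32_t Float32To7e3Reference(float value) {
  uint32_t f32;
  std::memcpy(&f32, &value, sizeof(f32));
  uint32_t biased_f32;
  if (f32 < 0x3E800000u) {
    // Too small for a normalized 7e3 number - make the implicit 1 explicit and
    // shift the mantissa right, clamping the shift amount to avoid undefined
    // behavior.
    uint32_t shift = std::min(125u - (f32 >> 23), 24u);
    biased_f32 = ((f32 & 0x7FFFFFu) | 0x800000u) >> shift;
  } else {
    // Rebias the exponent from 127 (float32) to 3 (7e3).
    biased_f32 = f32 - (124u << 23);
  }
  // Round to the nearest even and keep 3 exponent + 7 mantissa bits.
  return ((biased_f32 + 0x7FFF + ((biased_f32 >> 16) & 1)) >> 16) & 0x3FF;
}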
+ { + spv::Id source_type = builder.getTypeId(f32_scalar); + assert_true(builder.isScalarType(source_type)); + if (!builder.isFloatType(source_type)) { + f32_scalar = + builder.createUnaryOp(spv::OpBitcast, type_float, f32_scalar); + } + } + + { + spv::Instruction* clamp_instruction = + new spv::Instruction(builder.getUniqueId(), type_float, spv::OpExtInst); + clamp_instruction->addIdOperand(ext_inst_glsl_std_450); + clamp_instruction->addImmediateOperand(GLSLstd450NClamp); + clamp_instruction->addIdOperand(f32_scalar); + clamp_instruction->addIdOperand(builder.makeFloatConstant(0.0f)); + clamp_instruction->addIdOperand(builder.makeFloatConstant(31.875f)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(clamp_instruction)); + f32_scalar = clamp_instruction->getResultId(); + } + + return PreClampedFloat32To7e3(builder, f32_scalar, ext_inst_glsl_std_450); +} + +spv::Id SpirvShaderTranslator::Float7e3To32(spv::Builder& builder, + spv::Id f10_uint_scalar, + uint32_t f10_shift, + bool result_as_uint, + spv::Id ext_inst_glsl_std_450) { + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + + assert_true(builder.isUintType(builder.getTypeId(f10_uint_scalar))); + assert_true(f10_shift <= (32 - 10)); + + spv::Id type_bool = builder.makeBoolType(); + spv::Id type_int = builder.makeIntType(32); + spv::Id type_uint = builder.makeUintType(32); + + spv::Id f10_unbiased_exponent = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, f10_uint_scalar, + builder.makeUintConstant(f10_shift + 7), builder.makeUintConstant(3)); + spv::Id f10_mantissa = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, f10_uint_scalar, + builder.makeUintConstant(f10_shift), builder.makeUintConstant(7)); + + // The denormal nonzero 7e3 case. + // denormal_mantissa_msb = findMSB(f10_mantissa) + spv::Id denormal_mantissa_msb; + { + spv::Instruction* denormal_mantissa_msb_instruction = + new spv::Instruction(builder.getUniqueId(), type_int, spv::OpExtInst); + denormal_mantissa_msb_instruction->addIdOperand(ext_inst_glsl_std_450); + denormal_mantissa_msb_instruction->addImmediateOperand(GLSLstd450FindUMsb); + denormal_mantissa_msb_instruction->addIdOperand(f10_mantissa); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_mantissa_msb_instruction)); + denormal_mantissa_msb = denormal_mantissa_msb_instruction->getResultId(); + } + denormal_mantissa_msb = + builder.createUnaryOp(spv::OpBitcast, type_uint, denormal_mantissa_msb); + // denormal_f32_unbiased_exponent = 1 - (7 - findMSB(f10_mantissa)) + // Or: + // denormal_f32_unbiased_exponent = findMSB(f10_mantissa) - 6 + spv::Id denormal_f32_unbiased_exponent = + builder.createBinOp(spv::OpISub, type_uint, denormal_mantissa_msb, + builder.makeUintConstant(6)); + // Normalize the mantissa. + // denormal_f32_mantissa = f10_mantissa << (7 - findMSB(f10_mantissa)) + spv::Id denormal_f32_mantissa = builder.createBinOp( + spv::OpShiftLeftLogical, type_uint, f10_mantissa, + builder.createBinOp(spv::OpISub, type_uint, builder.makeUintConstant(7), + denormal_mantissa_msb)); + // If the 7e3 number is zero, make sure the float32 number is zero too. + spv::Id f10_mantissa_is_nonzero = builder.createBinOp( + spv::OpINotEqual, type_bool, f10_mantissa, builder.makeUintConstant(0)); + // Set the unbiased exponent to -124 for zero - 124 will be added later, + // resulting in zero float32. 
+ denormal_f32_unbiased_exponent = builder.createTriOp( + spv::OpSelect, type_uint, f10_mantissa_is_nonzero, + denormal_f32_unbiased_exponent, builder.makeUintConstant(uint32_t(-124))); + denormal_f32_mantissa = + builder.createTriOp(spv::OpSelect, type_uint, f10_mantissa_is_nonzero, + denormal_f32_mantissa, builder.makeUintConstant(0)); + + // Select the needed conversion depending on whether the number is normal. + spv::Id f10_is_normal = + builder.createBinOp(spv::OpINotEqual, type_bool, f10_unbiased_exponent, + builder.makeUintConstant(0)); + spv::Id f32_unbiased_exponent = builder.createTriOp( + spv::OpSelect, type_uint, f10_is_normal, f10_unbiased_exponent, + denormal_f32_unbiased_exponent); + spv::Id f32_mantissa = + builder.createTriOp(spv::OpSelect, type_uint, f10_is_normal, f10_mantissa, + denormal_f32_mantissa); + + // Bias the exponent and construct the build the float32 number. + spv::Id f32_shifted; + { + spv::Instruction* f32_insert_instruction = new spv::Instruction( + builder.getUniqueId(), type_uint, spv::OpBitFieldInsert); + f32_insert_instruction->addIdOperand(f32_mantissa); + f32_insert_instruction->addIdOperand( + builder.createBinOp(spv::OpIAdd, type_uint, f32_unbiased_exponent, + builder.makeUintConstant(124))); + f32_insert_instruction->addIdOperand(builder.makeUintConstant(7)); + f32_insert_instruction->addIdOperand(builder.makeUintConstant(8)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(f32_insert_instruction)); + f32_shifted = f32_insert_instruction->getResultId(); + } + spv::Id f32 = + builder.createBinOp(spv::OpShiftLeftLogical, type_uint, f32_shifted, + builder.makeUintConstant(23 - 7)); + + if (!result_as_uint) { + f32 = builder.createUnaryOp(spv::OpBitcast, builder.makeFloatType(32), f32); + } + + return f32; +} + +spv::Id SpirvShaderTranslator::PreClampedDepthTo20e4( + spv::Builder& builder, spv::Id f32_scalar, bool remap_from_0_to_0_5, + spv::Id ext_inst_glsl_std_450) { + // CFloat24 from d3dref9.dll + + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + // Assuming the value is already clamped to [0, 2) (in all places, the depth + // is written with saturation). + + uint32_t remap_bias = uint32_t(remap_from_0_to_0_5); + + spv::Id type_uint = builder.makeUintType(32); + + // Need the source as uint for bit operations. + { + spv::Id source_type = builder.getTypeId(f32_scalar); + assert_true(builder.isScalarType(source_type)); + if (!builder.isUintType(source_type)) { + f32_scalar = builder.createUnaryOp(spv::OpBitcast, type_uint, f32_scalar); + } + } + + // The denormal 20e4 case. + // denormal_biased_f32 = (f32 & 0x7FFFFF) | 0x800000 + spv::Id denormal_biased_f32; + { + spv::Instruction* denormal_insert_instruction = new spv::Instruction( + builder.getUniqueId(), type_uint, spv::OpBitFieldInsert); + denormal_insert_instruction->addIdOperand(f32_scalar); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(1)); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(23)); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(9)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_insert_instruction)); + denormal_biased_f32 = denormal_insert_instruction->getResultId(); + } + // denormal_biased_f32_shift_amount = min(113 - (f32 >> 23), 24) + // Not allowing the shift to overflow as that's undefined in SPIR-V. 
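// Reference sketch, not part of this patch: a scalar CPU equivalent of the
// 7e3 -> float32 conversion emitted by Float7e3To32 above, for a 10-bit value
// held in the low bits of f10. The helper name is hypothetical; uses <cstring>
// for std::memcpy.
float Float7e3To32Reference(uint32_t f10) {
  uint32_t exponent = (f10 >> 7) & 0x7;
  uint32_t mantissa = f10 & 0x7F;
  if (exponent == 0) {
    if (mantissa == 0) {
      return 0.0f;
    }
    // Denormal - normalize the mantissa, adjusting the exponent accordingly
    // (the exponent may wrap below zero as unsigned here; adding 124 below
    // brings it back to the correct float32 exponent field).
    exponent = 1;
    do {
      --exponent;
      mantissa <<= 1;
    } while (!(mantissa & 0x80));
    mantissa &= 0x7F;
  }
  // Rebias the exponent from 3 (7e3) to 127 (float32) and place the mantissa.
  uint32_t f32 = ((exponent + 124) << 23) | (mantissa << 16);
  float result;
  std::memcpy(&result, &f32, sizeof(result));
  return result;
}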
+ spv::Id denormal_biased_f32_shift_amount; + { + spv::Instruction* denormal_shift_amount_instruction = + new spv::Instruction(builder.getUniqueId(), type_uint, spv::OpExtInst); + denormal_shift_amount_instruction->addIdOperand(ext_inst_glsl_std_450); + denormal_shift_amount_instruction->addImmediateOperand(GLSLstd450UMin); + denormal_shift_amount_instruction->addIdOperand(builder.createBinOp( + spv::OpISub, type_uint, builder.makeUintConstant(113 - remap_bias), + builder.createBinOp(spv::OpShiftRightLogical, type_uint, f32_scalar, + builder.makeUintConstant(23)))); + denormal_shift_amount_instruction->addIdOperand( + builder.makeUintConstant(24)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_shift_amount_instruction)); + denormal_biased_f32_shift_amount = + denormal_shift_amount_instruction->getResultId(); + } + // denormal_biased_f32 = + // ((f32 & 0x7FFFFF) | 0x800000) >> min(113 - (f32 >> 23), 24) + denormal_biased_f32 = builder.createBinOp(spv::OpShiftRightLogical, type_uint, + denormal_biased_f32, + denormal_biased_f32_shift_amount); + + // The normal 20e4 case. + // Bias the exponent. + // normal_biased_f32 = f32 - (112 << 23) + spv::Id normal_biased_f32 = builder.createBinOp( + spv::OpISub, type_uint, f32_scalar, + builder.makeUintConstant((UINT32_C(112) + remap_bias) << 23)); + + // Select the needed conversion depending on whether the number is too small + // to be represented as normalized 20e4. + spv::Id biased_f32 = builder.createTriOp( + spv::OpSelect, type_uint, + builder.createBinOp( + spv::OpULessThan, builder.makeBoolType(), f32_scalar, + builder.makeUintConstant(0x38800000 - (remap_bias << 23))), + denormal_biased_f32, normal_biased_f32); + + // Build the 20e4 number rounding to the nearest even. + // ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF + return builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp(spv::OpIAdd, type_uint, biased_f32, + builder.makeUintConstant(3)), + builder.createTriOp(spv::OpBitFieldUExtract, type_uint, biased_f32, + builder.makeUintConstant(3), + builder.makeUintConstant(1))), + builder.makeUintConstant(3), builder.makeUintConstant(24)); +} + +spv::Id SpirvShaderTranslator::Depth20e4To32(spv::Builder& builder, + spv::Id f24_uint_scalar, + uint32_t f24_shift, + bool remap_to_0_to_0_5, + bool result_as_uint, + spv::Id ext_inst_glsl_std_450) { + // CFloat24 from d3dref9.dll + + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + + assert_true(builder.isUintType(builder.getTypeId(f24_uint_scalar))); + assert_true(f24_shift <= (32 - 24)); + + uint32_t remap_bias = uint32_t(remap_to_0_to_0_5); + + spv::Id type_bool = builder.makeBoolType(); + spv::Id type_int = builder.makeIntType(32); + spv::Id type_uint = builder.makeUintType(32); + + spv::Id f24_unbiased_exponent = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, f24_uint_scalar, + builder.makeUintConstant(f24_shift + 20), builder.makeUintConstant(4)); + spv::Id f24_mantissa = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, f24_uint_scalar, + builder.makeUintConstant(f24_shift), builder.makeUintConstant(20)); + + // The denormal nonzero 20e4 case. 
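// Reference sketch, not part of this patch: a scalar CPU equivalent of the
// float32 -> 20e4 depth packing emitted by PreClampedDepthTo20e4 above,
// assuming the input is already clamped to [0, 2). The helper name is
// hypothetical; uses <cstring> for std::memcpy and <algorithm> for std::min.
uint32_t DepthFloat32To20e4Reference(float value, bool remap_from_0_to_0_5) {
  uint32_t remap_bias = uint32_t(remap_from_0_to_0_5);
  uint32_t f32;
  std::memcpy(&f32, &value, sizeof(f32));
  uint32_t biased_f32;
  if (f32 < 0x38800000u - (remap_bias << 23)) {
    // Too small for a normalized 20e4 number - make the implicit 1 explicit
    // and shift the mantissa right, clamping the shift amount.
    uint32_t shift = std::min(113u - remap_bias - (f32 >> 23), 24u);
    biased_f32 = ((f32 & 0x7FFFFFu) | 0x800000u) >> shift;
  } else {
    // Rebias the exponent from 127 (float32) to 15 (20e4).
    biased_f32 = f32 - ((112u + remap_bias) << 23);
  }
  // Round to the nearest even and keep 4 exponent + 20 mantissa bits.
  return ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF;
}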
+ // denormal_mantissa_msb = findMSB(f24_mantissa) + spv::Id denormal_mantissa_msb; + { + spv::Instruction* denormal_mantissa_msb_instruction = + new spv::Instruction(builder.getUniqueId(), type_int, spv::OpExtInst); + denormal_mantissa_msb_instruction->addIdOperand(ext_inst_glsl_std_450); + denormal_mantissa_msb_instruction->addImmediateOperand(GLSLstd450FindUMsb); + denormal_mantissa_msb_instruction->addIdOperand(f24_mantissa); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_mantissa_msb_instruction)); + denormal_mantissa_msb = denormal_mantissa_msb_instruction->getResultId(); + } + denormal_mantissa_msb = + builder.createUnaryOp(spv::OpBitcast, type_uint, denormal_mantissa_msb); + // denormal_f32_unbiased_exponent = 1 - (20 - findMSB(f24_mantissa)) + // Or: + // denormal_f32_unbiased_exponent = findMSB(f24_mantissa) - 19 + spv::Id denormal_f32_unbiased_exponent = + builder.createBinOp(spv::OpISub, type_uint, denormal_mantissa_msb, + builder.makeUintConstant(19)); + // Normalize the mantissa. + // denormal_f32_mantissa = f24_mantissa << (20 - findMSB(f24_mantissa)) + spv::Id denormal_f32_mantissa = builder.createBinOp( + spv::OpShiftLeftLogical, type_uint, f24_mantissa, + builder.createBinOp(spv::OpISub, type_uint, builder.makeUintConstant(20), + denormal_mantissa_msb)); + // If the 20e4 number is zero, make sure the float32 number is zero too. + spv::Id f24_mantissa_is_nonzero = builder.createBinOp( + spv::OpINotEqual, type_bool, f24_mantissa, builder.makeUintConstant(0)); + // Set the unbiased exponent to -112 for zero - 112 will be added later, + // resulting in zero float32. + denormal_f32_unbiased_exponent = builder.createTriOp( + spv::OpSelect, type_uint, f24_mantissa_is_nonzero, + denormal_f32_unbiased_exponent, + builder.makeUintConstant(uint32_t(-int32_t(112 - remap_bias)))); + denormal_f32_mantissa = + builder.createTriOp(spv::OpSelect, type_uint, f24_mantissa_is_nonzero, + denormal_f32_mantissa, builder.makeUintConstant(0)); + + // Select the needed conversion depending on whether the number is normal. + spv::Id f24_is_normal = + builder.createBinOp(spv::OpINotEqual, type_bool, f24_unbiased_exponent, + builder.makeUintConstant(0)); + spv::Id f32_unbiased_exponent = builder.createTriOp( + spv::OpSelect, type_uint, f24_is_normal, f24_unbiased_exponent, + denormal_f32_unbiased_exponent); + spv::Id f32_mantissa = + builder.createTriOp(spv::OpSelect, type_uint, f24_is_normal, f24_mantissa, + denormal_f32_mantissa); + + // Bias the exponent and construct the build the float32 number. 
+ spv::Id f32_shifted; + { + spv::Instruction* f32_insert_instruction = new spv::Instruction( + builder.getUniqueId(), type_uint, spv::OpBitFieldInsert); + f32_insert_instruction->addIdOperand(f32_mantissa); + f32_insert_instruction->addIdOperand( + builder.createBinOp(spv::OpIAdd, type_uint, f32_unbiased_exponent, + builder.makeUintConstant(112 - remap_bias))); + f32_insert_instruction->addIdOperand(builder.makeUintConstant(20)); + f32_insert_instruction->addIdOperand(builder.makeUintConstant(8)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(f32_insert_instruction)); + f32_shifted = f32_insert_instruction->getResultId(); + } + spv::Id f32 = + builder.createBinOp(spv::OpShiftLeftLogical, type_uint, f32_shifted, + builder.makeUintConstant(23 - 20)); + + if (!result_as_uint) { + f32 = builder.createUnaryOp(spv::OpBitcast, builder.makeFloatType(32), f32); + } + + return f32; +} + +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/vulkan/deferred_command_buffer.cc b/src/xenia/gpu/vulkan/deferred_command_buffer.cc index 470d8adde..98d42865d 100644 --- a/src/xenia/gpu/vulkan/deferred_command_buffer.cc +++ b/src/xenia/gpu/vulkan/deferred_command_buffer.cc @@ -103,6 +103,37 @@ void DeferredCommandBuffer::Execute(VkCommandBuffer command_buffer) { args.pipeline); } break; + case Command::kVkBindVertexBuffers: { + auto& args = *reinterpret_cast(stream); + size_t offset_bytes = + xe::align(sizeof(ArgsVkBindVertexBuffers), alignof(VkBuffer)); + const VkBuffer* buffers = reinterpret_cast( + reinterpret_cast(stream) + offset_bytes); + offset_bytes = + xe::align(offset_bytes + sizeof(VkBuffer) * args.binding_count, + alignof(VkDeviceSize)); + const VkDeviceSize* offsets = reinterpret_cast( + reinterpret_cast(stream) + offset_bytes); + dfn.vkCmdBindVertexBuffers(command_buffer, args.first_binding, + args.binding_count, buffers, offsets); + } break; + + case Command::kVkClearAttachments: { + auto& args = *reinterpret_cast(stream); + size_t offset_bytes = xe::align(sizeof(ArgsVkClearAttachments), + alignof(VkClearAttachment)); + const VkClearAttachment* attachments = + reinterpret_cast( + reinterpret_cast(stream) + offset_bytes); + offset_bytes = xe::align( + offset_bytes + sizeof(VkClearAttachment) * args.attachment_count, + alignof(VkClearRect)); + const VkClearRect* rects = reinterpret_cast( + reinterpret_cast(stream) + offset_bytes); + dfn.vkCmdClearAttachments(command_buffer, args.attachment_count, + attachments, args.rect_count, rects); + } break; + case Command::kVkCopyBuffer: { auto& args = *reinterpret_cast(stream); dfn.vkCmdCopyBuffer( @@ -112,6 +143,12 @@ void DeferredCommandBuffer::Execute(VkCommandBuffer command_buffer) { xe::align(sizeof(ArgsVkCopyBuffer), alignof(VkBufferCopy)))); } break; + case Command::kVkDispatch: { + auto& args = *reinterpret_cast(stream); + dfn.vkCmdDispatch(command_buffer, args.group_count_x, + args.group_count_y, args.group_count_z); + } break; + case Command::kVkDraw: { auto& args = *reinterpret_cast(stream); dfn.vkCmdDraw(command_buffer, args.vertex_count, args.instance_count, @@ -168,6 +205,14 @@ void DeferredCommandBuffer::Execute(VkCommandBuffer command_buffer) { args.image_memory_barrier_count, image_memory_barriers); } break; + case Command::kVkPushConstants: { + auto& args = *reinterpret_cast(stream); + dfn.vkCmdPushConstants(command_buffer, args.layout, args.stage_flags, + args.offset, args.size, + reinterpret_cast(stream) + + sizeof(ArgsVkPushConstants)); + } break; + case Command::kVkSetBlendConstants: { auto& args = 
*reinterpret_cast(stream); dfn.vkCmdSetBlendConstants(command_buffer, args.blend_constants); diff --git a/src/xenia/gpu/vulkan/deferred_command_buffer.h b/src/xenia/gpu/vulkan/deferred_command_buffer.h index ac4c88f85..e3605f1e6 100644 --- a/src/xenia/gpu/vulkan/deferred_command_buffer.h +++ b/src/xenia/gpu/vulkan/deferred_command_buffer.h @@ -108,6 +108,61 @@ class DeferredCommandBuffer { args.pipeline = pipeline; } + void CmdVkBindVertexBuffers(uint32_t first_binding, uint32_t binding_count, + const VkBuffer* buffers, + const VkDeviceSize* offsets) { + size_t arguments_size = + xe::align(sizeof(ArgsVkBindVertexBuffers), alignof(VkBuffer)); + size_t buffers_offset = arguments_size; + arguments_size = + xe::align(arguments_size + sizeof(VkBuffer) * binding_count, + alignof(VkDeviceSize)); + size_t offsets_offset = arguments_size; + arguments_size += sizeof(VkDeviceSize) * binding_count; + uint8_t* args_ptr = reinterpret_cast( + WriteCommand(Command::kVkBindVertexBuffers, arguments_size)); + auto& args = *reinterpret_cast(args_ptr); + args.first_binding = first_binding; + args.binding_count = binding_count; + std::memcpy(args_ptr + buffers_offset, buffers, + sizeof(VkBuffer) * binding_count); + std::memcpy(args_ptr + offsets_offset, offsets, + sizeof(VkDeviceSize) * binding_count); + } + + void CmdClearAttachmentsEmplace(uint32_t attachment_count, + VkClearAttachment*& attachments_out, + uint32_t rect_count, + VkClearRect*& rects_out) { + size_t arguments_size = + xe::align(sizeof(ArgsVkClearAttachments), alignof(VkClearAttachment)); + size_t attachments_offset = arguments_size; + arguments_size = + xe::align(arguments_size + sizeof(VkClearAttachment) * attachment_count, + alignof(VkClearRect)); + size_t rects_offset = arguments_size; + arguments_size += sizeof(VkClearRect) * rect_count; + uint8_t* args_ptr = reinterpret_cast( + WriteCommand(Command::kVkClearAttachments, arguments_size)); + auto& args = *reinterpret_cast(args_ptr); + args.attachment_count = attachment_count; + args.rect_count = rect_count; + attachments_out = + reinterpret_cast(args_ptr + attachments_offset); + rects_out = reinterpret_cast(args_ptr + rects_offset); + } + void CmdVkClearAttachments(uint32_t attachment_count, + const VkClearAttachment* attachments, + uint32_t rect_count, const VkClearRect* rects) { + VkClearAttachment* attachments_arg; + VkClearRect* rects_arg; + CmdClearAttachmentsEmplace(attachment_count, attachments_arg, rect_count, + rects_arg); + std::memcpy(attachments_arg, attachments, + sizeof(VkClearAttachment) * attachment_count); + std::memcpy(rects_arg, rects, sizeof(VkClearRect) * rect_count); + } + VkBufferCopy* CmdCopyBufferEmplace(VkBuffer src_buffer, VkBuffer dst_buffer, uint32_t region_count) { const size_t header_size = @@ -127,6 +182,15 @@ class DeferredCommandBuffer { regions, sizeof(VkBufferCopy) * region_count); } + void CmdVkDispatch(uint32_t group_count_x, uint32_t group_count_y, + uint32_t group_count_z) { + auto& args = *reinterpret_cast( + WriteCommand(Command::kVkDispatch, sizeof(ArgsVkDispatch))); + args.group_count_x = group_count_x; + args.group_count_y = group_count_y; + args.group_count_z = group_count_z; + } + void CmdVkDraw(uint32_t vertex_count, uint32_t instance_count, uint32_t first_vertex, uint32_t first_instance) { auto& args = *reinterpret_cast( @@ -162,6 +226,19 @@ class DeferredCommandBuffer { uint32_t image_memory_barrier_count, const VkImageMemoryBarrier* image_memory_barriers); + void CmdVkPushConstants(VkPipelineLayout layout, + VkShaderStageFlags stage_flags, 
uint32_t offset, + uint32_t size, const void* values) { + uint8_t* args_ptr = reinterpret_cast(WriteCommand( + Command::kVkPushConstants, sizeof(ArgsVkPushConstants) + size)); + auto& args = *reinterpret_cast(args_ptr); + args.layout = layout; + args.stage_flags = stage_flags; + args.offset = offset; + args.size = size; + std::memcpy(args_ptr + sizeof(ArgsVkPushConstants), values, size); + } + void CmdVkSetBlendConstants(const float* blend_constants) { auto& args = *reinterpret_cast(WriteCommand( Command::kVkSetBlendConstants, sizeof(ArgsVkSetBlendConstants))); @@ -237,11 +314,15 @@ class DeferredCommandBuffer { kVkBindDescriptorSets, kVkBindIndexBuffer, kVkBindPipeline, + kVkBindVertexBuffers, + kVkClearAttachments, kVkCopyBuffer, + kVkDispatch, kVkDraw, kVkDrawIndexed, kVkEndRenderPass, kVkPipelineBarrier, + kVkPushConstants, kVkSetBlendConstants, kVkSetDepthBias, kVkSetScissor, @@ -289,6 +370,22 @@ class DeferredCommandBuffer { VkPipeline pipeline; }; + struct ArgsVkBindVertexBuffers { + uint32_t first_binding; + uint32_t binding_count; + // Followed by aligned VkBuffer[], VkDeviceSize[]. + static_assert(alignof(VkBuffer) <= alignof(uintmax_t)); + static_assert(alignof(VkDeviceSize) <= alignof(uintmax_t)); + }; + + struct ArgsVkClearAttachments { + uint32_t attachment_count; + uint32_t rect_count; + // Followed by aligned VkClearAttachment[], VkClearRect[]. + static_assert(alignof(VkClearAttachment) <= alignof(uintmax_t)); + static_assert(alignof(VkClearRect) <= alignof(uintmax_t)); + }; + struct ArgsVkCopyBuffer { VkBuffer src_buffer; VkBuffer dst_buffer; @@ -297,6 +394,12 @@ class DeferredCommandBuffer { static_assert(alignof(VkBufferCopy) <= alignof(uintmax_t)); }; + struct ArgsVkDispatch { + uint32_t group_count_x; + uint32_t group_count_y; + uint32_t group_count_z; + }; + struct ArgsVkDraw { uint32_t vertex_count; uint32_t instance_count; @@ -326,6 +429,14 @@ class DeferredCommandBuffer { static_assert(alignof(VkImageMemoryBarrier) <= alignof(uintmax_t)); }; + struct ArgsVkPushConstants { + VkPipelineLayout layout; + VkShaderStageFlags stage_flags; + uint32_t offset; + uint32_t size; + // Followed by `size` bytes of values. 
+ }; + struct ArgsVkSetBlendConstants { float blend_constants[4]; }; diff --git a/src/xenia/gpu/vulkan/premake5.lua b/src/xenia/gpu/vulkan/premake5.lua index 44205f326..ffc359504 100644 --- a/src/xenia/gpu/vulkan/premake5.lua +++ b/src/xenia/gpu/vulkan/premake5.lua @@ -8,6 +8,7 @@ project("xenia-gpu-vulkan") language("C++") links({ "fmt", + "glslang-spirv", "xenia-base", "xenia-gpu", "xenia-ui", diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 69d0c70a3..4f534c9dd 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -476,7 +476,7 @@ bool VulkanCommandProcessor::SetupContext() { swap_pipeline_create_info.renderPass = swap_render_pass_; swap_pipeline_create_info.subpass = 0; swap_pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE; - swap_pipeline_create_info.basePipelineIndex = UINT32_MAX; + swap_pipeline_create_info.basePipelineIndex = -1; VkResult swap_pipeline_create_result = dfn.vkCreateGraphicsPipelines( device, VK_NULL_HANDLE, 1, &swap_pipeline_create_info, nullptr, &swap_pipeline_); @@ -810,8 +810,6 @@ void VulkanCommandProcessor::IssueSwap(uint32_t frontbuffer_ptr, deferred_command_buffer_.CmdVkBeginRenderPass( &render_pass_begin_info, VK_SUBPASS_CONTENTS_INLINE); - dynamic_viewport_update_needed_ = true; - dynamic_scissor_update_needed_ = true; VkViewport viewport; viewport.x = 0.0f; viewport.y = 0.0f; @@ -819,13 +817,13 @@ void VulkanCommandProcessor::IssueSwap(uint32_t frontbuffer_ptr, viewport.height = float(scaled_height); viewport.minDepth = 0.0f; viewport.maxDepth = 1.0f; - deferred_command_buffer_.CmdVkSetViewport(0, 1, &viewport); - VkRect2D scissor_rect; - scissor_rect.offset.x = 0; - scissor_rect.offset.y = 0; - scissor_rect.extent.width = scaled_width; - scissor_rect.extent.height = scaled_height; - deferred_command_buffer_.CmdVkSetScissor(0, 1, &scissor_rect); + SetViewport(viewport); + VkRect2D scissor; + scissor.offset.x = 0; + scissor.offset.y = 0; + scissor.extent.width = scaled_width; + scissor.extent.height = scaled_height; + SetScissor(scissor); BindExternalGraphicsPipeline(swap_pipeline_); @@ -856,7 +854,7 @@ void VulkanCommandProcessor::IssueSwap(uint32_t frontbuffer_ptr, EndSubmission(true); } -void VulkanCommandProcessor::PushBufferMemoryBarrier( +bool VulkanCommandProcessor::PushBufferMemoryBarrier( VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask, @@ -865,7 +863,7 @@ void VulkanCommandProcessor::PushBufferMemoryBarrier( if (skip_if_equal && src_stage_mask == dst_stage_mask && src_access_mask == dst_access_mask && src_queue_family_index == dst_queue_family_index) { - return; + return false; } // Separate different barriers for overlapping buffer ranges into different @@ -889,10 +887,10 @@ void VulkanCommandProcessor::PushBufferMemoryBarrier( src_queue_family_index && other_buffer_memory_barrier.dstQueueFamilyIndex == dst_queue_family_index) { - // The barrier is already present. + // The barrier is already pending. 
current_pending_barrier_.src_stage_mask |= src_stage_mask; current_pending_barrier_.dst_stage_mask |= dst_stage_mask; - return; + return true; } SplitPendingBarrier(); break; @@ -911,9 +909,10 @@ void VulkanCommandProcessor::PushBufferMemoryBarrier( buffer_memory_barrier.buffer = buffer; buffer_memory_barrier.offset = offset; buffer_memory_barrier.size = size; + return true; } -void VulkanCommandProcessor::PushImageMemoryBarrier( +bool VulkanCommandProcessor::PushImageMemoryBarrier( VkImage image, const VkImageSubresourceRange& subresource_range, VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask, @@ -923,7 +922,7 @@ void VulkanCommandProcessor::PushImageMemoryBarrier( if (skip_if_equal && src_stage_mask == dst_stage_mask && src_access_mask == dst_access_mask && old_layout == new_layout && src_queue_family_index == dst_queue_family_index) { - return; + return false; } // Separate different barriers for overlapping image subresource ranges into @@ -969,10 +968,10 @@ void VulkanCommandProcessor::PushImageMemoryBarrier( src_queue_family_index && other_image_memory_barrier.dstQueueFamilyIndex == dst_queue_family_index) { - // The barrier is already present. + // The barrier is already pending. current_pending_barrier_.src_stage_mask |= src_stage_mask; current_pending_barrier_.dst_stage_mask |= dst_stage_mask; - return; + return true; } SplitPendingBarrier(); break; @@ -992,6 +991,7 @@ void VulkanCommandProcessor::PushImageMemoryBarrier( image_memory_barrier.dstQueueFamilyIndex = dst_queue_family_index; image_memory_barrier.image = image; image_memory_barrier.subresourceRange = subresource_range; + return true; } bool VulkanCommandProcessor::SubmitBarriers(bool force_end_render_pass) { @@ -1257,6 +1257,53 @@ void VulkanCommandProcessor::BindExternalGraphicsPipeline( current_guest_graphics_pipeline_layout_ = VK_NULL_HANDLE; } +void VulkanCommandProcessor::BindExternalComputePipeline(VkPipeline pipeline) { + if (current_external_compute_pipeline_ == pipeline) { + return; + } + deferred_command_buffer_.CmdVkBindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline); + current_external_compute_pipeline_ = pipeline; +} + +void VulkanCommandProcessor::SetViewport(const VkViewport& viewport) { + if (!dynamic_viewport_update_needed_) { + dynamic_viewport_update_needed_ |= dynamic_viewport_.x != viewport.x; + dynamic_viewport_update_needed_ |= dynamic_viewport_.y != viewport.y; + dynamic_viewport_update_needed_ |= + dynamic_viewport_.width != viewport.width; + dynamic_viewport_update_needed_ |= + dynamic_viewport_.height != viewport.height; + dynamic_viewport_update_needed_ |= + dynamic_viewport_.minDepth != viewport.minDepth; + dynamic_viewport_update_needed_ |= + dynamic_viewport_.maxDepth != viewport.maxDepth; + } + if (dynamic_viewport_update_needed_) { + dynamic_viewport_ = viewport; + deferred_command_buffer_.CmdVkSetViewport(0, 1, &dynamic_viewport_); + dynamic_viewport_update_needed_ = false; + } +} + +void VulkanCommandProcessor::SetScissor(const VkRect2D& scissor) { + if (!dynamic_scissor_update_needed_) { + dynamic_scissor_update_needed_ |= + dynamic_scissor_.offset.x != scissor.offset.x; + dynamic_scissor_update_needed_ |= + dynamic_scissor_.offset.y != scissor.offset.y; + dynamic_scissor_update_needed_ |= + dynamic_scissor_.extent.width != scissor.extent.width; + dynamic_scissor_update_needed_ |= + dynamic_scissor_.extent.height != scissor.extent.height; + } + if (dynamic_scissor_update_needed_) { + 
dynamic_scissor_ = scissor; + deferred_command_buffer_.CmdVkSetScissor(0, 1, &dynamic_scissor_); + dynamic_scissor_update_needed_ = false; + } +} + Shader* VulkanCommandProcessor::LoadShader(xenos::ShaderType shader_type, uint32_t guest_address, const uint32_t* host_address, @@ -1417,8 +1464,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, } const ui::vulkan::VulkanProvider& provider = GetVulkanProvider(); - const VkPhysicalDeviceProperties& device_properties = - provider.device_properties(); + const VkPhysicalDeviceLimits& device_limits = + provider.device_properties().limits; // Get dynamic rasterizer state. draw_util::ViewportInfo viewport_info; @@ -1438,10 +1485,10 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // life. Or even disregard the viewport bounds range in the fragment shader // interlocks case completely - apply the viewport and the scissor offset // directly to pixel address and to things like ps_param_gen. - draw_util::GetHostViewportInfo( - regs, 1, 1, false, device_properties.limits.maxViewportDimensions[0], - device_properties.limits.maxViewportDimensions[1], true, false, false, - false, viewport_info); + draw_util::GetHostViewportInfo(regs, 1, 1, false, + device_limits.maxViewportDimensions[0], + device_limits.maxViewportDimensions[1], true, + false, false, false, viewport_info); // Update dynamic graphics pipeline state. UpdateDynamicState(viewport_info, primitive_polygonal); @@ -1675,6 +1722,8 @@ void VulkanCommandProcessor::CheckSubmissionFenceAndDeviceLoss( primitive_processor_->CompletedSubmissionUpdated(); + render_target_cache_->CompletedSubmissionUpdated(); + // Destroy outdated swap objects. while (!swap_framebuffers_outdated_.empty()) { const auto& framebuffer_pair = swap_framebuffers_outdated_.front(); @@ -1752,6 +1801,7 @@ bool VulkanCommandProcessor::BeginSubmission(bool is_guest_command) { current_framebuffer_ = nullptr; current_guest_graphics_pipeline_ = VK_NULL_HANDLE; current_external_graphics_pipeline_ = VK_NULL_HANDLE; + current_external_compute_pipeline_ = VK_NULL_HANDLE; current_guest_graphics_pipeline_layout_ = nullptr; current_graphics_descriptor_sets_bound_up_to_date_ = 0; @@ -1861,6 +1911,8 @@ bool VulkanCommandProcessor::EndSubmission(bool is_swap) { if (submission_open_) { EndRenderPass(); + render_target_cache_->EndSubmission(); + primitive_processor_->EndSubmission(); shared_memory_->EndSubmission(); @@ -2112,20 +2164,7 @@ void VulkanCommandProcessor::UpdateDynamicState( } viewport.minDepth = viewport_info.z_min; viewport.maxDepth = viewport_info.z_max; - dynamic_viewport_update_needed_ |= dynamic_viewport_.x != viewport.x; - dynamic_viewport_update_needed_ |= dynamic_viewport_.y != viewport.y; - dynamic_viewport_update_needed_ |= dynamic_viewport_.width != viewport.width; - dynamic_viewport_update_needed_ |= - dynamic_viewport_.height != viewport.height; - dynamic_viewport_update_needed_ |= - dynamic_viewport_.minDepth != viewport.minDepth; - dynamic_viewport_update_needed_ |= - dynamic_viewport_.maxDepth != viewport.maxDepth; - if (dynamic_viewport_update_needed_) { - dynamic_viewport_ = viewport; - deferred_command_buffer_.CmdVkSetViewport(0, 1, &dynamic_viewport_); - dynamic_viewport_update_needed_ = false; - } + SetViewport(viewport); // Scissor. 
draw_util::Scissor scissor; @@ -2135,19 +2174,7 @@ void VulkanCommandProcessor::UpdateDynamicState( scissor_rect.offset.y = int32_t(scissor.offset[1]); scissor_rect.extent.width = scissor.extent[0]; scissor_rect.extent.height = scissor.extent[1]; - dynamic_scissor_update_needed_ |= - dynamic_scissor_.offset.x != scissor_rect.offset.x; - dynamic_scissor_update_needed_ |= - dynamic_scissor_.offset.y != scissor_rect.offset.y; - dynamic_scissor_update_needed_ |= - dynamic_scissor_.extent.width != scissor_rect.extent.width; - dynamic_scissor_update_needed_ |= - dynamic_scissor_.extent.height != scissor_rect.extent.height; - if (dynamic_scissor_update_needed_) { - dynamic_scissor_ = scissor_rect; - deferred_command_buffer_.CmdVkSetScissor(0, 1, &dynamic_scissor_); - dynamic_scissor_update_needed_ = false; - } + SetScissor(scissor_rect); // Depth bias. // TODO(Triang3l): Disable the depth bias for the fragment shader interlock RB diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 551a3fcae..54c25d22f 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2020 Ben Vanik. All rights reserved. * + * Copyright 2022 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ @@ -81,15 +81,16 @@ class VulkanCommandProcessor : public CommandProcessor { uint64_t GetCurrentFrame() const { return frame_current_; } uint64_t GetCompletedFrame() const { return frame_completed_; } - // Submission must be open to insert barriers. - void PushBufferMemoryBarrier( + // Submission must be open to insert barriers. Returning true if the barrier + // has actually been inserted and not dropped. + bool PushBufferMemoryBarrier( VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask, uint32_t src_queue_family_index = VK_QUEUE_FAMILY_IGNORED, uint32_t dst_queue_family_index = VK_QUEUE_FAMILY_IGNORED, bool skip_if_equal = true); - void PushImageMemoryBarrier( + bool PushImageMemoryBarrier( VkImage image, const VkImageSubresourceRange& subresource_range, VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask, @@ -125,6 +126,9 @@ class VulkanCommandProcessor : public CommandProcessor { bool keep_dynamic_depth_bias = false, bool keep_dynamic_blend_constants = false, bool keep_dynamic_stencil_mask_ref = false); + void BindExternalComputePipeline(VkPipeline pipeline); + void SetViewport(const VkViewport& viewport); + void SetScissor(const VkRect2D& scissor); protected: bool SetupContext() override; @@ -211,6 +215,9 @@ class VulkanCommandProcessor : public CommandProcessor { // open non-frame submission, BeginSubmission(true) will promote it to a // frame. EndSubmission(true) will close the frame no matter whether the // submission has already been closed. + // Unlike on Direct3D 12, submission boundaries do not imply any memory + // barriers aside from an incoming host write (but not outgoing host read) + // dependency. 
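// Illustrative sketch, not code from this patch: the new boolean result of
// PushBufferMemoryBarrier / PushImageMemoryBarrier distinguishes "a barrier is
// now pending (newly added or merged into the pending batch)" from "the
// request was fully redundant and dropped", which callers can feed into their
// own state tracking. The function and variable names below are placeholders.
bool RequestEdramComputeToFragmentBarrier(
    VulkanCommandProcessor& command_processor, VkBuffer edram_buffer) {
  // Returns true if a barrier is now pending and will be recorded by the next
  // SubmitBarriers, false if the request was dropped as a no-op.
  return command_processor.PushBufferMemoryBarrier(
      edram_buffer, 0, VK_WHOLE_SIZE, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
      VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_ACCESS_SHADER_WRITE_BIT,
      VK_ACCESS_SHADER_READ_BIT);
}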
// Rechecks submission number and reclaims per-submission resources. Pass 0 as // the submission to await to simply check status, or pass @@ -396,6 +403,7 @@ class VulkanCommandProcessor : public CommandProcessor { // TODO(Triang3l): Change to a deferred compilation handle. VkPipeline current_guest_graphics_pipeline_; VkPipeline current_external_graphics_pipeline_; + VkPipeline current_external_compute_pipeline_; // Pipeline layout of the current guest graphics pipeline. const PipelineLayout* current_guest_graphics_pipeline_layout_; diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc index 8f581f0fa..450a346b0 100644 --- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc @@ -884,11 +884,25 @@ bool VulkanPipelineCache::EnsurePipelineCreated( // TODO(Triang3l): Wide lines. rasterization_state.lineWidth = 1.0f; + VkSampleMask sample_mask = UINT32_MAX; VkPipelineMultisampleStateCreateInfo multisample_state = {}; multisample_state.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - multisample_state.rasterizationSamples = VkSampleCountFlagBits( - uint32_t(1) << uint32_t(description.render_pass_key.msaa_samples)); + if (description.render_pass_key.msaa_samples == xenos::MsaaSamples::k2X && + !render_target_cache_.IsMsaa2xSupported( + description.render_pass_key.depth_and_color_used != 0)) { + // Using sample 0 as 0 and 3 as 1 for 2x instead (not exactly the same + // sample locations, but still top-left and bottom-right - however, this can + // be adjusted with custom sample locations). + multisample_state.rasterizationSamples = VK_SAMPLE_COUNT_4_BIT; + sample_mask = 0b1001; + // TODO(Triang3l): Research sample mask behavior without attachments (in + // Direct3D, it's completely ignored in this case). 
+ multisample_state.pSampleMask = &sample_mask; + } else { + multisample_state.rasterizationSamples = VkSampleCountFlagBits( + uint32_t(1) << uint32_t(description.render_pass_key.msaa_samples)); + } VkPipelineDepthStencilStateCreateInfo depth_stencil_state = {}; depth_stencil_state.sType = @@ -1061,7 +1075,7 @@ bool VulkanPipelineCache::EnsurePipelineCreated( pipeline_create_info.renderPass = creation_arguments.render_pass; pipeline_create_info.subpass = 0; pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE; - pipeline_create_info.basePipelineIndex = UINT32_MAX; + pipeline_create_info.basePipelineIndex = -1; const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); VkDevice device = provider.device(); diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc index 24eb8e14b..b029f64dd 100644 --- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc @@ -10,23 +10,109 @@ #include "xenia/gpu/vulkan/vulkan_render_target_cache.h" #include +#include #include #include #include +#include #include #include +#include +#include "third_party/glslang/SPIRV/GLSL.std.450.h" +#include "third_party/glslang/SPIRV/SpvBuilder.h" #include "xenia/base/assert.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" +#include "xenia/gpu/draw_util.h" #include "xenia/gpu/registers.h" +#include "xenia/gpu/spirv_shader_translator.h" +#include "xenia/gpu/vulkan/deferred_command_buffer.h" #include "xenia/gpu/vulkan/vulkan_command_processor.h" +#include "xenia/gpu/xenos.h" #include "xenia/ui/vulkan/vulkan_util.h" namespace xe { namespace gpu { namespace vulkan { +// Generated with `xb buildshaders`. +namespace shaders { +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/host_depth_store_1xmsaa_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/host_depth_store_2xmsaa_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/host_depth_store_4xmsaa_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/passthrough_position_xy_vs.h" +} // namespace shaders + +const VulkanRenderTargetCache::TransferPipelineLayoutInfo + VulkanRenderTargetCache::kTransferPipelineLayoutInfos[size_t( + TransferPipelineLayoutIndex::kCount)] = { + // kColor + {kTransferUsedDescriptorSetColorTextureBit, + kTransferUsedPushConstantDwordAddressBit}, + // kDepth + {kTransferUsedDescriptorSetDepthStencilTexturesBit, + kTransferUsedPushConstantDwordAddressBit}, + // kColorToStencilBit + {kTransferUsedDescriptorSetColorTextureBit, + kTransferUsedPushConstantDwordAddressBit | + kTransferUsedPushConstantDwordStencilMaskBit}, + // kDepthToStencilBit + {kTransferUsedDescriptorSetDepthStencilTexturesBit, + kTransferUsedPushConstantDwordAddressBit | + kTransferUsedPushConstantDwordStencilMaskBit}, + // kColorAndHostDepthTexture + {kTransferUsedDescriptorSetHostDepthStencilTexturesBit | + kTransferUsedDescriptorSetColorTextureBit, + kTransferUsedPushConstantDwordHostDepthAddressBit | + kTransferUsedPushConstantDwordAddressBit}, + // kColorAndHostDepthBuffer + {kTransferUsedDescriptorSetHostDepthBufferBit | + kTransferUsedDescriptorSetColorTextureBit, + kTransferUsedPushConstantDwordHostDepthAddressBit | + kTransferUsedPushConstantDwordAddressBit}, + // kDepthAndHostDepthTexture + {kTransferUsedDescriptorSetHostDepthStencilTexturesBit | + kTransferUsedDescriptorSetDepthStencilTexturesBit, + kTransferUsedPushConstantDwordHostDepthAddressBit | + kTransferUsedPushConstantDwordAddressBit}, + // 
kDepthAndHostDepthBuffer + {kTransferUsedDescriptorSetHostDepthBufferBit | + kTransferUsedDescriptorSetDepthStencilTexturesBit, + kTransferUsedPushConstantDwordHostDepthAddressBit | + kTransferUsedPushConstantDwordAddressBit}, +}; + +const VulkanRenderTargetCache::TransferModeInfo + VulkanRenderTargetCache::kTransferModes[size_t(TransferMode::kCount)] = { + // kColorToDepth + {TransferOutput::kDepth, TransferPipelineLayoutIndex::kColor}, + // kColorToColor + {TransferOutput::kColor, TransferPipelineLayoutIndex::kColor}, + // kDepthToDepth + {TransferOutput::kDepth, TransferPipelineLayoutIndex::kDepth}, + // kDepthToColor + {TransferOutput::kColor, TransferPipelineLayoutIndex::kDepth}, + // kColorToStencilBit + {TransferOutput::kStencilBit, + TransferPipelineLayoutIndex::kColorToStencilBit}, + // kDepthToStencilBit + {TransferOutput::kStencilBit, + TransferPipelineLayoutIndex::kDepthToStencilBit}, + // kColorAndHostDepthToDepth + {TransferOutput::kDepth, + TransferPipelineLayoutIndex::kColorAndHostDepthTexture}, + // kDepthAndHostDepthToDepth + {TransferOutput::kDepth, + TransferPipelineLayoutIndex::kDepthAndHostDepthTexture}, + // kColorAndHostDepthCopyToDepth + {TransferOutput::kDepth, + TransferPipelineLayoutIndex::kColorAndHostDepthBuffer}, + // kDepthAndHostDepthCopyToDepth + {TransferOutput::kDepth, + TransferPipelineLayoutIndex::kDepthAndHostDepthBuffer}, +}; + VulkanRenderTargetCache::VulkanRenderTargetCache( VulkanCommandProcessor& command_processor, const RegisterFile& register_file) @@ -35,6 +121,342 @@ VulkanRenderTargetCache::VulkanRenderTargetCache( VulkanRenderTargetCache::~VulkanRenderTargetCache() { Shutdown(true); } bool VulkanRenderTargetCache::Initialize() { + const ui::vulkan::VulkanProvider& provider = + command_processor_.GetVulkanProvider(); + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + + // Descriptor set layouts. 
+ VkDescriptorSetLayoutBinding descriptor_set_layout_bindings[2]; + descriptor_set_layout_bindings[0].binding = 0; + descriptor_set_layout_bindings[0].descriptorType = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + descriptor_set_layout_bindings[0].descriptorCount = 1; + descriptor_set_layout_bindings[0].stageFlags = + VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_COMPUTE_BIT; + descriptor_set_layout_bindings[0].pImmutableSamplers = nullptr; + VkDescriptorSetLayoutCreateInfo descriptor_set_layout_create_info; + descriptor_set_layout_create_info.sType = + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + descriptor_set_layout_create_info.pNext = nullptr; + descriptor_set_layout_create_info.flags = 0; + descriptor_set_layout_create_info.bindingCount = 1; + descriptor_set_layout_create_info.pBindings = descriptor_set_layout_bindings; + if (dfn.vkCreateDescriptorSetLayout( + device, &descriptor_set_layout_create_info, nullptr, + &descriptor_set_layout_storage_buffer_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the descriptor set layout " + "with one storage buffer"); + Shutdown(); + return false; + } + descriptor_set_layout_bindings[0].descriptorType = + VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + if (dfn.vkCreateDescriptorSetLayout( + device, &descriptor_set_layout_create_info, nullptr, + &descriptor_set_layout_sampled_image_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the descriptor set layout " + "with one sampled image"); + Shutdown(); + return false; + } + descriptor_set_layout_bindings[1].binding = 1; + descriptor_set_layout_bindings[1].descriptorType = + VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + descriptor_set_layout_bindings[1].descriptorCount = 1; + descriptor_set_layout_bindings[1].stageFlags = + descriptor_set_layout_bindings[0].stageFlags; + descriptor_set_layout_bindings[1].pImmutableSamplers = nullptr; + descriptor_set_layout_create_info.bindingCount = 2; + if (dfn.vkCreateDescriptorSetLayout( + device, &descriptor_set_layout_create_info, nullptr, + &descriptor_set_layout_sampled_image_x2_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the descriptor set layout " + "with two sampled images"); + Shutdown(); + return false; + } + + // Descriptor set pools. + // The pool sizes were chosen without a specific reason. + VkDescriptorPoolSize descriptor_set_layout_size; + descriptor_set_layout_size.type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + descriptor_set_layout_size.descriptorCount = 1; + descriptor_set_pool_sampled_image_ = + std::make_unique( + provider, 256, 1, &descriptor_set_layout_size, + descriptor_set_layout_sampled_image_); + descriptor_set_layout_size.descriptorCount = 2; + descriptor_set_pool_sampled_image_x2_ = + std::make_unique( + provider, 256, 1, &descriptor_set_layout_size, + descriptor_set_layout_sampled_image_x2_); + + // EDRAM contents reinterpretation buffer. + // 90 MB with 9x resolution scaling - within the minimum + // maxStorageBufferRange. + if (!ui::vulkan::util::CreateDedicatedAllocationBuffer( + provider, + VkDeviceSize(xenos::kEdramSizeBytes * resolution_scale_x_ * + resolution_scale_y_), + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + ui::vulkan::util::MemoryPurpose::kDeviceLocal, edram_buffer_, + edram_buffer_memory_)) { + XELOGE("VulkanRenderTargetCache: Failed to create the EDRAM buffer"); + Shutdown(); + return false; + } + if (GetPath() == Path::kPixelShaderInterlock) { + // The first operation will likely be drawing. 
+ edram_buffer_usage_ = EdramBufferUsage::kFragmentReadWrite; + } else { + // The first operation will likely be depth self-comparison. + edram_buffer_usage_ = EdramBufferUsage::kFragmentRead; + } + edram_buffer_modification_status_ = + EdramBufferModificationStatus::kUnmodified; + VkDescriptorPoolSize edram_storage_buffer_descriptor_pool_size; + edram_storage_buffer_descriptor_pool_size.type = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + edram_storage_buffer_descriptor_pool_size.descriptorCount = 1; + VkDescriptorPoolCreateInfo edram_storage_buffer_descriptor_pool_create_info; + edram_storage_buffer_descriptor_pool_create_info.sType = + VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + edram_storage_buffer_descriptor_pool_create_info.pNext = nullptr; + edram_storage_buffer_descriptor_pool_create_info.flags = 0; + edram_storage_buffer_descriptor_pool_create_info.maxSets = 1; + edram_storage_buffer_descriptor_pool_create_info.poolSizeCount = 1; + edram_storage_buffer_descriptor_pool_create_info.pPoolSizes = + &edram_storage_buffer_descriptor_pool_size; + if (dfn.vkCreateDescriptorPool( + device, &edram_storage_buffer_descriptor_pool_create_info, nullptr, + &edram_storage_buffer_descriptor_pool_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the EDRAM buffer storage " + "buffer descriptor pool"); + Shutdown(); + return false; + } + VkDescriptorSetAllocateInfo edram_storage_buffer_descriptor_set_allocate_info; + edram_storage_buffer_descriptor_set_allocate_info.sType = + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + edram_storage_buffer_descriptor_set_allocate_info.pNext = nullptr; + edram_storage_buffer_descriptor_set_allocate_info.descriptorPool = + edram_storage_buffer_descriptor_pool_; + edram_storage_buffer_descriptor_set_allocate_info.descriptorSetCount = 1; + edram_storage_buffer_descriptor_set_allocate_info.pSetLayouts = + &descriptor_set_layout_storage_buffer_; + if (dfn.vkAllocateDescriptorSets( + device, &edram_storage_buffer_descriptor_set_allocate_info, + &edram_storage_buffer_descriptor_set_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to allocate the EDRAM buffer storage " + "buffer descriptor set"); + Shutdown(); + return false; + } + VkDescriptorBufferInfo edram_storage_buffer_descriptor_buffer_info; + edram_storage_buffer_descriptor_buffer_info.buffer = edram_buffer_; + edram_storage_buffer_descriptor_buffer_info.offset = 0; + edram_storage_buffer_descriptor_buffer_info.range = VK_WHOLE_SIZE; + VkWriteDescriptorSet edram_storage_buffer_descriptor_write; + edram_storage_buffer_descriptor_write.sType = + VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + edram_storage_buffer_descriptor_write.pNext = nullptr; + edram_storage_buffer_descriptor_write.dstSet = + edram_storage_buffer_descriptor_set_; + edram_storage_buffer_descriptor_write.dstBinding = 0; + edram_storage_buffer_descriptor_write.dstArrayElement = 0; + edram_storage_buffer_descriptor_write.descriptorCount = 1; + edram_storage_buffer_descriptor_write.descriptorType = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + edram_storage_buffer_descriptor_write.pImageInfo = nullptr; + edram_storage_buffer_descriptor_write.pBufferInfo = + &edram_storage_buffer_descriptor_buffer_info; + edram_storage_buffer_descriptor_write.pTexelBufferView = nullptr; + dfn.vkUpdateDescriptorSets(device, 1, &edram_storage_buffer_descriptor_write, + 0, nullptr); + + // TODO(Triang3l): All paths (FSI). + + // TODO(Triang3l): Handle sampledImageIntegerSampleCounts 4 not supported in + // transfers. 
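// Illustrative sketch, not code from this patch: rough shape of how the new
// compute-side plumbing added in this change (BindExternalComputePipeline,
// CmdVkPushConstants, CmdVkDispatch) could drive one host depth store pass.
// Descriptor set binding is omitted, and the function and parameter names
// below are placeholders rather than the render target cache's actual code.
void DispatchHostDepthStoreSketch(VulkanCommandProcessor& command_processor,
                                  DeferredCommandBuffer& command_buffer,
                                  VkPipeline host_depth_store_pipeline,
                                  VkPipelineLayout host_depth_store_layout,
                                  const HostDepthStoreConstants& constants,
                                  uint32_t group_count_x,
                                  uint32_t group_count_y) {
  // Bind the compute pipeline through the processor so redundant binds of the
  // same external pipeline are skipped.
  command_processor.BindExternalComputePipeline(host_depth_store_pipeline);
  // Record the push constants and the dispatch into the deferred command
  // buffer; both are replayed later in Execute().
  command_buffer.CmdVkPushConstants(host_depth_store_layout,
                                    VK_SHADER_STAGE_COMPUTE_BIT, 0,
                                    sizeof(constants), &constants);
  command_buffer.CmdVkDispatch(group_count_x, group_count_y, 1);
}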
+ if (cvars::native_2x_msaa) { + const VkPhysicalDeviceLimits& device_limits = + provider.device_properties().limits; + // Multisampled integer sampled images are optional in Vulkan and in Xenia. + msaa_2x_attachments_supported_ = + (device_limits.framebufferColorSampleCounts & + device_limits.framebufferDepthSampleCounts & + device_limits.framebufferStencilSampleCounts & + device_limits.sampledImageColorSampleCounts & + device_limits.sampledImageDepthSampleCounts & + device_limits.sampledImageStencilSampleCounts & + VK_SAMPLE_COUNT_2_BIT) && + (device_limits.sampledImageIntegerSampleCounts & + (VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT)) != + VK_SAMPLE_COUNT_4_BIT; + msaa_2x_no_attachments_supported_ = + (device_limits.framebufferNoAttachmentsSampleCounts & + VK_SAMPLE_COUNT_2_BIT) != 0; + } else { + msaa_2x_attachments_supported_ = false; + msaa_2x_no_attachments_supported_ = false; + } + + // Host depth storing pipeline layout. + VkDescriptorSetLayout host_depth_store_descriptor_set_layouts[] = { + // Destination EDRAM storage buffer. + descriptor_set_layout_storage_buffer_, + // Source depth / stencil texture (only depth is used). + descriptor_set_layout_sampled_image_x2_, + }; + VkPushConstantRange host_depth_store_push_constant_range; + host_depth_store_push_constant_range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + host_depth_store_push_constant_range.offset = 0; + host_depth_store_push_constant_range.size = sizeof(HostDepthStoreConstants); + VkPipelineLayoutCreateInfo host_depth_store_pipeline_layout_create_info; + host_depth_store_pipeline_layout_create_info.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + host_depth_store_pipeline_layout_create_info.pNext = nullptr; + host_depth_store_pipeline_layout_create_info.flags = 0; + host_depth_store_pipeline_layout_create_info.setLayoutCount = + uint32_t(xe::countof(host_depth_store_descriptor_set_layouts)); + host_depth_store_pipeline_layout_create_info.pSetLayouts = + host_depth_store_descriptor_set_layouts; + host_depth_store_pipeline_layout_create_info.pushConstantRangeCount = 1; + host_depth_store_pipeline_layout_create_info.pPushConstantRanges = + &host_depth_store_push_constant_range; + if (dfn.vkCreatePipelineLayout( + device, &host_depth_store_pipeline_layout_create_info, nullptr, + &host_depth_store_pipeline_layout_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the host depth storing " + "pipeline layout"); + Shutdown(); + return false; + } + const std::pair host_depth_store_shaders[] = { + {shaders::host_depth_store_1xmsaa_cs, + sizeof(shaders::host_depth_store_1xmsaa_cs)}, + {shaders::host_depth_store_2xmsaa_cs, + sizeof(shaders::host_depth_store_2xmsaa_cs)}, + {shaders::host_depth_store_4xmsaa_cs, + sizeof(shaders::host_depth_store_4xmsaa_cs)}, + }; + for (size_t i = 0; i < xe::countof(host_depth_store_shaders); ++i) { + const std::pair host_depth_store_shader = + host_depth_store_shaders[i]; + VkPipeline host_depth_store_pipeline = + ui::vulkan::util::CreateComputePipeline( + provider, host_depth_store_pipeline_layout_, + host_depth_store_shader.first, host_depth_store_shader.second); + if (host_depth_store_pipeline == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the {}-sample host depth " + "storing pipeline", + uint32_t(1) << i); + Shutdown(); + return false; + } + host_depth_store_pipelines_[i] = host_depth_store_pipeline; + } + + // Transfer and clear vertex buffer, for quads of up to tile granularity. 
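+  // Sizing sketch for the vertex buffer pool page below: each rectangle is a
+  // quad of two triangles (6 vertices, as the * 6 factor suggests) with 2
+  // floats (X, Y) per vertex, i.e. 48 bytes, and in the worst case each of
+  // the 2048 EDRAM tiles may need its own cutout border rectangles, hence
+  // sizeof(float) * 2 * 6 * kMaxCutoutBorderRectangles * kEdramTileCount,
+  // never smaller than the pool's default page size due to the std::max.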
+ transfer_vertex_buffer_pool_ = + std::make_unique( + provider, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + std::max(ui::vulkan::VulkanUploadBufferPool::kDefaultPageSize, + sizeof(float) * 2 * 6 * + Transfer::kMaxCutoutBorderRectangles * + xenos::kEdramTileCount)); + + // Transfer vertex shader. + transfer_passthrough_vertex_shader_ = ui::vulkan::util::CreateShaderModule( + provider, shaders::passthrough_position_xy_vs, + sizeof(shaders::passthrough_position_xy_vs)); + if (transfer_passthrough_vertex_shader_ == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target ownership " + "transfer vertex shader"); + Shutdown(); + return false; + } + + // Transfer pipeline layouts. + VkDescriptorSetLayout transfer_pipeline_layout_descriptor_set_layouts + [kTransferUsedDescriptorSetCount]; + VkPushConstantRange transfer_pipeline_layout_push_constant_range; + transfer_pipeline_layout_push_constant_range.stageFlags = + VK_SHADER_STAGE_FRAGMENT_BIT; + transfer_pipeline_layout_push_constant_range.offset = 0; + VkPipelineLayoutCreateInfo transfer_pipeline_layout_create_info; + transfer_pipeline_layout_create_info.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + transfer_pipeline_layout_create_info.pNext = nullptr; + transfer_pipeline_layout_create_info.flags = 0; + transfer_pipeline_layout_create_info.pSetLayouts = + transfer_pipeline_layout_descriptor_set_layouts; + transfer_pipeline_layout_create_info.pPushConstantRanges = + &transfer_pipeline_layout_push_constant_range; + for (size_t i = 0; i < size_t(TransferPipelineLayoutIndex::kCount); ++i) { + const TransferPipelineLayoutInfo& transfer_pipeline_layout_info = + kTransferPipelineLayoutInfos[i]; + transfer_pipeline_layout_create_info.setLayoutCount = 0; + uint32_t transfer_pipeline_layout_descriptor_sets_remaining = + transfer_pipeline_layout_info.used_descriptor_sets; + uint32_t transfer_pipeline_layout_descriptor_set_index; + while ( + xe::bit_scan_forward(transfer_pipeline_layout_descriptor_sets_remaining, + &transfer_pipeline_layout_descriptor_set_index)) { + transfer_pipeline_layout_descriptor_sets_remaining &= + ~(uint32_t(1) << transfer_pipeline_layout_descriptor_set_index); + VkDescriptorSetLayout transfer_pipeline_layout_descriptor_set_layout = + VK_NULL_HANDLE; + switch (TransferUsedDescriptorSet( + transfer_pipeline_layout_descriptor_set_index)) { + case kTransferUsedDescriptorSetHostDepthBuffer: + transfer_pipeline_layout_descriptor_set_layout = + descriptor_set_layout_storage_buffer_; + break; + case kTransferUsedDescriptorSetHostDepthStencilTextures: + case kTransferUsedDescriptorSetDepthStencilTextures: + transfer_pipeline_layout_descriptor_set_layout = + descriptor_set_layout_sampled_image_x2_; + break; + case kTransferUsedDescriptorSetColorTexture: + transfer_pipeline_layout_descriptor_set_layout = + descriptor_set_layout_sampled_image_; + break; + default: + assert_unhandled_case(TransferUsedDescriptorSet( + transfer_pipeline_layout_descriptor_set_index)); + } + transfer_pipeline_layout_descriptor_set_layouts + [transfer_pipeline_layout_create_info.setLayoutCount++] = + transfer_pipeline_layout_descriptor_set_layout; + } + transfer_pipeline_layout_push_constant_range.size = uint32_t( + sizeof(uint32_t) * + xe::bit_count(transfer_pipeline_layout_info.used_push_constant_dwords)); + transfer_pipeline_layout_create_info.pushConstantRangeCount = + transfer_pipeline_layout_info.used_push_constant_dwords ? 
1 : 0; + if (dfn.vkCreatePipelineLayout( + device, &transfer_pipeline_layout_create_info, nullptr, + &transfer_pipeline_layouts_[i]) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target " + "ownership transfer pipeline layout {}", + i); + Shutdown(); + return false; + } + } + InitializeCommon(); return true; } @@ -45,6 +467,36 @@ void VulkanRenderTargetCache::Shutdown(bool from_destructor) { const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); VkDevice device = provider.device(); + for (const auto& transfer_pipeline_array_pair : transfer_pipelines_) { + for (VkPipeline transfer_pipeline : transfer_pipeline_array_pair.second) { + // May be null to prevent recreation attempts. + if (transfer_pipeline != VK_NULL_HANDLE) { + dfn.vkDestroyPipeline(device, transfer_pipeline, nullptr); + } + } + } + transfer_pipelines_.clear(); + for (const auto& transfer_shader_pair : transfer_shaders_) { + if (transfer_shader_pair.second != VK_NULL_HANDLE) { + dfn.vkDestroyShaderModule(device, transfer_shader_pair.second, nullptr); + } + } + transfer_shaders_.clear(); + for (size_t i = 0; i < size_t(TransferPipelineLayoutIndex::kCount); ++i) { + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipelineLayout, device, + transfer_pipeline_layouts_[i]); + } + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyShaderModule, device, + transfer_passthrough_vertex_shader_); + transfer_vertex_buffer_pool_.reset(); + + for (size_t i = 0; i < xe::countof(host_depth_store_pipelines_); ++i) { + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipeline, device, + host_depth_store_pipelines_[i]); + } + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipelineLayout, device, + host_depth_store_pipeline_layout_); + last_update_framebuffer_ = VK_NULL_HANDLE; for (const auto& framebuffer_pair : framebuffers_) { dfn.vkDestroyFramebuffer(device, framebuffer_pair.second.framebuffer, @@ -54,10 +506,32 @@ void VulkanRenderTargetCache::Shutdown(bool from_destructor) { last_update_render_pass_ = VK_NULL_HANDLE; for (const auto& render_pass_pair : render_passes_) { - dfn.vkDestroyRenderPass(device, render_pass_pair.second, nullptr); + if (render_pass_pair.second != VK_NULL_HANDLE) { + dfn.vkDestroyRenderPass(device, render_pass_pair.second, nullptr); + } } render_passes_.clear(); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorPool, device, + edram_storage_buffer_descriptor_pool_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyBuffer, device, + edram_buffer_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkFreeMemory, device, + edram_buffer_memory_); + + descriptor_set_pool_sampled_image_x2_.reset(); + descriptor_set_pool_sampled_image_.reset(); + + ui::vulkan::util::DestroyAndNullHandle( + dfn.vkDestroyDescriptorSetLayout, device, + descriptor_set_layout_sampled_image_x2_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorSetLayout, + device, + descriptor_set_layout_sampled_image_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorSetLayout, + device, + descriptor_set_layout_storage_buffer_); + if (!from_destructor) { ShutdownCommon(); } @@ -87,6 +561,19 @@ void VulkanRenderTargetCache::ClearCache() { RenderTargetCache::ClearCache(); } +void VulkanRenderTargetCache::CompletedSubmissionUpdated() { + if (transfer_vertex_buffer_pool_) { + transfer_vertex_buffer_pool_->Reclaim( + command_processor_.GetCompletedSubmission()); + } +} + +void VulkanRenderTargetCache::EndSubmission() { + if (transfer_vertex_buffer_pool_) { + 
transfer_vertex_buffer_pool_->FlushWrites(); + } +} + bool VulkanRenderTargetCache::Update(bool is_rasterization_done, uint32_t shader_writes_color_targets) { if (!RenderTargetCache::Update(is_rasterization_done, @@ -94,9 +581,16 @@ bool VulkanRenderTargetCache::Update(bool is_rasterization_done, return false; } - auto rb_surface_info = register_file().Get(); + // TODO(Triang3l): All paths (FSI). + RenderTarget* const* depth_and_color_render_targets = last_update_accumulated_render_targets(); + + PerformTransfersAndResolveClears(1 + xenos::kMaxColorRenderTargets, + depth_and_color_render_targets, + last_update_transfers()); + + auto rb_surface_info = register_file().Get(); uint32_t render_targets_are_srgb = gamma_render_target_as_srgb_ ? last_update_accumulated_color_targets_are_gamma() @@ -104,7 +598,6 @@ bool VulkanRenderTargetCache::Update(bool is_rasterization_done, RenderPassKey render_pass_key; render_pass_key.msaa_samples = rb_surface_info.msaa_samples; - // TODO(Triang3l): 2x MSAA as 4x. if (depth_and_color_render_targets[0]) { render_pass_key.depth_and_color_used |= 1 << 0; render_pass_key.depth_format = @@ -220,9 +713,9 @@ VkRenderPass VulkanRenderTargetCache::GetRenderPass(RenderPassKey key) { samples = VK_SAMPLE_COUNT_1_BIT; break; case xenos::MsaaSamples::k2X: - // Using unconditionally because if 2x is emulated as 4x, the key will - // also contain 4x. - samples = VK_SAMPLE_COUNT_2_BIT; + samples = IsMsaa2xSupported(key.depth_and_color_used != 0) + ? VK_SAMPLE_COUNT_2_BIT + : VK_SAMPLE_COUNT_4_BIT; break; case xenos::MsaaSamples::k4X: samples = VK_SAMPLE_COUNT_4_BIT; @@ -264,7 +757,11 @@ VkRenderPass VulkanRenderTargetCache::GetRenderPass(RenderPassKey key) { color_attachment.attachment = attachment_index; VkAttachmentDescription& attachment = attachments[attachment_index]; attachment.flags = 0; - attachment.format = GetColorVulkanFormat(color_formats[i]); + xenos::ColorRenderTargetFormat color_format = color_formats[i]; + attachment.format = + key.color_rts_use_transfer_formats + ? GetColorOwnershipTransferVulkanFormat(color_format) + : GetColorVulkanFormat(color_format); attachment.samples = samples; attachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; @@ -340,7 +837,8 @@ VkRenderPass VulkanRenderTargetCache::GetRenderPass(RenderPassKey key) { VkRenderPass render_pass; if (dfn.vkCreateRenderPass(device, &render_pass_create_info, nullptr, &render_pass) != VK_SUCCESS) { - XELOGE("Failed to create a Vulkan render pass"); + XELOGE("VulkanRenderTargetCache: Failed to create a render pass"); + render_passes_.emplace(key.key, VK_NULL_HANDLE); return VK_NULL_HANDLE; } render_passes_.emplace(key.key, render_pass); @@ -419,8 +917,15 @@ VkFormat VulkanRenderTargetCache::GetColorOwnershipTransferVulkanFormat( } VulkanRenderTargetCache::VulkanRenderTarget::~VulkanRenderTarget() { - const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider_.dfn(); - VkDevice device = provider_.device(); + const ui::vulkan::VulkanProvider& provider = + render_target_cache_.command_processor_.GetVulkanProvider(); + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + ui::vulkan::SingleLayoutDescriptorSetPool& descriptor_set_pool = + key().is_depth + ? 
*render_target_cache_.descriptor_set_pool_sampled_image_x2_ + : *render_target_cache_.descriptor_set_pool_sampled_image_; + descriptor_set_pool.Free(descriptor_set_index_transfer_source_); if (view_color_transfer_separate_ != VK_NULL_HANDLE) { dfn.vkDestroyImageView(device, view_color_transfer_separate_, nullptr); } @@ -464,16 +969,20 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( image_create_info.pNext = nullptr; image_create_info.flags = 0; image_create_info.imageType = VK_IMAGE_TYPE_2D; - // TODO(Triang3l): Resolution scaling. - image_create_info.extent.width = key.GetWidth(); + image_create_info.extent.width = key.GetWidth() * resolution_scale_x_; image_create_info.extent.height = - GetRenderTargetHeight(key.pitch_tiles_at_32bpp, key.msaa_samples); + GetRenderTargetHeight(key.pitch_tiles_at_32bpp, key.msaa_samples) * + resolution_scale_y_; image_create_info.extent.depth = 1; image_create_info.mipLevels = 1; image_create_info.arrayLayers = 1; - // TODO(Triang3l): 2x MSAA as 4x. - image_create_info.samples = - VkSampleCountFlagBits(uint32_t(1) << uint32_t(key.msaa_samples)); + if (key.msaa_samples == xenos::MsaaSamples::k2X && + !msaa_2x_attachments_supported_) { + image_create_info.samples = VK_SAMPLE_COUNT_4_BIT; + } else { + image_create_info.samples = + VkSampleCountFlagBits(uint32_t(1) << uint32_t(key.msaa_samples)); + } image_create_info.tiling = VK_IMAGE_TILING_OPTIMAL; image_create_info.usage = VK_IMAGE_USAGE_SAMPLED_BIT; image_create_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; @@ -509,7 +1018,11 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( if (!ui::vulkan::util::CreateDedicatedAllocationImage( provider, image_create_info, ui::vulkan::util::MemoryPurpose::kDeviceLocal, image, memory)) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create a {}x{} {}xMSAA {} render target " + "image", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), key.GetFormatName()); return nullptr; } @@ -532,7 +1045,12 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( VkImageView view_depth_color; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_depth_color) != VK_SUCCESS) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create a {} view for a {}x{} {}xMSAA {} " + "render target", + key.is_depth ? "depth" : "color", image_create_info.extent.width, + image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), key.GetFormatName()); dfn.vkDestroyImage(device, image, nullptr); dfn.vkFreeMemory(device, memory, nullptr); return nullptr; @@ -546,7 +1064,12 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_depth_stencil) != VK_SUCCESS) { - // TODO(Triang3l): Error message. 
+ XELOGE( + "VulkanRenderTarget: Failed to create a depth / stencil view for a " + "{}x{} {}xMSAA {} render target", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), + xenos::GetDepthRenderTargetFormatName(key.GetDepthFormat())); dfn.vkDestroyImageView(device, view_depth_color, nullptr); dfn.vkDestroyImage(device, image, nullptr); dfn.vkFreeMemory(device, memory, nullptr); @@ -555,7 +1078,12 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( view_create_info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_stencil) != VK_SUCCESS) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create a stencil view for a {}x{} " + "{}xMSAA render target", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), + xenos::GetDepthRenderTargetFormatName(key.GetDepthFormat())); dfn.vkDestroyImageView(device, view_depth_stencil, nullptr); dfn.vkDestroyImageView(device, view_depth_color, nullptr); dfn.vkDestroyImage(device, image, nullptr); @@ -567,7 +1095,12 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( view_create_info.format = VK_FORMAT_R8G8B8A8_SRGB; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_srgb) != VK_SUCCESS) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create an sRGB view for a {}x{} " + "{}xMSAA render target", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), + xenos::GetColorRenderTargetFormatName(key.GetColorFormat())); dfn.vkDestroyImageView(device, view_depth_color, nullptr); dfn.vkDestroyImage(device, image, nullptr); dfn.vkFreeMemory(device, memory, nullptr); @@ -578,7 +1111,11 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( view_create_info.format = transfer_format; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_color_transfer_separate) != VK_SUCCESS) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create a transfer view for a {}x{} " + "{}xMSAA {} render target", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), key.GetFormatName()); if (view_srgb != VK_NULL_HANDLE) { dfn.vkDestroyImageView(device, view_srgb, nullptr); } @@ -590,11 +1127,170 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( } } - VkImageView view_transfer_separate = VK_NULL_HANDLE; + ui::vulkan::SingleLayoutDescriptorSetPool& descriptor_set_pool = + key.is_depth ? *descriptor_set_pool_sampled_image_x2_ + : *descriptor_set_pool_sampled_image_; + size_t descriptor_set_index_transfer_source = descriptor_set_pool.Allocate(); + if (descriptor_set_index_transfer_source == SIZE_MAX) { + XELOGE( + "VulkanRenderTargetCache: Failed to allocate sampled image descriptors " + "for a {} render target", + key.is_depth ? 
"depth/stencil" : "color"); + if (view_color_transfer_separate != VK_NULL_HANDLE) { + dfn.vkDestroyImageView(device, view_color_transfer_separate, nullptr); + } + if (view_srgb != VK_NULL_HANDLE) { + dfn.vkDestroyImageView(device, view_srgb, nullptr); + } + dfn.vkDestroyImageView(device, view_depth_color, nullptr); + dfn.vkDestroyImage(device, image, nullptr); + dfn.vkFreeMemory(device, memory, nullptr); + return nullptr; + } + VkDescriptorSet descriptor_set_transfer_source = + descriptor_set_pool.Get(descriptor_set_index_transfer_source); + VkWriteDescriptorSet descriptor_set_write[2]; + VkDescriptorImageInfo descriptor_set_write_depth_color; + descriptor_set_write_depth_color.sampler = VK_NULL_HANDLE; + descriptor_set_write_depth_color.imageView = + view_color_transfer_separate != VK_NULL_HANDLE + ? view_color_transfer_separate + : view_depth_color; + descriptor_set_write_depth_color.imageLayout = + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + descriptor_set_write[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptor_set_write[0].pNext = nullptr; + descriptor_set_write[0].dstSet = descriptor_set_transfer_source; + descriptor_set_write[0].dstBinding = 0; + descriptor_set_write[0].dstArrayElement = 0; + descriptor_set_write[0].descriptorCount = 1; + descriptor_set_write[0].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + descriptor_set_write[0].pImageInfo = &descriptor_set_write_depth_color; + descriptor_set_write[0].pBufferInfo = nullptr; + descriptor_set_write[0].pTexelBufferView = nullptr; + VkDescriptorImageInfo descriptor_set_write_stencil; + if (key.is_depth) { + descriptor_set_write_stencil.sampler = VK_NULL_HANDLE; + descriptor_set_write_stencil.imageView = view_stencil; + descriptor_set_write_stencil.imageLayout = + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + descriptor_set_write[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptor_set_write[1].pNext = nullptr; + descriptor_set_write[1].dstSet = descriptor_set_transfer_source; + descriptor_set_write[1].dstBinding = 1; + descriptor_set_write[1].dstArrayElement = 0; + descriptor_set_write[1].descriptorCount = 1; + descriptor_set_write[1].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + descriptor_set_write[1].pImageInfo = &descriptor_set_write_stencil; + descriptor_set_write[1].pBufferInfo = nullptr; + descriptor_set_write[1].pTexelBufferView = nullptr; + } + dfn.vkUpdateDescriptorSets(device, key.is_depth ? 
2 : 1, descriptor_set_write, + 0, nullptr); - return new VulkanRenderTarget(key, provider, image, memory, view_depth_color, + return new VulkanRenderTarget(key, *this, image, memory, view_depth_color, view_depth_stencil, view_stencil, view_srgb, - view_color_transfer_separate); + view_color_transfer_separate, + descriptor_set_index_transfer_source); +} + +void VulkanRenderTargetCache::GetEdramBufferUsageMasks( + EdramBufferUsage usage, VkPipelineStageFlags& stage_mask_out, + VkAccessFlags& access_mask_out) { + switch (usage) { + case EdramBufferUsage::kFragmentRead: + stage_mask_out = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + access_mask_out = VK_ACCESS_SHADER_READ_BIT; + break; + case EdramBufferUsage::kFragmentReadWrite: + stage_mask_out = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + access_mask_out = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + break; + case EdramBufferUsage::kComputeRead: + stage_mask_out = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + access_mask_out = VK_ACCESS_SHADER_READ_BIT; + break; + case EdramBufferUsage::kComputeWrite: + stage_mask_out = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + access_mask_out = VK_ACCESS_SHADER_WRITE_BIT; + break; + case EdramBufferUsage::kTransferRead: + stage_mask_out = VK_PIPELINE_STAGE_TRANSFER_BIT; + access_mask_out = VK_ACCESS_TRANSFER_READ_BIT; + break; + case EdramBufferUsage::kTransferWrite: + stage_mask_out = VK_PIPELINE_STAGE_TRANSFER_BIT; + access_mask_out = VK_ACCESS_TRANSFER_WRITE_BIT; + break; + default: + assert_unhandled_case(usage); + } +} + +void VulkanRenderTargetCache::UseEdramBuffer(EdramBufferUsage new_usage) { + if (edram_buffer_usage_ == new_usage) { + return; + } + VkPipelineStageFlags src_stage_mask, dst_stage_mask; + VkAccessFlags src_access_mask, dst_access_mask; + GetEdramBufferUsageMasks(edram_buffer_usage_, src_stage_mask, + src_access_mask); + GetEdramBufferUsageMasks(new_usage, dst_stage_mask, dst_access_mask); + if (command_processor_.PushBufferMemoryBarrier( + edram_buffer_, 0, VK_WHOLE_SIZE, src_stage_mask, dst_stage_mask, + src_access_mask, dst_access_mask)) { + // Resetting edram_buffer_modification_status_ only if the barrier has been + // truly inserted. + edram_buffer_modification_status_ = + EdramBufferModificationStatus::kUnmodified; + } + edram_buffer_usage_ = new_usage; +} + +void VulkanRenderTargetCache::MarkEdramBufferModified( + EdramBufferModificationStatus modification_status) { + assert_true(modification_status != + EdramBufferModificationStatus::kUnmodified); + switch (edram_buffer_usage_) { + case EdramBufferUsage::kFragmentReadWrite: + // max because being modified via unordered access requires stricter + // synchronization than via fragment shader interlocks. + edram_buffer_modification_status_ = + std::max(edram_buffer_modification_status_, modification_status); + break; + case EdramBufferUsage::kComputeWrite: + assert_true(modification_status == + EdramBufferModificationStatus::kViaUnordered); + modification_status = EdramBufferModificationStatus::kViaUnordered; + break; + default: + assert_always( + "While changing the usage of the EDRAM buffer before marking it as " + "modified is handled safely (but will cause spurious marking as " + "modified after the changes have been implicitly committed by the " + "usage switch), normally that shouldn't be done and is an " + "indication of architectural mistakes. 
Alternatively, this may " + "indicate that the usage switch has been forgotten before writing, " + "which is a clearly invalid situation."); + } +} + +void VulkanRenderTargetCache::CommitEdramBufferShaderWrites( + EdramBufferModificationStatus commit_status) { + assert_true(commit_status != EdramBufferModificationStatus::kUnmodified); + if (edram_buffer_modification_status_ < commit_status) { + return; + } + VkPipelineStageFlags stage_mask; + VkAccessFlags access_mask; + GetEdramBufferUsageMasks(edram_buffer_usage_, stage_mask, access_mask); + assert_not_zero(access_mask & VK_ACCESS_SHADER_WRITE_BIT); + command_processor_.PushBufferMemoryBarrier( + edram_buffer_, 0, VK_WHOLE_SIZE, stage_mask, stage_mask, access_mask, + access_mask, VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED, false); + edram_buffer_modification_status_ = + EdramBufferModificationStatus::kUnmodified; + PixelShaderInterlockFullEdramBarrierPlaced(); } const VulkanRenderTargetCache::Framebuffer* @@ -646,8 +1342,15 @@ VulkanRenderTargetCache::GetFramebuffer( depth_and_color_rts_remaining &= ~(uint32_t(1) << rt_index); const auto& vulkan_rt = *static_cast( depth_and_color_render_targets[rt_index]); - attachments[attachment_count++] = rt_index ? vulkan_rt.view_depth_color() - : vulkan_rt.view_depth_stencil(); + VkImageView attachment; + if (rt_index) { + attachment = render_pass_key.color_rts_use_transfer_formats + ? vulkan_rt.view_color_transfer() + : vulkan_rt.view_depth_color(); + } else { + attachment = vulkan_rt.view_depth_stencil(); + } + attachments[attachment_count++] = attachment; } VkFramebufferCreateInfo framebuffer_create_info; @@ -684,6 +1387,3491 @@ VulkanRenderTargetCache::GetFramebuffer( .first->second; } +VkShaderModule VulkanRenderTargetCache::GetTransferShader( + TransferShaderKey key) { + auto shader_it = transfer_shaders_.find(key); + if (shader_it != transfer_shaders_.end()) { + return shader_it->second; + } + + const ui::vulkan::VulkanProvider& provider = + command_processor_.GetVulkanProvider(); + const VkPhysicalDeviceFeatures& device_features = provider.device_features(); + + std::vector id_vector_temp; + std::vector uint_vector_temp; + + spv::Builder builder(spv::Spv_1_0, + (SpirvShaderTranslator::kSpirvMagicToolId << 16) | 1, + nullptr); + spv::Id ext_inst_glsl_std_450 = builder.import("GLSL.std.450"); + builder.addCapability(spv::CapabilityShader); + builder.setMemoryModel(spv::AddressingModelLogical, spv::MemoryModelGLSL450); + builder.setSource(spv::SourceLanguageUnknown, 0); + + spv::Id type_void = builder.makeVoidType(); + spv::Id type_bool = builder.makeBoolType(); + spv::Id type_int = builder.makeIntType(32); + spv::Id type_int2 = builder.makeVectorType(type_int, 2); + spv::Id type_uint = builder.makeUintType(32); + spv::Id type_uint2 = builder.makeVectorType(type_uint, 2); + spv::Id type_uint4 = builder.makeVectorType(type_uint, 4); + spv::Id type_float = builder.makeFloatType(32); + spv::Id type_float2 = builder.makeVectorType(type_float, 2); + spv::Id type_float4 = builder.makeVectorType(type_float, 4); + + const TransferModeInfo& mode = kTransferModes[size_t(key.mode)]; + const TransferPipelineLayoutInfo& pipeline_layout_info = + kTransferPipelineLayoutInfos[size_t(mode.pipeline_layout)]; + + // If not dest_is_color, it's depth, or stencil bit - 40-sample columns are + // swapped as opposed to color source. 
+ bool dest_is_color = (mode.output == TransferOutput::kColor); + xenos::ColorRenderTargetFormat dest_color_format = + xenos::ColorRenderTargetFormat(key.dest_resource_format); + xenos::DepthRenderTargetFormat dest_depth_format = + xenos::DepthRenderTargetFormat(key.dest_resource_format); + bool dest_is_64bpp = + dest_is_color && xenos::IsColorRenderTargetFormat64bpp(dest_color_format); + + xenos::ColorRenderTargetFormat source_color_format = + xenos::ColorRenderTargetFormat(key.source_resource_format); + xenos::DepthRenderTargetFormat source_depth_format = + xenos::DepthRenderTargetFormat(key.source_resource_format); + // If not source_is_color, it's depth / stencil - 40-sample columns are + // swapped as opposed to color destination. + bool source_is_color = (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetColorTextureBit) != 0; + bool source_is_64bpp; + uint32_t source_color_format_component_count; + uint32_t source_color_texture_component_mask; + bool source_color_is_uint; + spv::Id source_color_component_type; + if (source_is_color) { + assert_zero(pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetDepthStencilTexturesBit); + source_is_64bpp = + xenos::IsColorRenderTargetFormat64bpp(source_color_format); + source_color_format_component_count = + xenos::GetColorRenderTargetFormatComponentCount(source_color_format); + if (mode.output == TransferOutput::kStencilBit) { + if (source_is_64bpp && !dest_is_64bpp) { + // Need one component, but choosing from the two 32bpp halves of the + // 64bpp sample. + source_color_texture_component_mask = + 0b1 | (0b1 << (source_color_format_component_count >> 1)); + } else { + // Red is at least 8 bits per component in all formats. + source_color_texture_component_mask = 0b1; + } + } else { + source_color_texture_component_mask = + (uint32_t(1) << source_color_format_component_count) - 1; + } + GetColorOwnershipTransferVulkanFormat(source_color_format, + &source_color_is_uint); + source_color_component_type = source_color_is_uint ? type_uint : type_float; + } else { + source_is_64bpp = false; + source_color_format_component_count = 0; + source_color_texture_component_mask = 0; + source_color_is_uint = false; + source_color_component_type = spv::NoType; + } + + std::vector main_interface; + + // Outputs. + bool shader_uses_stencil_reference_output = + mode.output == TransferOutput::kDepth && + provider.device_extensions().ext_shader_stencil_export; + bool dest_color_is_uint = false; + uint32_t dest_color_component_count = 0; + spv::Id type_fragment_data_component = spv::NoResult; + spv::Id type_fragment_data = spv::NoResult; + spv::Id output_fragment_data = spv::NoResult; + spv::Id output_fragment_depth = spv::NoResult; + spv::Id output_fragment_stencil_ref = spv::NoResult; + switch (mode.output) { + case TransferOutput::kColor: + GetColorOwnershipTransferVulkanFormat(dest_color_format, + &dest_color_is_uint); + dest_color_component_count = + xenos::GetColorRenderTargetFormatComponentCount(dest_color_format); + type_fragment_data_component = + dest_color_is_uint ? type_uint : type_float; + type_fragment_data = + dest_color_component_count > 1 + ? 
builder.makeVectorType(type_fragment_data_component, + dest_color_component_count) + : type_fragment_data_component; + output_fragment_data = builder.createVariable( + spv::NoPrecision, spv::StorageClassOutput, type_fragment_data, + "xe_transfer_fragment_data"); + builder.addDecoration(output_fragment_data, spv::DecorationLocation, + key.dest_color_rt_index); + main_interface.push_back(output_fragment_data); + break; + case TransferOutput::kDepth: + output_fragment_depth = + builder.createVariable(spv::NoPrecision, spv::StorageClassOutput, + type_float, "gl_FragDepth"); + builder.addDecoration(output_fragment_depth, spv::DecorationBuiltIn, + spv::BuiltInFragDepth); + main_interface.push_back(output_fragment_depth); + if (shader_uses_stencil_reference_output) { + builder.addExtension("SPV_EXT_shader_stencil_export"); + builder.addCapability(spv::CapabilityStencilExportEXT); + output_fragment_stencil_ref = + builder.createVariable(spv::NoPrecision, spv::StorageClassOutput, + type_int, "gl_FragStencilRefARB"); + builder.addDecoration(output_fragment_stencil_ref, + spv::DecorationBuiltIn, + spv::BuiltInFragStencilRefEXT); + main_interface.push_back(output_fragment_stencil_ref); + } + break; + default: + break; + } + + // Bindings. + // Generating SPIR-V 1.0, no need to add bindings to the entry point's + // interface until SPIR-V 1.4. + // Color source. + bool source_is_multisampled = + key.source_msaa_samples != xenos::MsaaSamples::k1X; + spv::Id source_color_texture = spv::NoResult; + if (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetColorTextureBit) { + source_color_texture = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniformConstant, + builder.makeImageType(source_color_component_type, spv::Dim2D, false, + false, source_is_multisampled, 1, + spv::ImageFormatUnknown), + "xe_transfer_color"); + builder.addDecoration( + source_color_texture, spv::DecorationDescriptorSet, + xe::bit_count(pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetColorTextureBit - 1))); + builder.addDecoration(source_color_texture, spv::DecorationBinding, 0); + } + // Depth / stencil source. + spv::Id source_depth_texture = spv::NoResult; + spv::Id source_stencil_texture = spv::NoResult; + if (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetDepthStencilTexturesBit) { + uint32_t source_depth_stencil_descriptor_set = + xe::bit_count(pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetDepthStencilTexturesBit - 1)); + // Using `depth == false` in makeImageType because comparisons are not + // required, and other values of `depth` are causing issues in drivers. 
+ // https://github.com/microsoft/DirectXShaderCompiler/issues/1107 + if (mode.output != TransferOutput::kStencilBit) { + source_depth_texture = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniformConstant, + builder.makeImageType(type_float, spv::Dim2D, false, false, + source_is_multisampled, 1, + spv::ImageFormatUnknown), + "xe_transfer_depth"); + builder.addDecoration(source_depth_texture, spv::DecorationDescriptorSet, + source_depth_stencil_descriptor_set); + builder.addDecoration(source_depth_texture, spv::DecorationBinding, 0); + } + if (mode.output != TransferOutput::kDepth || + shader_uses_stencil_reference_output) { + source_stencil_texture = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniformConstant, + builder.makeImageType(type_uint, spv::Dim2D, false, false, + source_is_multisampled, 1, + spv::ImageFormatUnknown), + "xe_transfer_stencil"); + builder.addDecoration(source_stencil_texture, + spv::DecorationDescriptorSet, + source_depth_stencil_descriptor_set); + builder.addDecoration(source_stencil_texture, spv::DecorationBinding, 1); + } + } + // Host depth source buffer. + spv::Id host_depth_source_buffer = spv::NoResult; + if (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetHostDepthBufferBit) { + id_vector_temp.clear(); + id_vector_temp.push_back(builder.makeRuntimeArray(type_uint)); + // Storage buffers have std430 packing, no padding to 4-component vectors. + builder.addDecoration(id_vector_temp.back(), spv::DecorationArrayStride, + sizeof(float)); + spv::Id type_host_depth_source_buffer = + builder.makeStructType(id_vector_temp, "XeTransferHostDepthBuffer"); + builder.addMemberName(type_host_depth_source_buffer, 0, "host_depth"); + builder.addMemberDecoration(type_host_depth_source_buffer, 0, + spv::DecorationNonWritable); + builder.addMemberDecoration(type_host_depth_source_buffer, 0, + spv::DecorationOffset, 0); + // Block since SPIR-V 1.3, but since SPIR-V 1.0 is generated, it's + // BufferBlock. + builder.addDecoration(type_host_depth_source_buffer, + spv::DecorationBufferBlock); + // StorageBuffer since SPIR-V 1.3, but since SPIR-V 1.0 is generated, it's + // Uniform. + host_depth_source_buffer = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniform, + type_host_depth_source_buffer, "xe_transfer_host_depth_buffer"); + builder.addDecoration( + host_depth_source_buffer, spv::DecorationDescriptorSet, + xe::bit_count(pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetHostDepthBufferBit - 1))); + builder.addDecoration(host_depth_source_buffer, spv::DecorationBinding, 0); + } + // Host depth source texture (the depth / stencil descriptor set is reused, + // but stencil is not needed). + spv::Id host_depth_source_texture = spv::NoResult; + if (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetHostDepthStencilTexturesBit) { + host_depth_source_texture = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniformConstant, + builder.makeImageType( + type_float, spv::Dim2D, false, false, + key.host_depth_source_msaa_samples != xenos::MsaaSamples::k1X, 1, + spv::ImageFormatUnknown), + "xe_transfer_host_depth"); + builder.addDecoration( + host_depth_source_texture, spv::DecorationDescriptorSet, + xe::bit_count( + pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetHostDepthStencilTexturesBit - 1))); + builder.addDecoration(host_depth_source_texture, spv::DecorationBinding, 0); + } + // Push constants. 
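+  // The push constant block is tightly packed: a member is added only for
+  // the dwords this pipeline layout uses, at an offset of sizeof(uint32_t)
+  // times the number of used dwords below it. Illustrative example (assuming
+  // the address dword bit is below the stencil mask bit, as the member order
+  // here suggests): a layout using only the address and the stencil mask
+  // gets `address` at offset 0 and `stencil_mask` at offset 4.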
+ id_vector_temp.clear(); + uint32_t push_constants_member_host_depth_address = UINT32_MAX; + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordHostDepthAddressBit) { + push_constants_member_host_depth_address = uint32_t(id_vector_temp.size()); + id_vector_temp.push_back(type_uint); + } + uint32_t push_constants_member_address = UINT32_MAX; + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordAddressBit) { + push_constants_member_address = uint32_t(id_vector_temp.size()); + id_vector_temp.push_back(type_uint); + } + uint32_t push_constants_member_stencil_mask = UINT32_MAX; + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordStencilMaskBit) { + push_constants_member_stencil_mask = uint32_t(id_vector_temp.size()); + id_vector_temp.push_back(type_uint); + } + spv::Id push_constants = spv::NoResult; + if (!id_vector_temp.empty()) { + spv::Id type_push_constants = + builder.makeStructType(id_vector_temp, "XeTransferPushConstants"); + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordHostDepthAddressBit) { + assert_true(push_constants_member_host_depth_address != UINT32_MAX); + builder.addMemberName(type_push_constants, + push_constants_member_host_depth_address, + "host_depth_address"); + builder.addMemberDecoration( + type_push_constants, push_constants_member_host_depth_address, + spv::DecorationOffset, + sizeof(uint32_t) * + xe::bit_count( + pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordHostDepthAddressBit - 1))); + } + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordAddressBit) { + assert_true(push_constants_member_address != UINT32_MAX); + builder.addMemberName(type_push_constants, push_constants_member_address, + "address"); + builder.addMemberDecoration( + type_push_constants, push_constants_member_address, + spv::DecorationOffset, + sizeof(uint32_t) * + xe::bit_count(pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordAddressBit - 1))); + } + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordStencilMaskBit) { + assert_true(push_constants_member_stencil_mask != UINT32_MAX); + builder.addMemberName(type_push_constants, + push_constants_member_stencil_mask, "stencil_mask"); + builder.addMemberDecoration( + type_push_constants, push_constants_member_stencil_mask, + spv::DecorationOffset, + sizeof(uint32_t) * + xe::bit_count( + pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordStencilMaskBit - 1))); + } + builder.addDecoration(type_push_constants, spv::DecorationBlock); + push_constants = builder.createVariable( + spv::NoPrecision, spv::StorageClassPushConstant, type_push_constants, + "xe_transfer_push_constants"); + } + + // Coordinate inputs. + spv::Id input_fragment_coord = builder.createVariable( + spv::NoPrecision, spv::StorageClassInput, type_float4, "gl_FragCoord"); + builder.addDecoration(input_fragment_coord, spv::DecorationBuiltIn, + spv::BuiltInFragCoord); + main_interface.push_back(input_fragment_coord); + spv::Id input_sample_id = spv::NoResult; + spv::Id spec_const_sample_id = spv::NoResult; + if (key.dest_msaa_samples != xenos::MsaaSamples::k1X) { + if (device_features.sampleRateShading) { + // One draw for all samples. 
+ builder.addCapability(spv::CapabilitySampleRateShading); + input_sample_id = builder.createVariable( + spv::NoPrecision, spv::StorageClassInput, type_int, "gl_SampleID"); + builder.addDecoration(input_sample_id, spv::DecorationFlat); + builder.addDecoration(input_sample_id, spv::DecorationBuiltIn, + spv::BuiltInSampleId); + main_interface.push_back(input_sample_id); + } else { + // One sample per draw, with different sample masks. + spec_const_sample_id = builder.makeUintConstant(0, true); + builder.addName(spec_const_sample_id, "xe_transfer_sample_id"); + builder.addDecoration(spec_const_sample_id, spv::DecorationSpecId, 0); + } + } + + // Begin the main function. + std::vector main_param_types; + std::vector> main_precisions; + spv::Block* main_entry; + spv::Function* main_function = + builder.makeFunctionEntry(spv::NoPrecision, type_void, "main", + main_param_types, main_precisions, &main_entry); + + // Working with unsigned numbers for simplicity now, bitcasting to signed will + // be done at texture fetch. + + uint32_t tile_width_samples_scaled = + xenos::kEdramTileWidthSamples * resolution_scale_x_; + uint32_t tile_height_samples_scaled = + xenos::kEdramTileHeightSamples * resolution_scale_y_; + + // Convert the fragment coordinates to uint2. + uint_vector_temp.clear(); + uint_vector_temp.reserve(2); + uint_vector_temp.push_back(0); + uint_vector_temp.push_back(1); + spv::Id dest_pixel_coord = builder.createUnaryOp( + spv::OpConvertFToU, type_uint2, + builder.createRvalueSwizzle( + spv::NoPrecision, type_float2, + builder.createLoad(input_fragment_coord, spv::NoPrecision), + uint_vector_temp)); + + // Prove to the AMD compiler that 24*24 multiplication can be done. 16 bits + // are more than enough for coordinates even with 3x resolution scaling (and + // Direct3D 11 hardware has 16.8 fixed-point coordinates). + // TODO(Triang3l): OpUnreachable if the coordinates have upper bits set. + + // Split the destination pixel coordinate into scalars. + spv::Id dest_pixel_x = + builder.createCompositeExtract(dest_pixel_coord, type_uint, 0); + spv::Id dest_pixel_y = + builder.createCompositeExtract(dest_pixel_coord, type_uint, 1); + + // Split the destination pixel index into 32bpp tile and 32bpp-tile-relative + // pixel index. + uint32_t dest_sample_width_log2 = + uint32_t(dest_is_64bpp) + + uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X); + uint32_t dest_sample_height_log2 = + uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X); + uint32_t dest_tile_width_divide_scale, dest_tile_width_divide_shift; + draw_util::GetEdramTileWidthDivideScaleAndUpperShift( + resolution_scale_x_, dest_tile_width_divide_scale, + dest_tile_width_divide_shift); + // Doing 16*16=32 multiplication, not 32*32=64. + // TODO(Triang3l): Abstract this away, don't do 32*32 on Direct3D 12 too. + dest_tile_width_divide_scale &= UINT16_MAX; + dest_tile_width_divide_shift += 16; + // Need the host tile size in pixels, not samples. 
+ dest_tile_width_divide_shift -= dest_sample_width_log2; + spv::Id dest_tile_index_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, dest_pixel_x, + builder.makeUintConstant(dest_tile_width_divide_scale)), + builder.makeUintConstant(dest_tile_width_divide_shift)); + spv::Id dest_tile_pixel_x = builder.createBinOp( + spv::OpISub, type_uint, dest_pixel_x, + builder.createBinOp(spv::OpIMul, type_uint, dest_tile_index_x, + builder.makeUintConstant(tile_width_samples_scaled >> + dest_sample_width_log2))); + spv::Id dest_tile_index_y, dest_tile_pixel_y; + if (resolution_scale_y_ == 3) { + dest_tile_index_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, dest_pixel_y, + builder.makeUintConstant(draw_util::kDivideScale3 & UINT16_MAX)), + builder.makeUintConstant(draw_util::kDivideUpperShift3 + 16 + 4 - + dest_sample_height_log2)); + dest_tile_pixel_y = builder.createBinOp( + spv::OpISub, type_uint, dest_pixel_y, + builder.createBinOp( + spv::OpIMul, type_uint, dest_tile_index_y, + builder.makeUintConstant(tile_height_samples_scaled >> + dest_sample_height_log2))); + } else { + assert_true(resolution_scale_y_ <= 2); + uint32_t dest_tile_height_pixels_log2 = + (resolution_scale_y_ == 2 ? 5 : 4) - dest_sample_height_log2; + dest_tile_index_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_pixel_y, + builder.makeUintConstant(dest_tile_height_pixels_log2)); + dest_tile_pixel_y = builder.createBinOp( + spv::OpBitwiseAnd, type_uint, dest_pixel_y, + builder.makeUintConstant((uint32_t(1) << dest_tile_height_pixels_log2) - + 1)); + } + + assert_true(push_constants_member_address != UINT32_MAX); + id_vector_temp.clear(); + id_vector_temp.push_back( + builder.makeIntConstant(int32_t(push_constants_member_address))); + spv::Id address_constant = builder.createLoad( + builder.createAccessChain(spv::StorageClassPushConstant, push_constants, + id_vector_temp), + spv::NoPrecision); + + // Calculate the 32bpp tile index from its X and Y parts. + spv::Id dest_tile_index = builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, address_constant, + builder.makeUintConstant(0), + builder.makeUintConstant(xenos::kEdramPitchTilesBits)), + dest_tile_index_y), + dest_tile_index_x); + + // Load the destination sample index. + spv::Id dest_sample_id = spv::NoResult; + if (key.dest_msaa_samples != xenos::MsaaSamples::k1X) { + if (device_features.sampleRateShading) { + assert_true(input_sample_id != spv::NoResult); + dest_sample_id = builder.createUnaryOp( + spv::OpBitcast, type_uint, + builder.createLoad(input_sample_id, spv::NoPrecision)); + } else { + assert_true(spec_const_sample_id != spv::NoResult); + // Already uint. + dest_sample_id = spec_const_sample_id; + } + } + + // Transform the destination framebuffer pixel and sample coordinates into the + // source texture pixel and sample coordinates. + + // First sample bit at 4x with Vulkan standard locations - horizontal sample. + // Second sample bit at 4x with Vulkan standard locations - vertical sample. + // At 2x: + // - Native 2x: top is 1 in Vulkan, bottom is 0. + // - 2x as 4x: top is 0, bottom is 3. 
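+  // For reference, the Vulkan standard sample locations behind this mapping:
+  // 2x: s0 = (0.75, 0.75) - bottom, s1 = (0.25, 0.25) - top.
+  // 4x: s0 = (0.375, 0.125), s1 = (0.875, 0.375) - top half,
+  //     s2 = (0.125, 0.625), s3 = (0.625, 0.875) - bottom half,
+  // so bit 0 of a 4x sample index selects left / right, bit 1 selects
+  // top / bottom, and 2x emulated as 4x uses samples 0 (top) and 3 (bottom).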
+ + spv::Id source_sample_id = dest_sample_id; + spv::Id source_tile_pixel_x = dest_tile_pixel_x; + spv::Id source_tile_pixel_y = dest_tile_pixel_y; + spv::Id source_color_half = spv::NoResult; + if (!source_is_64bpp && dest_is_64bpp) { + // 32bpp -> 64bpp, need two samples of the source. + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + // 32bpp -> 64bpp, 4x ->. + // Source has 32bpp halves in two adjacent samples. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 32bpp -> 64bpp, 4x -> 4x. + // 1 destination horizontal sample = 2 source horizontal samples. + // D p0,0 s0,0 = S p0,0 s0,0 | S p0,0 s1,0 + // D p0,0 s1,0 = S p1,0 s0,0 | S p1,0 s1,0 + // D p0,0 s0,1 = S p0,0 s0,1 | S p0,0 s1,1 + // D p0,0 s1,1 = S p1,0 s0,1 | S p1,0 s1,1 + // Thus destination horizontal sample -> source horizontal pixel, + // vertical samples are 1:1. + source_sample_id = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_sample_id, + builder.makeUintConstant(1 << 1)); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_x = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + } else if (key.dest_msaa_samples == xenos::MsaaSamples::k2X) { + // 32bpp -> 64bpp, 4x -> 2x. + // 1 destination horizontal pixel = 2 source horizontal samples. + // D p0,0 s0 = S p0,0 s0,0 | S p0,0 s1,0 + // D p0,0 s1 = S p0,0 s0,1 | S p0,0 s1,1 + // D p1,0 s0 = S p1,0 s0,0 | S p1,0 s1,0 + // D p1,0 s1 = S p1,0 s0,1 | S p1,0 s1,1 + // Pixel index can be reused. Sample 1 (for native 2x) or 0 (for 2x as + // 4x) should become samples 01, sample 0 or 3 should become samples 23. + if (msaa_2x_attachments_supported_) { + source_sample_id = builder.createBinOp( + spv::OpShiftLeftLogical, type_uint, + builder.createBinOp(spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1)), + builder.makeUintConstant(1)); + } else { + source_sample_id = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_sample_id, + builder.makeUintConstant(1 << 1)); + } + } else { + // 32bpp -> 64bpp, 4x -> 1x. + // 1 destination horizontal pixel = 2 source horizontal samples. + // D p0,0 = S p0,0 s0,0 | S p0,0 s1,0 + // D p0,1 = S p0,0 s0,1 | S p0,0 s1,1 + // Horizontal pixel index can be reused. Vertical pixel 1 should + // become sample 2. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + source_tile_pixel_y = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_y, builder.makeUintConstant(1)); + } + } else { + // 32bpp -> 64bpp, 1x/2x ->. + // Source has 32bpp halves in two adjacent pixels. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 32bpp -> 64bpp, 1x/2x -> 4x. + // The X part. + // 1 destination horizontal sample = 2 source horizontal pixels. 
+ id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpShiftLeftLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(2))); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_tile_pixel_x = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + // Y is handled by common code. + } else { + // 32bpp -> 64bpp, 1x/2x -> 1x/2x. + // The X part. + // 1 destination horizontal pixel = 2 source horizontal pixels. + source_tile_pixel_x = + builder.createBinOp(spv::OpShiftLeftLogical, type_uint, + dest_tile_pixel_x, builder.makeUintConstant(1)); + // Y is handled by common code. + } + } + } else if (source_is_64bpp && !dest_is_64bpp) { + // 64bpp -> 32bpp, also the half to load. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 64bpp -> 32bpp, -> 4x. + // The needed half is in the destination horizontal sample index. + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + // 64bpp -> 32bpp, 4x -> 4x. + // D p0,0 s0,0 = S s0,0 low + // D p0,0 s1,0 = S s0,0 high + // D p1,0 s0,0 = S s1,0 low + // D p1,0 s1,0 = S s1,0 high + // Vertical pixel and sample (second bit) addressing is the same. + // However, 1 horizontal destination pixel = 1 horizontal source sample. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + // 2 destination horizontal samples = 1 source horizontal sample, thus + // 2 destination horizontal pixels = 1 source horizontal pixel. + source_tile_pixel_x = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_x, builder.makeUintConstant(1)); + } else { + // 64bpp -> 32bpp, 1x/2x -> 4x. + // 2 destination horizontal samples = 1 source horizontal pixel, thus + // 1 destination horizontal pixel = 1 source horizontal pixel. Can reuse + // horizontal pixel index. + // Y is handled by common code. + } + // Half from the destination horizontal sample index. + source_color_half = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_sample_id, + builder.makeUintConstant(1)); + } else { + // 64bpp -> 32bpp, -> 1x/2x. + // The needed half is in the destination horizontal pixel index. + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + // 64bpp -> 32bpp, 4x -> 1x/2x. + // (Destination horizontal pixel >> 1) & 1 = source horizontal sample + // (first bit). + source_sample_id = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1), builder.makeUintConstant(1)); + if (key.dest_msaa_samples == xenos::MsaaSamples::k2X) { + // 64bpp -> 32bpp, 4x -> 2x. + // Destination vertical samples (1/0 in the first bit for native 2x or + // 0/1 in the second bit for 2x as 4x) = source vertical samples + // (second bit). 
+ if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + } else { + // 64bpp -> 32bpp, 4x -> 1x. + // 1 destination vertical pixel = 1 source vertical sample. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(source_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + source_tile_pixel_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + } + // 2 destination horizontal pixels = 1 source horizontal sample. + // 4 destination horizontal pixels = 1 source horizontal pixel. + source_tile_pixel_x = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_x, builder.makeUintConstant(2)); + } else { + // 64bpp -> 32bpp, 1x/2x -> 1x/2x. + // The X part. + // 2 destination horizontal pixels = 1 destination source pixel. + source_tile_pixel_x = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_x, builder.makeUintConstant(1)); + // Y is handled by common code. + } + // Half from the destination horizontal pixel index. + source_color_half = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + } + assert_true(source_color_half != spv::NoResult); + } else { + // Same bit count. + if (key.source_msaa_samples != key.dest_msaa_samples) { + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + // Same BPP, 4x -> 1x/2x. + if (key.dest_msaa_samples == xenos::MsaaSamples::k2X) { + // Same BPP, 4x -> 2x. + // Horizontal pixels to samples. Vertical sample (1/0 in the first bit + // for native 2x or 0/1 in the second bit for 2x as 4x) to second + // sample bit. 
+ if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + source_tile_pixel_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + } else { + // Same BPP, 4x -> 1x. + // Pixels to samples. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseAnd, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + source_tile_pixel_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + source_tile_pixel_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + } + } else { + // Same BPP, 1x/2x -> 1x/2x/4x (as long as they're different). + // Only the X part - Y is handled by common code. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // Horizontal samples to pixels. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_x = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + } + } + } + // Common source Y and sample index for 1x/2x AA sources, independent of bits + // per sample. + if (key.source_msaa_samples < xenos::MsaaSamples::k4X && + key.source_msaa_samples != key.dest_msaa_samples) { + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 1x/2x -> 4x. + if (key.source_msaa_samples == xenos::MsaaSamples::k2X) { + // 2x -> 4x. + // Vertical samples (second bit) of 4x destination to vertical sample + // (1, 0 for native 2x, or 0, 3 for 2x as 4x) of 2x source. + source_sample_id = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_sample_id, builder.makeUintConstant(1)); + if (msaa_2x_attachments_supported_) { + source_sample_id = builder.createBinOp(spv::OpBitwiseXor, type_uint, + source_sample_id, + builder.makeUintConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } else { + // 1x -> 4x. 
+ // Vertical samples (second bit) to Y pixels. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back( + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_sample_id, builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_y = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } else { + // 1x/2x -> different 1x/2x. + if (key.source_msaa_samples == xenos::MsaaSamples::k2X) { + // 2x -> 1x. + // Vertical pixels of 2x destination to vertical samples (1, 0 for + // native 2x, or 0, 3 for 2x as 4x) of 1x source. + source_sample_id = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + if (msaa_2x_attachments_supported_) { + source_sample_id = builder.createBinOp(spv::OpBitwiseXor, type_uint, + source_sample_id, + builder.makeUintConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + source_tile_pixel_y = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_y, builder.makeUintConstant(1)); + } else { + // 1x -> 2x. + // Vertical samples (1/0 in the first bit for native 2x or 0/1 in the + // second bit for 2x as 4x) of 2x destination to vertical pixels of 1x + // source. + if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back( + builder.createBinOp(spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_y = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back( + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_sample_id, builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_y = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + } + } + } + + uint32_t source_pixel_width_dwords_log2 = + uint32_t(key.source_msaa_samples >= xenos::MsaaSamples::k4X) + + uint32_t(source_is_64bpp); + + if (source_is_color != dest_is_color) { + // Copying between color and depth / stencil - swap 40-32bpp-sample columns + // in the pixel index within the source 32bpp tile. 
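As far as the guest EDRAM layout is concerned, color and depth tiles keep their two 40-sample-wide halves in opposite order, so the swap below only has to move the X coordinate by half a 32bpp tile in the appropriate direction. A scalar sketch:

#include <cstdint>

// Sketch: exchange the two half-tile columns by shifting X half a tile left
// or right, staying within the tile.
uint32_t SwapTileHalfColumns(uint32_t tile_pixel_x,
                             uint32_t tile_half_width_pixels) {
  return tile_pixel_x < tile_half_width_pixels
             ? tile_pixel_x + tile_half_width_pixels
             : tile_pixel_x - tile_half_width_pixels;
}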
+ uint32_t source_32bpp_tile_half_pixels = + tile_width_samples_scaled >> (1 + source_pixel_width_dwords_log2); + source_tile_pixel_x = builder.createUnaryOp( + spv::OpBitcast, type_uint, + builder.createBinOp( + spv::OpIAdd, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, + source_tile_pixel_x), + builder.createTriOp( + spv::OpSelect, type_int, + builder.createBinOp( + spv::OpULessThan, builder.makeBoolType(), + source_tile_pixel_x, + builder.makeUintConstant(source_32bpp_tile_half_pixels)), + builder.makeIntConstant(int32_t(source_32bpp_tile_half_pixels)), + builder.makeIntConstant( + -int32_t(source_32bpp_tile_half_pixels))))); + } + + // Transform the destination 32bpp tile index into the source. + spv::Id source_tile_index = builder.createUnaryOp( + spv::OpBitcast, type_uint, + builder.createBinOp( + spv::OpIAdd, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, dest_tile_index), + builder.createTriOp( + spv::OpBitFieldSExtract, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, address_constant), + builder.makeUintConstant(xenos::kEdramPitchTilesBits * 2), + builder.makeUintConstant(xenos::kEdramBaseTilesBits)))); + // Split the source 32bpp tile index into X and Y tile index within the source + // image. + spv::Id source_pitch_tiles = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, address_constant, + builder.makeUintConstant(xenos::kEdramPitchTilesBits), + builder.makeUintConstant(xenos::kEdramPitchTilesBits)); + spv::Id source_tile_index_y = builder.createBinOp( + spv::OpUDiv, type_uint, source_tile_index, source_pitch_tiles); + spv::Id source_tile_index_x = builder.createBinOp( + spv::OpUMod, type_uint, source_tile_index, source_pitch_tiles); + // Finally calculate the source texture coordinates. + spv::Id source_pixel_x_int = builder.createUnaryOp( + spv::OpBitcast, type_int, + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.makeUintConstant(tile_width_samples_scaled >> + source_pixel_width_dwords_log2), + source_tile_index_x), + source_tile_pixel_x)); + spv::Id source_pixel_y_int = builder.createUnaryOp( + spv::OpBitcast, type_int, + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.makeUintConstant( + tile_height_samples_scaled >> + uint32_t(key.source_msaa_samples >= xenos::MsaaSamples::k2X)), + source_tile_index_y), + source_tile_pixel_y)); + + // Load the source. + + spv::Builder::TextureParameters source_texture_parameters = {}; + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(source_pixel_x_int); + id_vector_temp.push_back(source_pixel_y_int); + spv::Id source_coordinates[2] = { + builder.createCompositeConstruct(type_int2, id_vector_temp), + }; + spv::Id source_sample_ids_int[2] = {}; + if (key.source_msaa_samples != xenos::MsaaSamples::k1X) { + source_sample_ids_int[0] = + builder.createUnaryOp(spv::OpBitcast, type_int, source_sample_id); + } else { + source_texture_parameters.lod = builder.makeIntConstant(0); + } + // Go to the next sample or pixel along X if need to load two dwords. 
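When a 32bpp source feeds a 64bpp destination, two adjacent 32bpp values are fetched per destination texel; as the code below shows, the high dword comes from the next horizontal sample on a 4x source and from the next horizontal pixel otherwise. A scalar sketch (names illustrative):

#include <cstdint>

struct FetchLocation {
  uint32_t pixel_x;
  uint32_t sample;
};

// Sketch: where the high 32 bits of a 64bpp value live relative to the low
// 32 bits in a 32bpp source.
FetchLocation HighDwordLocation(FetchLocation low, bool source_is_4x_msaa) {
  FetchLocation high = low;
  if (source_is_4x_msaa) {
    high.sample |= 1;   // adjacent horizontal sample
  } else {
    high.pixel_x |= 1;  // adjacent horizontal pixel
  }
  return high;
}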
+ bool source_load_is_two_32bpp_samples = !source_is_64bpp && dest_is_64bpp; + if (source_load_is_two_32bpp_samples) { + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + source_coordinates[1] = source_coordinates[0]; + source_sample_ids_int[1] = builder.createBinOp( + spv::OpBitwiseOr, type_int, source_sample_ids_int[0], + builder.makeIntConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(builder.createBinOp(spv::OpBitwiseOr, type_int, + source_pixel_x_int, + builder.makeIntConstant(1))); + id_vector_temp.push_back(source_pixel_y_int); + source_coordinates[1] = + builder.createCompositeConstruct(type_int2, id_vector_temp); + source_sample_ids_int[1] = source_sample_ids_int[0]; + } + } + spv::Id source_color[2][4] = {}; + if (source_color_texture != spv::NoResult) { + source_texture_parameters.sampler = + builder.createLoad(source_color_texture, spv::NoPrecision); + assert_true(source_color_component_type != spv::NoType); + spv::Id source_color_vec4_type = + builder.makeVectorType(source_color_component_type, 4); + for (uint32_t i = 0; i <= uint32_t(source_load_is_two_32bpp_samples); ++i) { + source_texture_parameters.coords = source_coordinates[i]; + source_texture_parameters.sample = source_sample_ids_int[i]; + spv::Id source_color_vec4 = builder.createTextureCall( + spv::NoPrecision, source_color_vec4_type, false, true, false, false, + false, source_texture_parameters, spv::ImageOperandsMaskNone); + uint32_t source_color_components_remaining = + source_color_texture_component_mask; + uint32_t source_color_component_index; + while (xe::bit_scan_forward(source_color_components_remaining, + &source_color_component_index)) { + source_color_components_remaining &= + ~(uint32_t(1) << source_color_component_index); + source_color[i][source_color_component_index] = + builder.createCompositeExtract(source_color_vec4, + source_color_component_type, + source_color_component_index); + } + } + } + spv::Id source_depth_float[2] = {}; + if (source_depth_texture != spv::NoResult) { + source_texture_parameters.sampler = + builder.createLoad(source_depth_texture, spv::NoPrecision); + for (uint32_t i = 0; i <= uint32_t(source_load_is_two_32bpp_samples); ++i) { + source_texture_parameters.coords = source_coordinates[i]; + source_texture_parameters.sample = source_sample_ids_int[i]; + source_depth_float[i] = builder.createCompositeExtract( + builder.createTextureCall( + spv::NoPrecision, type_float4, false, true, false, false, false, + source_texture_parameters, spv::ImageOperandsMaskNone), + type_float, 0); + } + } + spv::Id source_stencil[2] = {}; + if (source_stencil_texture != spv::NoResult) { + source_texture_parameters.sampler = + builder.createLoad(source_stencil_texture, spv::NoPrecision); + for (uint32_t i = 0; i <= uint32_t(source_load_is_two_32bpp_samples); ++i) { + source_texture_parameters.coords = source_coordinates[i]; + source_texture_parameters.sample = source_sample_ids_int[i]; + source_stencil[i] = builder.createCompositeExtract( + builder.createTextureCall( + spv::NoPrecision, type_uint4, false, true, false, false, false, + source_texture_parameters, spv::ImageOperandsMaskNone), + type_uint, 0); + } + } + + // Pick the needed 32bpp half of the 64bpp color. 
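In the opposite direction, a 64bpp source texel carries two 32bpp guest values in its low and high component halves, and the selection below forwards only the half indicated by source_color_half. Roughly, in scalar form:

#include <cstdint>

// Sketch: copy either the low or the high half of the loaded components
// (1 of 2 or 2 of 4, depending on the 64bpp format).
void PickColorHalf(const uint32_t* source_components, uint32_t component_count,
                   bool second_half, uint32_t* out_components) {
  uint32_t half_count = component_count / 2;
  for (uint32_t i = 0; i < half_count; ++i) {
    out_components[i] = source_components[(second_half ? half_count : 0) + i];
  }
}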
+ if (source_is_64bpp && !dest_is_64bpp) { + uint32_t source_color_half_component_count = + source_color_format_component_count >> 1; + assert_true(source_color_half != spv::NoResult); + spv::Id source_color_is_second_half = + builder.createBinOp(spv::OpINotEqual, type_bool, source_color_half, + builder.makeUintConstant(0)); + if (mode.output == TransferOutput::kStencilBit) { + source_color[0][0] = builder.createTriOp( + spv::OpSelect, source_color_component_type, + source_color_is_second_half, + source_color[0][source_color_half_component_count], + source_color[0][0]); + } else { + for (uint32_t i = 0; i < source_color_half_component_count; ++i) { + source_color[0][i] = builder.createTriOp( + spv::OpSelect, source_color_component_type, + source_color_is_second_half, + source_color[0][source_color_half_component_count + i], + source_color[0][i]); + } + } + } + + if (output_fragment_stencil_ref != spv::NoResult && + source_stencil[0] != spv::NoResult) { + // For the depth -> depth case, write the stencil directly to the output. + assert_true(mode.output == TransferOutput::kDepth); + builder.createStore(source_stencil[0], output_fragment_stencil_ref); + } + + if (dest_is_64bpp) { + // Construct the 64bpp color from two 32-bit samples or one 64-bit sample. + // If `packed` (two uints) are created, use the generic path involving + // unpacking. + // Otherwise, the fragment data output must be written to directly by the + // reached control flow path. + spv::Id packed[2] = {}; + if (source_is_color) { + switch (source_color_format) { + case xenos::ColorRenderTargetFormat::k_8_8_8_8: + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale = builder.makeFloatConstant(255.0f); + spv::Id component_width = builder.makeUintConstant(8); + for (uint32_t i = 0; i < 2; ++i) { + packed[i] = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[i][0], unorm_scale), + unorm_round_offset)); + for (uint32_t j = 1; j < 4; ++j) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed[i]); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[i][j], unorm_scale), + unorm_round_offset))); + id_vector_temp.push_back(builder.makeUintConstant(8 * j)); + id_vector_temp.push_back(component_width); + packed[i] = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale_rgb = builder.makeFloatConstant(1023.0f); + spv::Id width_rgb = builder.makeUintConstant(10); + spv::Id unorm_scale_a = builder.makeFloatConstant(3.0f); + spv::Id width_a = builder.makeUintConstant(2); + for (uint32_t i = 0; i < 2; ++i) { + packed[i] = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[i][0], unorm_scale_rgb), + unorm_round_offset)); + for (uint32_t j = 1; j < 4; ++j) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed[i]); + id_vector_temp.push_back(builder.createUnaryOp( + 
spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp( + spv::OpFMul, type_float, source_color[i][j], + j == 3 ? unorm_scale_a : unorm_scale_rgb), + unorm_round_offset))); + id_vector_temp.push_back(builder.makeUintConstant(10 * j)); + id_vector_temp.push_back(j == 3 ? width_a : width_rgb); + packed[i] = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16: { + spv::Id width_rgb = builder.makeUintConstant(10); + spv::Id float_0 = builder.makeFloatConstant(0.0f); + spv::Id float_1 = builder.makeFloatConstant(1.0f); + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale_a = builder.makeFloatConstant(3.0f); + spv::Id offset_a = builder.makeUintConstant(30); + spv::Id width_a = builder.makeUintConstant(2); + for (uint32_t i = 0; i < 2; ++i) { + // Float16 has a wider range for both color and alpha, also NaNs - + // clamp and convert. + packed[i] = SpirvShaderTranslator::UnclampedFloat32To7e3( + builder, source_color[i][0], ext_inst_glsl_std_450); + for (uint32_t j = 1; j < 3; ++j) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed[i]); + id_vector_temp.push_back( + SpirvShaderTranslator::UnclampedFloat32To7e3( + builder, source_color[i][j], ext_inst_glsl_std_450)); + id_vector_temp.push_back(builder.makeUintConstant(10 * j)); + id_vector_temp.push_back(width_rgb); + packed[i] = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + // Saturate and convert the alpha. + id_vector_temp.clear(); + id_vector_temp.reserve(3); + id_vector_temp.push_back(source_color[i][3]); + id_vector_temp.push_back(float_0); + id_vector_temp.push_back(float_1); + spv::Id alpha_saturated = + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450NClamp, id_vector_temp); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed[i]); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + alpha_saturated, unorm_scale_a), + unorm_round_offset))); + id_vector_temp.push_back(offset_a); + id_vector_temp.push_back(width_a); + packed[i] = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } break; + // All 64bpp formats, and all 16 bits per component formats, are + // represented as integers in ownership transfer for safe handling of + // NaN encodings and -32768 / -32767. + // TODO(Triang3l): Handle the case when that's not true (no multisampled + // sampled images, no 16-bit UNORM, no cross-packing 32bpp aliasing on a + // portability subset device or a 64bpp format where that wouldn't help + // anyway). 
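Since the 16-bit-per-component data is carried as raw integers, re-packing it for a 32-bit-per-component destination in the cases below amounts to one 16-bit bit-field insert per output dword, i.e. roughly:

#include <cstdint>

// Sketch: combine two 16-bit components, already held as uints, into one
// 32-bit word, as when a 16_16 / 16_16_16_16 source is written to a
// k_32_32_FLOAT destination.
uint32_t Pack16x2(uint32_t low16, uint32_t high16) {
  return (low16 & 0xFFFFu) | (high16 << 16);
}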
+ case xenos::ColorRenderTargetFormat::k_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: { + if (dest_color_format == + xenos::ColorRenderTargetFormat::k_32_32_FLOAT) { + spv::Id component_offset_width = builder.makeUintConstant(16); + spv::Id color_16_in_32[2]; + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_color[i][0]); + id_vector_temp.push_back(source_color[i][1]); + id_vector_temp.push_back(component_offset_width); + id_vector_temp.push_back(component_offset_width); + color_16_in_32[i] = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(color_16_in_32[0]); + id_vector_temp.push_back(color_16_in_32[1]); + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[i >> 1][i & 1]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } + } break; + case xenos::ColorRenderTargetFormat::k_16_16_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT: { + if (dest_color_format == + xenos::ColorRenderTargetFormat::k_32_32_FLOAT) { + spv::Id component_offset_width = builder.makeUintConstant(16); + spv::Id color_16_in_32[2]; + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_color[0][i << 1]); + id_vector_temp.push_back(source_color[0][(i << 1) + 1]); + id_vector_temp.push_back(component_offset_width); + id_vector_temp.push_back(component_offset_width); + color_16_in_32[i] = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(color_16_in_32[0]); + id_vector_temp.push_back(color_16_in_32[1]); + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } + } break; + // Float32 is transferred as uint32 to preserve NaN encodings. However, + // multisampled sampled image support is optional in Vulkan. + case xenos::ColorRenderTargetFormat::k_32_FLOAT: { + for (uint32_t i = 0; i < 2; ++i) { + packed[i] = source_color[i][0]; + if (!source_color_is_uint) { + packed[i] = + builder.createUnaryOp(spv::OpBitcast, type_uint, packed[i]); + } + } + } break; + case xenos::ColorRenderTargetFormat::k_32_32_FLOAT: { + for (uint32_t i = 0; i < 2; ++i) { + packed[i] = source_color[0][i]; + if (!source_color_is_uint) { + packed[i] = + builder.createUnaryOp(spv::OpBitcast, type_uint, packed[i]); + } + } + } break; + } + } else { + assert_true(source_depth_texture != spv::NoResult); + assert_true(source_stencil_texture != spv::NoResult); + spv::Id depth_offset = builder.makeUintConstant(8); + spv::Id depth_width = builder.makeUintConstant(24); + for (uint32_t i = 0; i < 2; ++i) { + spv::Id depth24 = spv::NoResult; + switch (source_depth_format) { + case xenos::DepthRenderTargetFormat::kD24S8: { + // Round to the nearest even integer. 
This seems to be the + // correct, adding +0.5 and rounding towards zero results in red + // instead of black in the 4D5307E6 clear shader. + id_vector_temp.clear(); + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, source_depth_float[i], + builder.makeFloatConstant(float(0xFFFFFF)))); + depth24 = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450RoundEven, id_vector_temp)); + } break; + case xenos::DepthRenderTargetFormat::kD24FS8: { + depth24 = SpirvShaderTranslator::PreClampedDepthTo20e4( + builder, source_depth_float[i], true, ext_inst_glsl_std_450); + } break; + } + // Merge depth and stencil. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_stencil[i]); + id_vector_temp.push_back(depth24); + id_vector_temp.push_back(depth_offset); + id_vector_temp.push_back(depth_width); + packed[i] = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } + // Common path unless there was a specialized one - unpack two packed 32-bit + // parts. + if (packed[0] != spv::NoResult) { + assert_true(packed[1] != spv::NoResult); + if (dest_color_format == xenos::ColorRenderTargetFormat::k_32_32_FLOAT) { + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(packed[0]); + id_vector_temp.push_back(packed[1]); + // Multisampled sampled images are optional in Vulkan, and image views + // of different formats can't be created separately for sampled image + // and color attachment usages, so no multisampled integer sampled image + // support implies no multisampled integer framebuffer attachment + // support in Xenia. + if (!dest_color_is_uint) { + for (spv::Id& float32 : id_vector_temp) { + float32 = + builder.createUnaryOp(spv::OpBitcast, type_float, float32); + } + } + builder.createStore(builder.createCompositeConstruct(type_fragment_data, + id_vector_temp), + output_fragment_data); + } else { + spv::Id const_uint_0 = builder.makeUintConstant(0); + spv::Id const_uint_16 = builder.makeUintConstant(16); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, packed[i >> 1], + (i & 1) ? const_uint_16 : const_uint_0, const_uint_16)); + } + // TODO(Triang3l): Handle the case when that's not true (no multisampled + // sampled images, no 16-bit UNORM, no cross-packing 32bpp aliasing on a + // portability subset device or a 64bpp format where that wouldn't help + // anyway). + builder.createStore(builder.createCompositeConstruct(type_fragment_data, + id_vector_temp), + output_fragment_data); + } + } + } else { + // If `packed` is created, use the generic path involving unpacking. + // - For a color destination, the packed 32bpp color. + // - For a depth / stencil destination, stencil in 0:7, depth in 8:31 + // normally, or depth in 0:23 and zeros in 24:31 with packed_only_depth. + // - For a stencil bit, stencil in 0:7. + // Otherwise, the fragment data or fragment depth / stencil output must be + // written to directly by the reached control flow path. 
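For the depth / stencil case, the D24S8 packing used above (and again in the 32bpp path below) corresponds to roughly the following scalar conversion; std::nearbyint stands in for GLSLstd450RoundEven under the default round-to-nearest-even mode:

#include <cmath>
#include <cstdint>

// Sketch: convert a [0, 1] depth value to 24-bit fixed point with
// round-to-nearest-even and pack it above the 8-bit stencil.
uint32_t PackD24S8(float depth, uint32_t stencil) {
  uint32_t depth24 = uint32_t(std::nearbyint(depth * 16777215.0f));  // 0xFFFFFF
  return (stencil & 0xFFu) | (depth24 << 8);
}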
+ spv::Id packed = spv::NoResult; + bool packed_only_depth = false; + if (source_is_color) { + switch (source_color_format) { + case xenos::ColorRenderTargetFormat::k_8_8_8_8: + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { + if (dest_is_color && + (dest_color_format == xenos::ColorRenderTargetFormat::k_8_8_8_8 || + dest_color_format == + xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA)) { + // Same format - passthrough. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale = builder.makeFloatConstant(255.0f); + uint32_t packed_component_offset = 0; + if (mode.output == TransferOutput::kDepth) { + // When need only depth, not stencil, skip the red component, and + // put the depth from GBA directly in the lower bits. + packed_component_offset = 1; + packed_only_depth = true; + if (output_fragment_stencil_ref != spv::NoResult) { + builder.createStore( + builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[0][0], + unorm_scale), + unorm_round_offset)), + output_fragment_stencil_ref); + } + } + packed = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp( + spv::OpFMul, type_float, + source_color[0][packed_component_offset], unorm_scale), + unorm_round_offset)); + if (mode.output != TransferOutput::kStencilBit) { + spv::Id component_width = builder.makeUintConstant(8); + for (uint32_t i = 1; i < 4 - packed_component_offset; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp( + spv::OpFMul, type_float, + source_color[0][packed_component_offset + i], + unorm_scale), + unorm_round_offset))); + id_vector_temp.push_back(builder.makeUintConstant(8 * i)); + id_vector_temp.push_back(component_width); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { + if (dest_is_color && + (dest_color_format == + xenos::ColorRenderTargetFormat::k_2_10_10_10 || + dest_color_format == xenos::ColorRenderTargetFormat:: + k_2_10_10_10_AS_10_10_10_10)) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale_rgb = builder.makeFloatConstant(1023.0f); + packed = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[0][0], unorm_scale_rgb), + unorm_round_offset)); + if (mode.output != TransferOutput::kStencilBit) { + spv::Id width_rgb = builder.makeUintConstant(10); + spv::Id unorm_scale_a = builder.makeFloatConstant(3.0f); + spv::Id width_a = 
builder.makeUintConstant(2); + for (uint32_t i = 1; i < 4; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp( + spv::OpFMul, type_float, source_color[0][i], + i == 3 ? unorm_scale_a : unorm_scale_rgb), + unorm_round_offset))); + id_vector_temp.push_back(builder.makeUintConstant(10 * i)); + id_vector_temp.push_back(i == 3 ? width_a : width_rgb); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16: { + if (dest_is_color && + (dest_color_format == + xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT || + dest_color_format == xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16)) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + // Float16 has a wider range for both color and alpha, also NaNs - + // clamp and convert. + packed = SpirvShaderTranslator::UnclampedFloat32To7e3( + builder, source_color[0][0], ext_inst_glsl_std_450); + if (mode.output != TransferOutput::kStencilBit) { + spv::Id width_rgb = builder.makeUintConstant(10); + for (uint32_t i = 1; i < 3; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back( + SpirvShaderTranslator::UnclampedFloat32To7e3( + builder, source_color[0][i], ext_inst_glsl_std_450)); + id_vector_temp.push_back(builder.makeUintConstant(10 * i)); + id_vector_temp.push_back(width_rgb); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + // Saturate and convert the alpha. + id_vector_temp.clear(); + id_vector_temp.reserve(3); + id_vector_temp.push_back(source_color[0][3]); + id_vector_temp.push_back(builder.makeFloatConstant(0.0f)); + id_vector_temp.push_back(builder.makeFloatConstant(1.0f)); + spv::Id alpha_saturated = + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450NClamp, id_vector_temp); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + alpha_saturated, + builder.makeFloatConstant(3.0f)), + builder.makeFloatConstant(0.5f)))); + id_vector_temp.push_back(builder.makeUintConstant(30)); + id_vector_temp.push_back(builder.makeUintConstant(2)); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } break; + case xenos::ColorRenderTargetFormat::k_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: + case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT: { + // All 64bpp formats, and all 16 bits per component formats, are + // represented as integers in ownership transfer for safe handling of + // NaN encodings and -32768 / -32767. 
+ // TODO(Triang3l): Handle the case when that's not true (no + // multisampled sampled images, no 16-bit UNORM, no cross-packing + // 32bpp aliasing on a portability subset device or a 64bpp format + // where that wouldn't help anyway). + if (dest_is_color && + (dest_color_format == xenos::ColorRenderTargetFormat::k_16_16 || + dest_color_format == + xenos::ColorRenderTargetFormat::k_16_16_FLOAT)) { + id_vector_temp.clear(); + id_vector_temp.reserve(2); + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + packed = source_color[0][0]; + if (mode.output != TransferOutput::kStencilBit) { + spv::Id component_offset_width = builder.makeUintConstant(16); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(source_color[0][1]); + id_vector_temp.push_back(component_offset_width); + id_vector_temp.push_back(component_offset_width); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } break; + // Float32 is transferred as uint32 to preserve NaN encodings. However, + // multisampled sampled image support is optional in Vulkan. + case xenos::ColorRenderTargetFormat::k_32_FLOAT: + case xenos::ColorRenderTargetFormat::k_32_32_FLOAT: { + packed = source_color[0][0]; + if (!source_color_is_uint) { + packed = builder.createUnaryOp(spv::OpBitcast, type_uint, packed); + } + } break; + } + } else if (source_depth_float[0] != spv::NoResult) { + if (mode.output == TransferOutput::kDepth && + dest_depth_format == source_depth_format) { + builder.createStore(source_depth_float[0], output_fragment_depth); + } else { + switch (source_depth_format) { + case xenos::DepthRenderTargetFormat::kD24S8: { + // Round to the nearest even integer. This seems to be the correct, + // adding +0.5 and rounding towards zero results in red instead of + // black in the 4D5307E6 clear shader. + id_vector_temp.clear(); + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, source_depth_float[0], + builder.makeFloatConstant(float(0xFFFFFF)))); + packed = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450RoundEven, id_vector_temp)); + } break; + case xenos::DepthRenderTargetFormat::kD24FS8: { + packed = SpirvShaderTranslator::PreClampedDepthTo20e4( + builder, source_depth_float[0], true, ext_inst_glsl_std_450); + } break; + } + if (mode.output == TransferOutput::kDepth) { + packed_only_depth = true; + } else { + // Merge depth and stencil. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_stencil[0]); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(builder.makeUintConstant(8)); + id_vector_temp.push_back(builder.makeUintConstant(24)); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } + switch (mode.output) { + case TransferOutput::kColor: { + // Unless a special path was taken, unpack the raw 32bpp value into the + // 32bpp color output. 
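The unpacking in this output mode mirrors the packing; for instance, the k_8_8_8_8 case below is, in scalar terms, roughly:

#include <array>
#include <cstdint>

// Sketch: expand a packed 8_8_8_8 dword back into four UNORM floats.
std::array<float, 4> Unpack_8_8_8_8(uint32_t packed) {
  std::array<float, 4> rgba{};
  for (uint32_t i = 0; i < 4; ++i) {
    rgba[i] = float((packed >> (8 * i)) & 0xFFu) * (1.0f / 255.0f);
  }
  return rgba;
}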
+ if (packed != spv::NoResult) { + switch (dest_color_format) { + case xenos::ColorRenderTargetFormat::k_8_8_8_8: + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { + spv::Id component_width = builder.makeUintConstant(8); + spv::Id unorm_scale = builder.makeFloatConstant(1.0f / 255.0f); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, + builder.createUnaryOp( + spv::OpConvertUToF, type_float, + builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, packed, + builder.makeUintConstant(8 * i), component_width)), + unorm_scale)); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { + spv::Id width_rgb = builder.makeUintConstant(10); + spv::Id unorm_scale_rgb = + builder.makeFloatConstant(1.0f / 1023.0f); + spv::Id width_a = builder.makeUintConstant(2); + spv::Id unorm_scale_a = builder.makeFloatConstant(1.0f / 3.0f); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, + builder.createUnaryOp( + spv::OpConvertUToF, type_float, + builder.createTriOp(spv::OpBitFieldUExtract, type_uint, + packed, + builder.makeUintConstant(10 * i), + i == 3 ? width_a : width_rgb)), + i == 3 ? unorm_scale_a : unorm_scale_rgb)); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16: { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + // Color. + spv::Id width_rgb = builder.makeUintConstant(10); + for (uint32_t i = 0; i < 3; ++i) { + id_vector_temp.push_back(SpirvShaderTranslator::Float7e3To32( + builder, packed, 10 * i, false, ext_inst_glsl_std_450)); + } + // Alpha. + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, + builder.createUnaryOp( + spv::OpConvertUToF, type_float, + builder.createTriOp(spv::OpBitFieldUExtract, type_uint, + packed, builder.makeUintConstant(30), + builder.makeUintConstant(2))), + builder.makeFloatConstant(1.0f / 3.0f))); + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } break; + case xenos::ColorRenderTargetFormat::k_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: { + // All 16 bits per component formats are represented as integers + // in ownership transfer for safe handling of NaN encodings and + // -32768 / -32767. + // TODO(Triang3l): Handle the case when that's not true (no + // multisampled sampled images, no 16-bit UNORM, no cross-packing + // 32bpp aliasing on a portability subset device or a 64bpp format + // where that wouldn't help anyway). + spv::Id component_offset_width = builder.makeUintConstant(16); + id_vector_temp.clear(); + id_vector_temp.reserve(2); + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp.push_back(builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, packed, + i ? 
component_offset_width : builder.makeUintConstant(0), + component_offset_width)); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } break; + case xenos::ColorRenderTargetFormat::k_32_FLOAT: { + // Float32 is transferred as uint32 to preserve NaN encodings. + // However, multisampled sampled images are optional in Vulkan, + // and image views of different formats can't be created + // separately for sampled image and color attachment usages, so no + // multisampled integer sampled image support implies no + // multisampled integer framebuffer attachment support in Xenia. + spv::Id float32 = packed; + if (!dest_color_is_uint) { + float32 = + builder.createUnaryOp(spv::OpBitcast, type_float, float32); + } + builder.createStore(float32, output_fragment_data); + } break; + default: + // A 64bpp format (handled separately) or an invalid one. + assert_unhandled_case(dest_color_format); + } + } + } break; + case TransferOutput::kDepth: { + if (packed) { + spv::Id guest_depth24 = packed; + if (!packed_only_depth) { + // Extract the depth bits. + guest_depth24 = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + guest_depth24, builder.makeUintConstant(8)); + } + // Load the host float32 depth, check if, when converted to the guest + // format, it's the same as the guest source, thus up to date, and if + // it is, write host float32 depth, otherwise do the guest -> host + // conversion. + spv::Id host_depth32 = spv::NoResult; + if (host_depth_source_texture != spv::NoResult) { + // Convert position and sample index from within the destination + // tile to within the host depth source tile, like for the guest + // render target, but for 32bpp -> 32bpp only. + spv::Id host_depth_source_sample_id = dest_sample_id; + spv::Id host_depth_source_tile_pixel_x = dest_tile_pixel_x; + spv::Id host_depth_source_tile_pixel_y = dest_tile_pixel_y; + if (key.host_depth_source_msaa_samples != key.dest_msaa_samples) { + if (key.host_depth_source_msaa_samples >= + xenos::MsaaSamples::k4X) { + // 4x -> 1x/2x. + if (key.dest_msaa_samples == xenos::MsaaSamples::k2X) { + // 4x -> 2x. + // Horizontal pixels to samples. Vertical sample (1/0 in the + // first bit for native 2x or 0/1 in the second bit for 2x as + // 4x) to second sample bit. + if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + host_depth_source_tile_pixel_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + } else { + // 4x -> 1x. + // Pixels to samples. 
+ id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseAnd, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + host_depth_source_tile_pixel_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + host_depth_source_tile_pixel_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + } + } else { + // 1x/2x -> 1x/2x/4x (as long as they're different). + // Only the X part - Y is handled by common code. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // Horizontal samples to pixels. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_tile_pixel_x = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } + // Host depth source Y and sample index for 1x/2x AA sources. + if (key.host_depth_source_msaa_samples < + xenos::MsaaSamples::k4X) { + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 1x/2x -> 4x. + if (key.host_depth_source_msaa_samples == + xenos::MsaaSamples::k2X) { + // 2x -> 4x. + // Vertical samples (second bit) of 4x destination to + // vertical sample (1, 0 for native 2x, or 0, 3 for 2x as + // 4x) of 2x source. + host_depth_source_sample_id = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_sample_id, + builder.makeUintConstant(1)); + if (msaa_2x_attachments_supported_) { + host_depth_source_sample_id = + builder.createBinOp(spv::OpBitwiseXor, type_uint, + host_depth_source_sample_id, + builder.makeUintConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(host_depth_source_sample_id); + id_vector_temp.push_back(host_depth_source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } else { + // 1x -> 4x. + // Vertical samples (second bit) to Y pixels. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_tile_pixel_y = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } else { + // 1x/2x -> different 1x/2x. + if (key.host_depth_source_msaa_samples == + xenos::MsaaSamples::k2X) { + // 2x -> 1x. + // Vertical pixels of 2x destination to vertical samples (1, + // 0 for native 2x, or 0, 3 for 2x as 4x) of 1x source. 
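The recurring "1, 0 for native 2x, or 0, 3 for 2x as 4x" mapping used here can be summarized as a scalar sketch: the XOR flips the sample order when true 2x attachments exist, while 2x emulated as 4x uses samples 0 and 3 (the vertical bit is bit 1):

#include <cstdint>

// Sketch: map a guest vertical half index (0 or 1) to the host sample index,
// depending on how 2x MSAA is represented on the host.
uint32_t VerticalHalfToHostSample(uint32_t vertical_half,
                                  bool msaa_2x_attachments_supported) {
  return msaa_2x_attachments_supported
             ? (vertical_half ^ 1u)                     // samples 1, 0
             : ((vertical_half << 1) | vertical_half);  // samples 0, 3
}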
+ host_depth_source_sample_id = builder.createBinOp( + spv::OpBitwiseAnd, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + if (msaa_2x_attachments_supported_) { + host_depth_source_sample_id = + builder.createBinOp(spv::OpBitwiseXor, type_uint, + host_depth_source_sample_id, + builder.makeUintConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(host_depth_source_sample_id); + id_vector_temp.push_back(host_depth_source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + host_depth_source_tile_pixel_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + } else { + // 1x -> 2x. + // Vertical samples (1/0 in the first bit for native 2x or + // 0/1 in the second bit for 2x as 4x) of 2x destination to + // vertical pixels of 1x source. + if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_tile_pixel_y = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_tile_pixel_y = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } + } + } + } + assert_true(push_constants_member_host_depth_address != UINT32_MAX); + id_vector_temp.clear(); + id_vector_temp.push_back(builder.makeIntConstant( + int32_t(push_constants_member_host_depth_address))); + spv::Id host_depth_address_constant = builder.createLoad( + builder.createAccessChain(spv::StorageClassPushConstant, + push_constants, id_vector_temp), + spv::NoPrecision); + // Transform the destination tile index into the host depth source. + spv::Id host_depth_source_tile_index = builder.createUnaryOp( + spv::OpBitcast, type_uint, + builder.createBinOp( + spv::OpIAdd, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, + dest_tile_index), + builder.createTriOp( + spv::OpBitFieldSExtract, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, + host_depth_address_constant), + builder.makeUintConstant(xenos::kEdramPitchTilesBits * + 2), + builder.makeUintConstant(xenos::kEdramBaseTilesBits)))); + // Split the host depth source tile index into X and Y tile index + // within the source image. 
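This is the same tile addressing as for the color / depth source earlier in the function: split the linear tile index by the pitch in tiles, then add the within-tile coordinate to the tile origin. A scalar sketch (names illustrative):

#include <cstdint>

struct Texel {
  uint32_t x;
  uint32_t y;
};

// Sketch of the tile-to-texel addressing below.
Texel TileToTexel(uint32_t tile_index, uint32_t pitch_tiles,
                  uint32_t tile_width_pixels, uint32_t tile_height_pixels,
                  uint32_t tile_pixel_x, uint32_t tile_pixel_y) {
  uint32_t tile_y = tile_index / pitch_tiles;
  uint32_t tile_x = tile_index % pitch_tiles;
  return {tile_x * tile_width_pixels + tile_pixel_x,
          tile_y * tile_height_pixels + tile_pixel_y};
}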
+ spv::Id host_depth_source_pitch_tiles = builder.createTriOp(
+ spv::OpBitFieldUExtract, type_uint, host_depth_address_constant,
+ builder.makeUintConstant(xenos::kEdramPitchTilesBits),
+ builder.makeUintConstant(xenos::kEdramPitchTilesBits));
+ spv::Id host_depth_source_tile_index_y = builder.createBinOp(
+ spv::OpUDiv, type_uint, host_depth_source_tile_index,
+ host_depth_source_pitch_tiles);
+ spv::Id host_depth_source_tile_index_x = builder.createBinOp(
+ spv::OpUMod, type_uint, host_depth_source_tile_index,
+ host_depth_source_pitch_tiles);
+ // Finally calculate the host depth source texture coordinates.
+ spv::Id host_depth_source_pixel_x_int = builder.createUnaryOp(
+ spv::OpBitcast, type_int,
+ builder.createBinOp(
+ spv::OpIAdd, type_uint,
+ builder.createBinOp(spv::OpIMul, type_uint,
+ builder.makeUintConstant(
+ tile_width_samples_scaled >>
+ uint32_t(key.host_depth_source_msaa_samples >=
+ xenos::MsaaSamples::k4X)),
+ host_depth_source_tile_index_x),
+ host_depth_source_tile_pixel_x));
+ spv::Id host_depth_source_pixel_y_int = builder.createUnaryOp(
+ spv::OpBitcast, type_int,
+ builder.createBinOp(
+ spv::OpIAdd, type_uint,
+ builder.createBinOp(spv::OpIMul, type_uint,
+ builder.makeUintConstant(
+ tile_height_samples_scaled >>
+ uint32_t(key.host_depth_source_msaa_samples >=
+ xenos::MsaaSamples::k2X)),
+ host_depth_source_tile_index_y),
+ host_depth_source_tile_pixel_y));
+ // Load the host depth source.
+ spv::Builder::TextureParameters
+ host_depth_source_texture_parameters = {};
+ host_depth_source_texture_parameters.sampler =
+ builder.createLoad(host_depth_source_texture, spv::NoPrecision);
+ id_vector_temp.clear();
+ id_vector_temp.reserve(2);
+ id_vector_temp.push_back(host_depth_source_pixel_x_int);
+ id_vector_temp.push_back(host_depth_source_pixel_y_int);
+ host_depth_source_texture_parameters.coords =
+ builder.createCompositeConstruct(type_int2, id_vector_temp);
+ if (key.host_depth_source_msaa_samples != xenos::MsaaSamples::k1X) {
+ host_depth_source_texture_parameters.sample =
+ builder.createUnaryOp(spv::OpBitcast, type_int,
+ host_depth_source_sample_id);
+ } else {
+ host_depth_source_texture_parameters.lod =
+ builder.makeIntConstant(0);
+ }
+ host_depth32 = builder.createCompositeExtract(
+ builder.createTextureCall(spv::NoPrecision, type_float4, false,
+ true, false, false, false,
+ host_depth_source_texture_parameters,
+ spv::ImageOperandsMaskNone),
+ type_float, 0);
+ } else if (host_depth_source_buffer != spv::NoResult) {
+ // Get the address in the EDRAM scratch buffer and load from there.
+ // The beginning of the buffer is (0, 0) of the destination.
+ // 40-sample columns are not swapped for addressing simplicity
+ // (because this is used for depth -> depth transfers, where
+ // swapping isn't needed).
+ // Convert samples to pixels.
+ assert_true(key.host_depth_source_msaa_samples ==
+ xenos::MsaaSamples::k1X);
+ spv::Id dest_tile_sample_x = dest_tile_pixel_x;
+ spv::Id dest_tile_sample_y = dest_tile_pixel_y;
+ if (key.dest_msaa_samples >= xenos::MsaaSamples::k2X) {
+ if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) {
+ // Horizontal sample index in bit 0.
+ id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + dest_tile_sample_x = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + // Vertical sample index as 1 or 0 in bit 0 for true 2x or as 0 + // or 1 in bit 1 for 4x or for 2x emulated as 4x. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + (key.dest_msaa_samples == xenos::MsaaSamples::k2X && + msaa_2x_attachments_supported_) + ? spv::OpBitwiseXor + : spv::OpShiftRightLogical, + type_uint, dest_sample_id, builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + dest_tile_sample_y = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + // Combine the tile sample index and the tile index. + spv::Id host_depth_offset = builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.makeUintConstant(tile_width_samples_scaled * + tile_height_samples_scaled), + dest_tile_index), + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.makeUintConstant(tile_width_samples_scaled), + dest_tile_sample_y), + dest_tile_sample_x)); + id_vector_temp.clear(); + id_vector_temp.reserve(2); + // The only SSBO structure member. + id_vector_temp.push_back(builder.makeIntConstant(0)); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpBitcast, type_int, host_depth_offset)); + // StorageBuffer since SPIR-V 1.3, but since SPIR-V 1.0 is + // generated, it's Uniform. + host_depth32 = builder.createUnaryOp( + spv::OpBitcast, type_float, + builder.createLoad( + builder.createAccessChain(spv::StorageClassUniform, + host_depth_source_buffer, + id_vector_temp), + spv::NoPrecision)); + } + spv::Block* depth24_to_depth32_header = builder.getBuildPoint(); + spv::Id depth24_to_depth32_convert_id = spv::NoResult; + spv::Block* depth24_to_depth32_merge = nullptr; + spv::Id host_depth24 = spv::NoResult; + if (host_depth32 != spv::NoResult) { + // Convert the host depth value to the guest format and check if it + // matches the value in the currently owning guest render target. + switch (dest_depth_format) { + case xenos::DepthRenderTargetFormat::kD24S8: { + // Round to the nearest even integer. This seems to be the + // correct, adding +0.5 and rounding towards zero results in red + // instead of black in the 4D5307E6 clear shader. + id_vector_temp.clear(); + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, host_depth32, + builder.makeFloatConstant(float(0xFFFFFF)))); + host_depth24 = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450RoundEven, + id_vector_temp)); + } break; + case xenos::DepthRenderTargetFormat::kD24FS8: { + host_depth24 = SpirvShaderTranslator::PreClampedDepthTo20e4( + builder, host_depth32, true, ext_inst_glsl_std_450); + } break; + } + assert_true(host_depth24 != spv::NoResult); + // Update the header block pointer after the conversion (to avoid + // assuming that the conversion doesn't branch). 
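The branch and phi being generated below implement, in scalar terms, roughly this selection: keep the full-precision host depth only if it still round-trips to the guest 24-bit value, otherwise convert the guest value. ToGuest24 and Guest24ToFloat32 stand for the format conversions in the surrounding code; the helper itself is illustrative:

#include <cstdint>

// Sketch of the depth selection: prefer the host float32 depth when it is
// still consistent with the guest 24-bit depth.
float SelectFragmentDepth(bool host_depth_loaded, float host_depth32,
                          uint32_t guest_depth24,
                          uint32_t (*ToGuest24)(float),
                          float (*Guest24ToFloat32)(uint32_t)) {
  if (host_depth_loaded && ToGuest24(host_depth32) == guest_depth24) {
    return host_depth32;  // up to date - keep the extra host precision
  }
  return Guest24ToFloat32(guest_depth24);
}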
+ depth24_to_depth32_header = builder.getBuildPoint(); + spv::Id host_depth_outdated = builder.createBinOp( + spv::OpINotEqual, type_bool, guest_depth24, host_depth24); + spv::Block& depth24_to_depth32_convert_entry = + builder.makeNewBlock(); + { + spv::Block& depth24_to_depth32_merge_block = + builder.makeNewBlock(); + depth24_to_depth32_merge = &depth24_to_depth32_merge_block; + } + { + std::unique_ptr depth24_to_depth32_merge_op = + std::make_unique(spv::OpSelectionMerge); + depth24_to_depth32_merge_op->addIdOperand( + depth24_to_depth32_merge->getId()); + depth24_to_depth32_merge_op->addImmediateOperand( + spv::SelectionControlMaskNone); + builder.getBuildPoint()->addInstruction( + std::move(depth24_to_depth32_merge_op)); + } + builder.createConditionalBranch(host_depth_outdated, + &depth24_to_depth32_convert_entry, + depth24_to_depth32_merge); + builder.setBuildPoint(&depth24_to_depth32_convert_entry); + } + // Convert the guest 24-bit depth to float32 (in an open conditional + // if the host depth is also loaded). + spv::Id guest_depth32 = spv::NoResult; + switch (dest_depth_format) { + case xenos::DepthRenderTargetFormat::kD24S8: { + // Multiplying by 1.0 / 0xFFFFFF produces an incorrect result (for + // 0xC00000, for instance - which is 2_10_10_10 clear to 0001) - + // rescale from 0...0xFFFFFF to 0...0x1000000 doing what true + // float division followed by multiplication does (on x86-64 MSVC + // with default SSE rounding) - values starting from 0x800000 + // become bigger by 1; then accurately bias the result's exponent. + guest_depth32 = builder.createBinOp( + spv::OpFMul, type_float, + builder.createUnaryOp( + spv::OpConvertUToF, type_float, + builder.createBinOp( + spv::OpIAdd, type_uint, guest_depth24, + builder.createBinOp(spv::OpShiftRightLogical, + type_uint, guest_depth24, + builder.makeUintConstant(23)))), + builder.makeFloatConstant(1.0f / float(1 << 24))); + } break; + case xenos::DepthRenderTargetFormat::kD24FS8: { + guest_depth32 = SpirvShaderTranslator::Depth20e4To32( + builder, guest_depth24, 0, true, false, + ext_inst_glsl_std_450); + } break; + } + assert_true(guest_depth32 != spv::NoResult); + spv::Id fragment_depth32 = guest_depth32; + if (host_depth32 != spv::NoResult) { + assert_not_null(depth24_to_depth32_merge); + spv::Id depth24_to_depth32_result_block_id = + builder.getBuildPoint()->getId(); + builder.createBranch(depth24_to_depth32_merge); + builder.setBuildPoint(depth24_to_depth32_merge); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(guest_depth32); + id_vector_temp.push_back(depth24_to_depth32_result_block_id); + id_vector_temp.push_back(host_depth32); + id_vector_temp.push_back(depth24_to_depth32_header->getId()); + fragment_depth32 = + builder.createOp(spv::OpPhi, type_float, id_vector_temp); + } + builder.createStore(fragment_depth32, output_fragment_depth); + } + } break; + case TransferOutput::kStencilBit: { + if (packed) { + // Kill the sample if the needed stencil bit is not set. 
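In the stencil-bit output mode, the push-constant mask selects which guest stencil bit is being transferred, and fragments whose transferred stencil byte doesn't have that bit set are discarded with OpKill. In scalar terms, the survival test below is roughly:

#include <cstdint>

// Sketch: the fragment survives only if the stencil byte read from the source
// has the bit selected by the push-constant mask set.
bool StencilBitFragmentSurvives(uint32_t packed_stencil,
                                uint32_t stencil_bit_mask) {
  return (packed_stencil & stencil_bit_mask) != 0;
}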
+ assert_true(push_constants_member_stencil_mask != UINT32_MAX); + id_vector_temp.clear(); + id_vector_temp.push_back(builder.makeIntConstant( + int32_t(push_constants_member_stencil_mask))); + spv::Id stencil_mask_constant = builder.createLoad( + builder.createAccessChain(spv::StorageClassPushConstant, + push_constants, id_vector_temp), + spv::NoPrecision); + spv::Id stencil_sample_passed = builder.createBinOp( + spv::OpINotEqual, type_bool, + builder.createBinOp(spv::OpBitwiseAnd, type_uint, packed, + stencil_mask_constant), + builder.makeUintConstant(0)); + spv::Block& stencil_bit_kill_block = builder.makeNewBlock(); + spv::Block& stencil_bit_merge_block = builder.makeNewBlock(); + { + std::unique_ptr stencil_bit_merge_op = + std::make_unique(spv::OpSelectionMerge); + stencil_bit_merge_op->addIdOperand(stencil_bit_merge_block.getId()); + stencil_bit_merge_op->addImmediateOperand( + spv::SelectionControlMaskNone); + builder.getBuildPoint()->addInstruction( + std::move(stencil_bit_merge_op)); + } + builder.createConditionalBranch(stencil_sample_passed, + &stencil_bit_merge_block, + &stencil_bit_kill_block); + builder.setBuildPoint(&stencil_bit_kill_block); + builder.createNoResultOp(spv::OpKill); + builder.setBuildPoint(&stencil_bit_merge_block); + } + } break; + } + } + + // End the main function and make it the entry point. + builder.leaveFunction(); + builder.addExecutionMode(main_function, spv::ExecutionModeOriginUpperLeft); + if (output_fragment_depth != spv::NoResult) { + builder.addExecutionMode(main_function, spv::ExecutionModeDepthReplacing); + } + if (output_fragment_stencil_ref != spv::NoResult) { + builder.addExecutionMode(main_function, + spv::ExecutionModeStencilRefReplacingEXT); + } + spv::Instruction* entry_point = + builder.addEntryPoint(spv::ExecutionModelFragment, main_function, "main"); + for (spv::Id interface_id : main_interface) { + entry_point->addIdOperand(interface_id); + } + + // Serialize the shader code. + std::vector shader_code; + builder.dump(shader_code); + + // Create the shader module, and store the handle even if creation fails not + // to try to create it again later. + VkShaderModule shader_module = ui::vulkan::util::CreateShaderModule( + provider, reinterpret_cast(shader_code.data()), + sizeof(uint32_t) * shader_code.size()); + if (shader_module == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target ownership " + "transfer shader 0x{:08X}", + key.key); + } + transfer_shaders_.emplace(key, shader_module); + return shader_module; +} + +VkPipeline const* VulkanRenderTargetCache::GetTransferPipelines( + TransferPipelineKey key) { + auto pipeline_it = transfer_pipelines_.find(key); + if (pipeline_it != transfer_pipelines_.end()) { + return pipeline_it->second[0] != VK_NULL_HANDLE ? 
pipeline_it->second.data() + : nullptr; + } + + VkRenderPass render_pass = GetRenderPass(key.render_pass_key); + VkShaderModule fragment_shader_module = GetTransferShader(key.shader_key); + if (render_pass == VK_NULL_HANDLE || + fragment_shader_module == VK_NULL_HANDLE) { + transfer_pipelines_.emplace(key, std::array{}); + return nullptr; + } + + const TransferModeInfo& mode = kTransferModes[size_t(key.shader_key.mode)]; + + const ui::vulkan::VulkanProvider& provider = + command_processor_.GetVulkanProvider(); + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + const VkPhysicalDeviceFeatures& device_features = provider.device_features(); + + uint32_t dest_sample_count = uint32_t(1) + << uint32_t(key.shader_key.dest_msaa_samples); + bool dest_is_masked_sample = + dest_sample_count > 1 && !device_features.sampleRateShading; + + VkPipelineShaderStageCreateInfo shader_stages[2]; + shader_stages[0].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shader_stages[0].pNext = nullptr; + shader_stages[0].flags = 0; + shader_stages[0].stage = VK_SHADER_STAGE_VERTEX_BIT; + shader_stages[0].module = transfer_passthrough_vertex_shader_; + shader_stages[0].pName = "main"; + shader_stages[0].pSpecializationInfo = nullptr; + shader_stages[1].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shader_stages[1].pNext = nullptr; + shader_stages[1].flags = 0; + shader_stages[1].stage = VK_SHADER_STAGE_FRAGMENT_BIT; + shader_stages[1].module = fragment_shader_module; + shader_stages[1].pName = "main"; + shader_stages[1].pSpecializationInfo = nullptr; + VkSpecializationMapEntry sample_id_specialization_map_entry; + uint32_t sample_id_specialization_constant; + VkSpecializationInfo sample_id_specialization_info; + if (dest_is_masked_sample) { + sample_id_specialization_map_entry.constantID = 0; + sample_id_specialization_map_entry.offset = 0; + sample_id_specialization_map_entry.size = sizeof(uint32_t); + sample_id_specialization_constant = 0; + sample_id_specialization_info.mapEntryCount = 1; + sample_id_specialization_info.pMapEntries = + &sample_id_specialization_map_entry; + sample_id_specialization_info.dataSize = + sizeof(sample_id_specialization_constant); + sample_id_specialization_info.pData = &sample_id_specialization_constant; + shader_stages[1].pSpecializationInfo = &sample_id_specialization_info; + } + + VkVertexInputBindingDescription vertex_input_binding; + vertex_input_binding.binding = 0; + vertex_input_binding.stride = sizeof(float) * 2; + vertex_input_binding.inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + VkVertexInputAttributeDescription vertex_input_attribute; + vertex_input_attribute.location = 0; + vertex_input_attribute.binding = 0; + vertex_input_attribute.format = VK_FORMAT_R32G32_SFLOAT; + vertex_input_attribute.offset = 0; + VkPipelineVertexInputStateCreateInfo vertex_input_state; + vertex_input_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + vertex_input_state.pNext = nullptr; + vertex_input_state.flags = 0; + vertex_input_state.vertexBindingDescriptionCount = 1; + vertex_input_state.pVertexBindingDescriptions = &vertex_input_binding; + vertex_input_state.vertexAttributeDescriptionCount = 1; + vertex_input_state.pVertexAttributeDescriptions = &vertex_input_attribute; + + VkPipelineInputAssemblyStateCreateInfo input_assembly_state; + input_assembly_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + input_assembly_state.pNext = nullptr; + 
input_assembly_state.flags = 0; + input_assembly_state.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + input_assembly_state.primitiveRestartEnable = VK_FALSE; + + // Dynamic, to stay within maxViewportDimensions while preferring a + // power-of-two factor for converting from pixel coordinates to NDC for exact + // precision. + VkPipelineViewportStateCreateInfo viewport_state; + viewport_state.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewport_state.pNext = nullptr; + viewport_state.flags = 0; + viewport_state.viewportCount = 1; + viewport_state.pViewports = nullptr; + viewport_state.scissorCount = 1; + viewport_state.pScissors = nullptr; + + VkPipelineRasterizationStateCreateInfo rasterization_state = {}; + rasterization_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rasterization_state.polygonMode = VK_POLYGON_MODE_FILL; + rasterization_state.cullMode = VK_CULL_MODE_NONE; + rasterization_state.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE; + rasterization_state.lineWidth = 1.0f; + + // For samples other than the first, will be changed for the pipelines for + // other samples. + VkSampleMask sample_mask = UINT32_MAX; + VkPipelineMultisampleStateCreateInfo multisample_state = {}; + multisample_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + multisample_state.rasterizationSamples = + (dest_sample_count == 2 && !msaa_2x_attachments_supported_) + ? VK_SAMPLE_COUNT_4_BIT + : VkSampleCountFlagBits(dest_sample_count); + if (dest_sample_count > 1) { + if (device_features.sampleRateShading) { + multisample_state.sampleShadingEnable = VK_TRUE; + multisample_state.minSampleShading = 1.0f; + if (dest_sample_count == 2 && !msaa_2x_attachments_supported_) { + // Emulating 2x MSAA as samples 0 and 3 of 4x MSAA when 2x is not + // supported. + sample_mask = 0b1001; + } + } else { + sample_mask = 0b1; + } + if (sample_mask != UINT32_MAX) { + multisample_state.pSampleMask = &sample_mask; + } + } + + // Whether the depth / stencil state is used depends on the presence of a + // depth attachment in the render pass - but not making assumptions about + // whether the render pass contains any specific attachments, so setting up + // valid depth / stencil state unconditionally. + VkPipelineDepthStencilStateCreateInfo depth_stencil_state = {}; + depth_stencil_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + if (mode.output == TransferOutput::kDepth) { + depth_stencil_state.depthTestEnable = VK_TRUE; + depth_stencil_state.depthWriteEnable = VK_TRUE; + depth_stencil_state.depthCompareOp = cvars::depth_transfer_not_equal_test + ? VK_COMPARE_OP_NOT_EQUAL + : VK_COMPARE_OP_ALWAYS; + } + if ((mode.output == TransferOutput::kDepth && + provider.device_extensions().ext_shader_stencil_export) || + mode.output == TransferOutput::kStencilBit) { + depth_stencil_state.stencilTestEnable = VK_TRUE; + depth_stencil_state.front.failOp = VK_STENCIL_OP_KEEP; + depth_stencil_state.front.passOp = VK_STENCIL_OP_REPLACE; + depth_stencil_state.front.depthFailOp = VK_STENCIL_OP_REPLACE; + // Using ALWAYS, not NOT_EQUAL, so depth writing is unaffected by stencil + // being different. + depth_stencil_state.front.compareOp = VK_COMPARE_OP_ALWAYS; + // Will be dynamic for stencil bit output. 
+ depth_stencil_state.front.writeMask = UINT8_MAX; + depth_stencil_state.front.reference = UINT8_MAX; + depth_stencil_state.back = depth_stencil_state.front; + } + + // Whether the color blend state is used depends on the presence of color + // attachments in the render pass - but not making assumptions about whether + // the render pass contains any specific attachments, so setting up valid + // color blend state unconditionally. + VkPipelineColorBlendAttachmentState + color_blend_attachments[xenos::kMaxColorRenderTargets] = {}; + VkPipelineColorBlendStateCreateInfo color_blend_state = {}; + color_blend_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + color_blend_state.attachmentCount = + 32 - xe::lzcnt(key.render_pass_key.depth_and_color_used >> 1); + color_blend_state.pAttachments = color_blend_attachments; + if (mode.output == TransferOutput::kColor) { + if (device_features.independentBlend) { + // State the intention more explicitly. + color_blend_attachments[key.shader_key.dest_color_rt_index] + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + } else { + // The blend state for all attachments must be identical, but other render + // targets are not written to by the shader. + for (uint32_t i = 0; i < color_blend_state.attachmentCount; ++i) { + color_blend_attachments[i].colorWriteMask = + VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + } + } + } + + std::array dynamic_states; + VkPipelineDynamicStateCreateInfo dynamic_state; + dynamic_state.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; + dynamic_state.pNext = nullptr; + dynamic_state.flags = 0; + dynamic_state.dynamicStateCount = 0; + dynamic_state.pDynamicStates = dynamic_states.data(); + dynamic_states[dynamic_state.dynamicStateCount++] = VK_DYNAMIC_STATE_VIEWPORT; + dynamic_states[dynamic_state.dynamicStateCount++] = VK_DYNAMIC_STATE_SCISSOR; + if (mode.output == TransferOutput::kStencilBit) { + dynamic_states[dynamic_state.dynamicStateCount++] = + VK_DYNAMIC_STATE_STENCIL_WRITE_MASK; + } + + std::array pipelines{}; + VkGraphicsPipelineCreateInfo pipeline_create_info; + pipeline_create_info.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pipeline_create_info.pNext = nullptr; + pipeline_create_info.flags = 0; + if (dest_is_masked_sample) { + pipeline_create_info.flags |= VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT; + } + pipeline_create_info.stageCount = uint32_t(xe::countof(shader_stages)); + pipeline_create_info.pStages = shader_stages; + pipeline_create_info.pVertexInputState = &vertex_input_state; + pipeline_create_info.pInputAssemblyState = &input_assembly_state; + pipeline_create_info.pTessellationState = nullptr; + pipeline_create_info.pViewportState = &viewport_state; + pipeline_create_info.pRasterizationState = &rasterization_state; + pipeline_create_info.pMultisampleState = &multisample_state; + pipeline_create_info.pDepthStencilState = &depth_stencil_state; + pipeline_create_info.pColorBlendState = &color_blend_state; + pipeline_create_info.pDynamicState = &dynamic_state; + pipeline_create_info.layout = + transfer_pipeline_layouts_[size_t(mode.pipeline_layout)]; + pipeline_create_info.renderPass = render_pass; + pipeline_create_info.subpass = 0; + pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE; + pipeline_create_info.basePipelineIndex = -1; + if (dfn.vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, + 
&pipeline_create_info, nullptr, + &pipelines[0]) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target ownership " + "transfer pipeline for render pass 0x{:08X}, shader 0x{:08X}", + key.render_pass_key.key, key.shader_key.key); + transfer_pipelines_.emplace(key, std::array{}); + return nullptr; + } + if (dest_is_masked_sample) { + assert_true(multisample_state.pSampleMask == &sample_mask); + pipeline_create_info.flags = (pipeline_create_info.flags & + ~VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT) | + VK_PIPELINE_CREATE_DERIVATIVE_BIT; + pipeline_create_info.basePipelineHandle = pipelines[0]; + for (uint32_t i = 1; i < dest_sample_count; ++i) { + // Emulating 2x MSAA as samples 0 and 3 of 4x MSAA when 2x is not + // supported. + uint32_t host_sample_index = + (dest_sample_count == 2 && !msaa_2x_attachments_supported_ && i == 1) + ? 3 + : i; + sample_id_specialization_constant = host_sample_index; + sample_mask = uint32_t(1) << host_sample_index; + if (dfn.vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, + &pipeline_create_info, nullptr, + &pipelines[i]) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target " + "ownership transfer pipeline for render pass 0x{:08X}, shader " + "0x{:08X}, sample {}", + key.render_pass_key.key, key.shader_key.key, i); + for (uint32_t j = 0; j < i; ++j) { + dfn.vkDestroyPipeline(device, pipelines[j], nullptr); + } + transfer_pipelines_.emplace(key, std::array{}); + return nullptr; + } + } + } + return transfer_pipelines_.emplace(key, pipelines).first->second.data(); +} + +void VulkanRenderTargetCache::PerformTransfersAndResolveClears( + uint32_t render_target_count, RenderTarget* const* render_targets, + const std::vector* render_target_transfers, + const uint64_t* render_target_resolve_clear_values, + const Transfer::Rectangle* resolve_clear_rectangle) { + assert_true(GetPath() == Path::kHostRenderTargets); + + const ui::vulkan::VulkanProvider& provider = + command_processor_.GetVulkanProvider(); + const VkPhysicalDeviceLimits& device_limits = + provider.device_properties().limits; + const VkPhysicalDeviceFeatures& device_features = provider.device_features(); + bool shader_stencil_export = + provider.device_extensions().ext_shader_stencil_export; + uint64_t current_submission = command_processor_.GetCurrentSubmission(); + DeferredCommandBuffer& command_buffer = + command_processor_.deferred_command_buffer(); + + bool resolve_clear_needed = + render_target_resolve_clear_values && resolve_clear_rectangle; + VkClearRect resolve_clear_rect; + if (resolve_clear_needed) { + // Assuming the rectangle is already clamped by the setup function from the + // common render target cache. + resolve_clear_rect.rect.offset.x = + int32_t(resolve_clear_rectangle->x_pixels * resolution_scale_x_); + resolve_clear_rect.rect.offset.y = + int32_t(resolve_clear_rectangle->y_pixels * resolution_scale_y_); + resolve_clear_rect.rect.extent.width = + resolve_clear_rectangle->width_pixels * resolution_scale_x_; + resolve_clear_rect.rect.extent.height = + resolve_clear_rectangle->height_pixels * resolution_scale_y_; + resolve_clear_rect.baseArrayLayer = 0; + resolve_clear_rect.layerCount = 1; + } + + // Do host depth storing for the depth destination (assuming there can be only + // one depth destination) where depth destination == host depth source. 
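The host depth copy in the EDRAM buffer is laid out as whole tiles of raw samples, matching the indexing the transfer fragment shader builds above. A minimal CPU-side model of that store, under the assumption that the compute pass writes one 32-bit depth value per sample (all names here are hypothetical, not part of this change):

#include <cstdint>
#include <cstring>
#include <vector>

// CPU-side model of host depth storing: tiles are laid out linearly, samples
// row-major within a tile - the same addressing the transfer fragment shader
// above uses to read the value back.
void StoreHostDepthSample(std::vector<uint32_t>& host_depth_copy,
                          uint32_t tile_index, uint32_t tile_width_samples,
                          uint32_t tile_height_samples, uint32_t sample_x,
                          uint32_t sample_y, float host_depth32) {
  uint32_t offset = tile_index * tile_width_samples * tile_height_samples +
                    sample_y * tile_width_samples + sample_x;
  uint32_t bits;
  std::memcpy(&bits, &host_depth32, sizeof(bits));  // Keep exact float32 bits.
  host_depth_copy[offset] = bits;
}

Keeping the full float32 precision here is what lets depth survive round trips through the 24-bit guest formats: the loop below snapshots it before EDRAM range ownership changes, and the transfer shader above falls back to the stored value only while the guest 24-bit depth still matches its re-encoded form.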
+ bool host_depth_store_set_up = false; + for (uint32_t i = 0; i < render_target_count; ++i) { + RenderTarget* dest_rt = render_targets[i]; + if (!dest_rt) { + continue; + } + auto& dest_vulkan_rt = *static_cast(dest_rt); + RenderTargetKey dest_rt_key = dest_vulkan_rt.key(); + if (!dest_rt_key.is_depth) { + continue; + } + const std::vector& depth_transfers = render_target_transfers[i]; + for (const Transfer& transfer : depth_transfers) { + if (transfer.host_depth_source != dest_rt) { + continue; + } + if (!host_depth_store_set_up) { + // Pipeline. + command_processor_.BindExternalComputePipeline( + host_depth_store_pipelines_[size_t(dest_rt_key.msaa_samples)]); + // Descriptor set bindings. + VkDescriptorSet host_depth_store_descriptor_sets[] = { + edram_storage_buffer_descriptor_set_, + dest_vulkan_rt.GetDescriptorSetTransferSource(), + }; + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_COMPUTE, host_depth_store_pipeline_layout_, + 0, uint32_t(xe::countof(host_depth_store_descriptor_sets)), + host_depth_store_descriptor_sets, 0, nullptr); + // Render target constant. + HostDepthStoreRenderTargetConstant + host_depth_store_render_target_constant = + GetHostDepthStoreRenderTargetConstant( + dest_rt_key.pitch_tiles_at_32bpp, + msaa_2x_attachments_supported_); + command_buffer.CmdVkPushConstants( + host_depth_store_pipeline_layout_, VK_SHADER_STAGE_COMPUTE_BIT, + uint32_t(offsetof(HostDepthStoreConstants, render_target)), + sizeof(host_depth_store_render_target_constant), + &host_depth_store_render_target_constant); + // Barriers - don't need to try to combine them with the rest of + // render target transfer barriers now - if this happens, after host + // depth storing, SHADER_READ -> DEPTH_STENCIL_ATTACHMENT_WRITE will be + // done anyway even in the best case, so it's not possible to have all + // the barriers in one place here. + UseEdramBuffer(EdramBufferUsage::kComputeWrite); + // Always transitioning both depth and stencil, not storing separate + // usage flags for depth and stencil. 
+ command_processor_.PushImageMemoryBarrier( + dest_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT), + dest_vulkan_rt.current_stage_mask(), + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + dest_vulkan_rt.current_access_mask(), VK_ACCESS_SHADER_READ_BIT, + dest_vulkan_rt.current_layout(), + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + dest_vulkan_rt.SetUsage(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + host_depth_store_set_up = true; + } + Transfer::Rectangle + transfer_rectangles[Transfer::kMaxRectanglesWithCutout]; + uint32_t transfer_rectangle_count = transfer.GetRectangles( + dest_rt_key.base_tiles, dest_rt_key.pitch_tiles_at_32bpp, + dest_rt_key.msaa_samples, false, transfer_rectangles, + resolve_clear_rectangle); + assert_not_zero(transfer_rectangle_count); + HostDepthStoreRectangleConstant host_depth_store_rectangle_constant; + for (uint32_t j = 0; j < transfer_rectangle_count; ++j) { + uint32_t group_count_x, group_count_y; + GetHostDepthStoreRectangleInfo( + transfer_rectangles[j], dest_rt_key.msaa_samples, + host_depth_store_rectangle_constant, group_count_x, group_count_y); + command_buffer.CmdVkPushConstants( + host_depth_store_pipeline_layout_, VK_SHADER_STAGE_COMPUTE_BIT, + uint32_t(offsetof(HostDepthStoreConstants, rectangle)), + sizeof(host_depth_store_rectangle_constant), + &host_depth_store_rectangle_constant); + command_processor_.SubmitBarriers(true); + command_buffer.CmdVkDispatch(group_count_x, group_count_y, 1); + MarkEdramBufferModified(); + } + } + break; + } + + constexpr VkPipelineStageFlags kSourceStageMask = + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + constexpr VkAccessFlags kSourceAccessMask = VK_ACCESS_SHADER_READ_BIT; + constexpr VkImageLayout kSourceLayout = + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + + // Try to insert as many barriers as possible in one place, hoping that in the + // best case (no cross-copying between current render targets), barriers will + // need to be only inserted here, not between transfers. In case of + // cross-copying, if the destination use is going to happen before the source + // use, choose the destination state, otherwise the source state - to match + // the order in which transfers will actually happen (otherwise there will be + // just a useless switch back and forth). + for (uint32_t i = 0; i < render_target_count; ++i) { + RenderTarget* dest_rt = render_targets[i]; + if (!dest_rt) { + continue; + } + const std::vector& dest_transfers = render_target_transfers[i]; + if (!resolve_clear_needed && dest_transfers.empty()) { + continue; + } + // Transition the destination, only if not going to be used as a source + // earlier. + bool dest_used_previously_as_source = false; + for (uint32_t j = 0; j < i; ++j) { + for (const Transfer& previous_transfer : render_target_transfers[j]) { + if (previous_transfer.source == dest_rt || + previous_transfer.host_depth_source == dest_rt) { + dest_used_previously_as_source = true; + break; + } + } + } + if (!dest_used_previously_as_source) { + auto& dest_vulkan_rt = *static_cast(dest_rt); + VkPipelineStageFlags dest_dst_stage_mask; + VkAccessFlags dest_dst_access_mask; + VkImageLayout dest_new_layout; + dest_vulkan_rt.GetDrawUsage(&dest_dst_stage_mask, &dest_dst_access_mask, + &dest_new_layout); + command_processor_.PushImageMemoryBarrier( + dest_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + dest_vulkan_rt.key().is_depth + ? 
(VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) + : VK_IMAGE_ASPECT_COLOR_BIT), + dest_vulkan_rt.current_stage_mask(), dest_dst_stage_mask, + dest_vulkan_rt.current_access_mask(), dest_dst_access_mask, + dest_vulkan_rt.current_layout(), dest_new_layout); + dest_vulkan_rt.SetUsage(dest_dst_stage_mask, dest_dst_access_mask, + dest_new_layout); + } + // Transition the sources, only if not going to be used as destinations + // earlier. + for (const Transfer& transfer : dest_transfers) { + bool source_previously_used_as_dest = false; + bool host_depth_source_previously_used_as_dest = false; + for (uint32_t j = 0; j < i; ++j) { + if (render_target_transfers[j].empty()) { + continue; + } + const RenderTarget* previous_rt = render_targets[j]; + if (transfer.source == previous_rt) { + source_previously_used_as_dest = true; + } + if (transfer.host_depth_source == previous_rt) { + host_depth_source_previously_used_as_dest = true; + } + } + if (!source_previously_used_as_dest) { + auto& source_vulkan_rt = + *static_cast(transfer.source); + command_processor_.PushImageMemoryBarrier( + source_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + source_vulkan_rt.key().is_depth + ? (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) + : VK_IMAGE_ASPECT_COLOR_BIT), + source_vulkan_rt.current_stage_mask(), kSourceStageMask, + source_vulkan_rt.current_access_mask(), kSourceAccessMask, + source_vulkan_rt.current_layout(), kSourceLayout); + source_vulkan_rt.SetUsage(kSourceStageMask, kSourceAccessMask, + kSourceLayout); + } + // transfer.host_depth_source == dest_rt means the EDRAM buffer will be + // used instead, no need to transition. + if (transfer.host_depth_source && transfer.host_depth_source != dest_rt && + !host_depth_source_previously_used_as_dest) { + auto& host_depth_source_vulkan_rt = + *static_cast(transfer.host_depth_source); + command_processor_.PushImageMemoryBarrier( + host_depth_source_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT), + host_depth_source_vulkan_rt.current_stage_mask(), kSourceStageMask, + host_depth_source_vulkan_rt.current_access_mask(), + kSourceAccessMask, host_depth_source_vulkan_rt.current_layout(), + kSourceLayout); + host_depth_source_vulkan_rt.SetUsage(kSourceStageMask, + kSourceAccessMask, kSourceLayout); + } + } + } + if (host_depth_store_set_up) { + // Will be reading copied host depth from the EDRAM buffer. + UseEdramBuffer(EdramBufferUsage::kFragmentRead); + } + + // Perform the transfers and clears. 
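The transfer loop below avoids redundant rebinds by remembering the last-bound pipeline layout, descriptor sets and push constants, and it locates each binding inside a layout by counting the used slots ordered before it. An illustrative sketch of that index computation, with std::popcount standing in for xe::bit_count and a hypothetical slot combination:

#include <bit>
#include <cstdint>

// Index of a descriptor set (or push constant dword) within a transfer
// pipeline layout that only contains the slots whose bits are set in
// used_bits: count the used slots ordered before the one being bound.
constexpr uint32_t BindingIndexInLayout(uint32_t used_bits,
                                        uint32_t binding_bit) {
  return uint32_t(std::popcount(used_bits & (binding_bit - 1)));
}
// For instance, in a layout using only slots 0 and 2 (a hypothetical
// combination), slot 2 is bound at index 1:
static_assert(BindingIndexInLayout(0b101, 0b100) == 1);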
+ + TransferPipelineLayoutIndex last_transfer_pipeline_layout_index = + TransferPipelineLayoutIndex::kCount; + uint32_t transfer_descriptor_sets_bound = 0; + uint32_t transfer_push_constants_set = 0; + VkDescriptorSet last_descriptor_set_host_depth_stencil_textures = + VK_NULL_HANDLE; + VkDescriptorSet last_descriptor_set_depth_stencil_textures = VK_NULL_HANDLE; + VkDescriptorSet last_descriptor_set_color_texture = VK_NULL_HANDLE; + TransferAddressConstant last_host_depth_address_constant; + TransferAddressConstant last_address_constant; + + for (uint32_t i = 0; i < render_target_count; ++i) { + RenderTarget* dest_rt = render_targets[i]; + if (!dest_rt) { + continue; + } + + const std::vector& current_transfers = render_target_transfers[i]; + if (current_transfers.empty() && !resolve_clear_needed) { + continue; + } + + auto& dest_vulkan_rt = *static_cast(dest_rt); + RenderTargetKey dest_rt_key = dest_vulkan_rt.key(); + + // Late barriers in case there was cross-copying that prevented merging of + // barriers. + { + VkPipelineStageFlags dest_dst_stage_mask; + VkAccessFlags dest_dst_access_mask; + VkImageLayout dest_new_layout; + dest_vulkan_rt.GetDrawUsage(&dest_dst_stage_mask, &dest_dst_access_mask, + &dest_new_layout); + command_processor_.PushImageMemoryBarrier( + dest_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + dest_rt_key.is_depth + ? (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) + : VK_IMAGE_ASPECT_COLOR_BIT), + dest_vulkan_rt.current_stage_mask(), dest_dst_stage_mask, + dest_vulkan_rt.current_access_mask(), dest_dst_access_mask, + dest_vulkan_rt.current_layout(), dest_new_layout); + dest_vulkan_rt.SetUsage(dest_dst_stage_mask, dest_dst_access_mask, + dest_new_layout); + } + + // Get the objects needed for transfers to the destination. + // TODO(Triang3l): Reuse the guest render pass for transfers where possible + // (if the Vulkan format used for drawing is also usable for transfers - for + // instance, R8G8B8A8_UNORM can be used for both, so the guest pass can be + // reused, but R16G16B16A16_SFLOAT render targets use R16G16B16A16_UINT for + // transfers, so the transfer pass has to be separate) to avoid stores and + // loads on tile-based devices to make this actually applicable. Also + // overall perform all non-cross-copying transfers for the current + // framebuffer configuration in a single pass, to load / store only once. + RenderPassKey transfer_render_pass_key; + transfer_render_pass_key.msaa_samples = dest_rt_key.msaa_samples; + if (dest_rt_key.is_depth) { + transfer_render_pass_key.depth_and_color_used = 0b1; + transfer_render_pass_key.depth_format = dest_rt_key.GetDepthFormat(); + } else { + transfer_render_pass_key.depth_and_color_used = 0b1 << 1; + transfer_render_pass_key.color_0_view_format = + dest_rt_key.GetColorFormat(); + transfer_render_pass_key.color_rts_use_transfer_formats = 1; + } + VkRenderPass transfer_render_pass = GetRenderPass(transfer_render_pass_key); + if (transfer_render_pass == VK_NULL_HANDLE) { + continue; + } + const RenderTarget* + transfer_framebuffer_render_targets[1 + xenos::kMaxColorRenderTargets] = + {}; + transfer_framebuffer_render_targets[dest_rt_key.is_depth ? 0 : 1] = dest_rt; + const Framebuffer* transfer_framebuffer = GetFramebuffer( + transfer_render_pass_key, dest_rt_key.pitch_tiles_at_32bpp, + transfer_framebuffer_render_targets); + if (!transfer_framebuffer) { + continue; + } + // Don't enter the render pass immediately - may still insert source + // barriers later. 
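When the destination is a depth render target and VK_EXT_shader_stencil_export is unavailable, the block below also queues one extra invocation per transfer for the stencil plane: the destination stencil is cleared to zero over the transfer rectangles, then each bit is written by its own masked draw whose fragment shader kills samples where the source bit is clear. A rough model of that per-bit sequence (command recording omitted; this is not the actual call order in this change):

// After the stencil clear to 0, one draw per bit: the pipeline replaces
// stencil with the 0xFF reference, the dynamic write mask limits the write to
// a single bit, and the kill in the fragment shader leaves unset bits at 0.
for (uint32_t bit = 0; bit < 8; ++bit) {
  const uint32_t stencil_write_mask = uint32_t(1) << bit;
  // 1. Push stencil_write_mask as the stencil mask push constant read by the
  //    transfer fragment shader (it discards samples where the bit is 0).
  // 2. vkCmdSetStencilWriteMask(cmd, VK_STENCIL_FACE_FRONT_AND_BACK,
  //    stencil_write_mask).
  // 3. Draw the transfer rectangles.
  (void)stencil_write_mask;
}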
+ + if (!current_transfers.empty()) { + uint32_t dest_pitch_tiles = dest_rt_key.GetPitchTiles(); + bool dest_is_64bpp = dest_rt_key.Is64bpp(); + + // Gather shader keys and sort to reduce pipeline state and binding + // switches. Also gather stencil rectangles to clear if needed. + bool need_stencil_bit_draws = + dest_rt_key.is_depth && !shader_stencil_export; + current_transfer_invocations_.clear(); + current_transfer_invocations_.reserve( + current_transfers.size() << uint32_t(need_stencil_bit_draws)); + uint32_t rt_sort_index = 0; + TransferShaderKey new_transfer_shader_key; + new_transfer_shader_key.dest_msaa_samples = dest_rt_key.msaa_samples; + new_transfer_shader_key.dest_resource_format = + dest_rt_key.resource_format; + uint32_t stencil_clear_rectangle_count = 0; + for (uint32_t j = 0; j <= uint32_t(need_stencil_bit_draws); ++j) { + // j == 0 - color or depth. + // j == 1 - stencil bits. + // Stencil bit writing always requires a different root signature, + // handle these separately. Stencil never has a host depth source. + // Clear previously set sort indices. + for (const Transfer& transfer : current_transfers) { + auto host_depth_source_vulkan_rt = + static_cast(transfer.host_depth_source); + if (host_depth_source_vulkan_rt) { + host_depth_source_vulkan_rt->SetTemporarySortIndex(UINT32_MAX); + } + assert_not_null(transfer.source); + auto& source_vulkan_rt = + *static_cast(transfer.source); + source_vulkan_rt.SetTemporarySortIndex(UINT32_MAX); + } + for (const Transfer& transfer : current_transfers) { + assert_not_null(transfer.source); + auto& source_vulkan_rt = + *static_cast(transfer.source); + VulkanRenderTarget* host_depth_source_vulkan_rt = + j ? nullptr + : static_cast(transfer.host_depth_source); + if (host_depth_source_vulkan_rt && + host_depth_source_vulkan_rt->temporary_sort_index() == + UINT32_MAX) { + host_depth_source_vulkan_rt->SetTemporarySortIndex(rt_sort_index++); + } + if (source_vulkan_rt.temporary_sort_index() == UINT32_MAX) { + source_vulkan_rt.SetTemporarySortIndex(rt_sort_index++); + } + RenderTargetKey source_rt_key = source_vulkan_rt.key(); + new_transfer_shader_key.source_msaa_samples = + source_rt_key.msaa_samples; + new_transfer_shader_key.source_resource_format = + source_rt_key.resource_format; + bool host_depth_source_is_copy = + host_depth_source_vulkan_rt == &dest_vulkan_rt; + // The host depth copy buffer has only raw samples. + new_transfer_shader_key.host_depth_source_msaa_samples = + (host_depth_source_vulkan_rt && !host_depth_source_is_copy) + ? host_depth_source_vulkan_rt->key().msaa_samples + : xenos::MsaaSamples::k1X; + if (j) { + new_transfer_shader_key.mode = + source_rt_key.is_depth ? TransferMode::kDepthToStencilBit + : TransferMode::kColorToStencilBit; + stencil_clear_rectangle_count += + transfer.GetRectangles(dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, + nullptr, resolve_clear_rectangle); + } else { + if (dest_rt_key.is_depth) { + if (host_depth_source_vulkan_rt) { + if (host_depth_source_is_copy) { + new_transfer_shader_key.mode = + source_rt_key.is_depth + ? TransferMode::kDepthAndHostDepthCopyToDepth + : TransferMode::kColorAndHostDepthCopyToDepth; + } else { + new_transfer_shader_key.mode = + source_rt_key.is_depth + ? TransferMode::kDepthAndHostDepthToDepth + : TransferMode::kColorAndHostDepthToDepth; + } + } else { + new_transfer_shader_key.mode = + source_rt_key.is_depth ? 
TransferMode::kDepthToDepth + : TransferMode::kColorToDepth; + } + } else { + new_transfer_shader_key.mode = source_rt_key.is_depth + ? TransferMode::kDepthToColor + : TransferMode::kColorToColor; + } + } + current_transfer_invocations_.emplace_back(transfer, + new_transfer_shader_key); + if (j) { + current_transfer_invocations_.back().transfer.host_depth_source = + nullptr; + } + } + } + std::sort(current_transfer_invocations_.begin(), + current_transfer_invocations_.end()); + + for (auto it = current_transfer_invocations_.cbegin(); + it != current_transfer_invocations_.cend(); ++it) { + assert_not_null(it->transfer.source); + auto& source_vulkan_rt = + *static_cast(it->transfer.source); + command_processor_.PushImageMemoryBarrier( + source_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + source_vulkan_rt.key().is_depth + ? (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) + : VK_IMAGE_ASPECT_COLOR_BIT), + source_vulkan_rt.current_stage_mask(), kSourceStageMask, + source_vulkan_rt.current_access_mask(), kSourceAccessMask, + source_vulkan_rt.current_layout(), kSourceLayout); + source_vulkan_rt.SetUsage(kSourceStageMask, kSourceAccessMask, + kSourceLayout); + auto host_depth_source_vulkan_rt = + static_cast(it->transfer.host_depth_source); + if (host_depth_source_vulkan_rt) { + TransferShaderKey transfer_shader_key = it->shader_key; + if (transfer_shader_key.mode == + TransferMode::kDepthAndHostDepthCopyToDepth || + transfer_shader_key.mode == + TransferMode::kColorAndHostDepthCopyToDepth) { + // Reading copied host depth from the EDRAM buffer. + UseEdramBuffer(EdramBufferUsage::kFragmentRead); + } else { + // Reading host depth from the texture. + command_processor_.PushImageMemoryBarrier( + host_depth_source_vulkan_rt->image(), + ui::vulkan::util::InitializeSubresourceRange( + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT), + host_depth_source_vulkan_rt->current_stage_mask(), + kSourceStageMask, + host_depth_source_vulkan_rt->current_access_mask(), + kSourceAccessMask, + host_depth_source_vulkan_rt->current_layout(), kSourceLayout); + host_depth_source_vulkan_rt->SetUsage( + kSourceStageMask, kSourceAccessMask, kSourceLayout); + } + } + } + + // Perform the transfers for the render target. 
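The viewport configured just below is sized to the next power of two of the framebuffer extent so that the pixel-to-NDC factor is itself an exact power of two and integer pixel coordinates convert without rounding error. A worked example, assuming an illustrative framebuffer 1280 host pixels wide (next_pow2(1280) = 2048):

constexpr float kViewportWidth = 2048.0f;
constexpr float kPixelsToNdcX = 2.0f / kViewportWidth;          // exactly 2^-10
constexpr float kLeftEdgeNdc = -1.0f + 640.0f * kPixelsToNdcX;  // exactly -0.375
static_assert(kPixelsToNdcX == 0.0009765625f);
static_assert(kLeftEdgeNdc == -0.375f);

Every integer pixel coordinate up to the viewport width is a multiple of 2^-10 after scaling and needs at most 11 significand bits after the -1 bias, so transfer rectangle corners stay sample-exact in NDC.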
+ + command_processor_.SubmitBarriersAndEnterRenderTargetCacheRenderPass( + transfer_render_pass, transfer_framebuffer); + + if (stencil_clear_rectangle_count) { + VkClearAttachment* stencil_clear_attachment; + VkClearRect* stencil_clear_rect_write_ptr; + command_buffer.CmdClearAttachmentsEmplace(1, stencil_clear_attachment, + stencil_clear_rectangle_count, + stencil_clear_rect_write_ptr); + stencil_clear_attachment->aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT; + stencil_clear_attachment->colorAttachment = 0; + stencil_clear_attachment->clearValue.depthStencil.depth = 0.0f; + stencil_clear_attachment->clearValue.depthStencil.stencil = 0; + for (const Transfer& transfer : current_transfers) { + Transfer::Rectangle transfer_stencil_clear_rectangles + [Transfer::kMaxRectanglesWithCutout]; + uint32_t transfer_stencil_clear_rectangle_count = + transfer.GetRectangles(dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, + transfer_stencil_clear_rectangles, + resolve_clear_rectangle); + for (uint32_t j = 0; j < transfer_stencil_clear_rectangle_count; + ++j) { + const Transfer::Rectangle& stencil_clear_rectangle = + transfer_stencil_clear_rectangles[j]; + stencil_clear_rect_write_ptr->rect.offset.x = + int32_t(stencil_clear_rectangle.x_pixels * resolution_scale_x_); + stencil_clear_rect_write_ptr->rect.offset.y = + int32_t(stencil_clear_rectangle.y_pixels * resolution_scale_y_); + stencil_clear_rect_write_ptr->rect.extent.width = + stencil_clear_rectangle.width_pixels * resolution_scale_x_; + stencil_clear_rect_write_ptr->rect.extent.height = + stencil_clear_rectangle.height_pixels * resolution_scale_y_; + stencil_clear_rect_write_ptr->baseArrayLayer = 0; + stencil_clear_rect_write_ptr->layerCount = 1; + ++stencil_clear_rect_write_ptr; + } + } + } + + // Prefer power of two viewports for exact division by simply biasing the + // exponent. + VkViewport transfer_viewport; + transfer_viewport.x = 0.0f; + transfer_viewport.y = 0.0f; + transfer_viewport.width = + float(std::min(xe::next_pow2(transfer_framebuffer->host_extent.width), + device_limits.maxViewportDimensions[0])); + transfer_viewport.height = float( + std::min(xe::next_pow2(transfer_framebuffer->host_extent.height), + device_limits.maxViewportDimensions[1])); + transfer_viewport.minDepth = 0.0f; + transfer_viewport.maxDepth = 1.0f; + command_processor_.SetViewport(transfer_viewport); + float pixels_to_ndc_x = 2.0f / transfer_viewport.width; + float pixels_to_ndc_y = 2.0f / transfer_viewport.height; + VkRect2D transfer_scissor; + transfer_scissor.offset.x = 0; + transfer_scissor.offset.y = 0; + transfer_scissor.extent = transfer_framebuffer->host_extent; + command_processor_.SetScissor(transfer_scissor); + + for (auto it = current_transfer_invocations_.cbegin(); + it != current_transfer_invocations_.cend(); ++it) { + const TransferInvocation& transfer_invocation_first = *it; + // Will be merging transfers from the same source into one mesh. 
+ auto it_merged_first = it, it_merged_last = it; + uint32_t transfer_rectangle_count = + transfer_invocation_first.transfer.GetRectangles( + dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, nullptr, + resolve_clear_rectangle); + for (auto it_merge = std::next(it_merged_first); + it_merge != current_transfer_invocations_.cend(); ++it_merge) { + if (!transfer_invocation_first.CanBeMergedIntoOneDraw(*it_merge)) { + break; + } + transfer_rectangle_count += it_merge->transfer.GetRectangles( + dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, nullptr, + resolve_clear_rectangle); + it_merged_last = it_merge; + } + assert_not_zero(transfer_rectangle_count); + // Skip the merged transfers in the subsequent iterations. + it = it_merged_last; + + assert_not_null(it->transfer.source); + auto& source_vulkan_rt = + *static_cast(it->transfer.source); + auto host_depth_source_vulkan_rt = + static_cast(it->transfer.host_depth_source); + TransferShaderKey transfer_shader_key = it->shader_key; + const TransferModeInfo& transfer_mode_info = + kTransferModes[size_t(transfer_shader_key.mode)]; + TransferPipelineLayoutIndex transfer_pipeline_layout_index = + transfer_mode_info.pipeline_layout; + const TransferPipelineLayoutInfo& transfer_pipeline_layout_info = + kTransferPipelineLayoutInfos[size_t( + transfer_pipeline_layout_index)]; + uint32_t transfer_sample_pipeline_count = + device_features.sampleRateShading + ? 1 + : uint32_t(1) << uint32_t(dest_rt_key.msaa_samples); + bool transfer_is_stencil_bit = + (transfer_pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordStencilMaskBit) != 0; + + uint32_t transfer_vertex_count = 6 * transfer_rectangle_count; + VkBuffer transfer_vertex_buffer; + VkDeviceSize transfer_vertex_buffer_offset; + float* transfer_rectangle_write_ptr = + reinterpret_cast(transfer_vertex_buffer_pool_->Request( + current_submission, sizeof(float) * 2 * transfer_vertex_count, + sizeof(float), transfer_vertex_buffer, + transfer_vertex_buffer_offset)); + if (!transfer_rectangle_write_ptr) { + continue; + } + for (auto it_merged = it_merged_first; it_merged <= it_merged_last; + ++it_merged) { + Transfer::Rectangle transfer_invocation_rectangles + [Transfer::kMaxRectanglesWithCutout]; + uint32_t transfer_invocation_rectangle_count = + it_merged->transfer.GetRectangles( + dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, + transfer_invocation_rectangles, resolve_clear_rectangle); + assert_not_zero(transfer_invocation_rectangle_count); + for (uint32_t j = 0; j < transfer_invocation_rectangle_count; ++j) { + const Transfer::Rectangle& transfer_rectangle = + transfer_invocation_rectangles[j]; + float transfer_rectangle_x0 = + -1.0f + transfer_rectangle.x_pixels * pixels_to_ndc_x; + float transfer_rectangle_y0 = + -1.0f + transfer_rectangle.y_pixels * pixels_to_ndc_y; + float transfer_rectangle_x1 = + transfer_rectangle_x0 + + transfer_rectangle.width_pixels * pixels_to_ndc_x; + float transfer_rectangle_y1 = + transfer_rectangle_y0 + + transfer_rectangle.height_pixels * pixels_to_ndc_y; + // O-* + // |/ + // * + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x0; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y0; + // *-* + // |/ + // O + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x0; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y1; + // *-O + // |/ + // * + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x1; + 
*(transfer_rectangle_write_ptr++) = transfer_rectangle_y0; + // O + // /| + // *-* + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x1; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y0; + // * + // /| + // O-* + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x0; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y1; + // * + // /| + // *-O + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x1; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y1; + } + } + command_buffer.CmdVkBindVertexBuffers(0, 1, &transfer_vertex_buffer, + &transfer_vertex_buffer_offset); + + const VkPipeline* transfer_pipelines = GetTransferPipelines( + TransferPipelineKey(transfer_render_pass_key, transfer_shader_key)); + if (!transfer_pipelines) { + continue; + } + command_processor_.BindExternalGraphicsPipeline(transfer_pipelines[0]); + if (last_transfer_pipeline_layout_index != + transfer_pipeline_layout_index) { + last_transfer_pipeline_layout_index = transfer_pipeline_layout_index; + transfer_descriptor_sets_bound = 0; + transfer_push_constants_set = 0; + } + + // Invalidate outdated bindings. + if (transfer_pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetHostDepthStencilTexturesBit) { + assert_not_null(host_depth_source_vulkan_rt); + VkDescriptorSet descriptor_set_host_depth_stencil_textures = + host_depth_source_vulkan_rt->GetDescriptorSetTransferSource(); + if (last_descriptor_set_host_depth_stencil_textures != + descriptor_set_host_depth_stencil_textures) { + last_descriptor_set_host_depth_stencil_textures = + descriptor_set_host_depth_stencil_textures; + transfer_descriptor_sets_bound &= + ~kTransferUsedDescriptorSetHostDepthStencilTexturesBit; + } + } + if (transfer_pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetDepthStencilTexturesBit) { + VkDescriptorSet descriptor_set_depth_stencil_textures = + source_vulkan_rt.GetDescriptorSetTransferSource(); + if (last_descriptor_set_depth_stencil_textures != + descriptor_set_depth_stencil_textures) { + last_descriptor_set_depth_stencil_textures = + descriptor_set_depth_stencil_textures; + transfer_descriptor_sets_bound &= + ~kTransferUsedDescriptorSetDepthStencilTexturesBit; + } + } + if (transfer_pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetColorTextureBit) { + VkDescriptorSet descriptor_set_color_texture = + source_vulkan_rt.GetDescriptorSetTransferSource(); + if (last_descriptor_set_color_texture != + descriptor_set_color_texture) { + last_descriptor_set_color_texture = descriptor_set_color_texture; + transfer_descriptor_sets_bound &= + ~kTransferUsedDescriptorSetColorTextureBit; + } + } + if (transfer_pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordHostDepthAddressBit) { + assert_not_null(host_depth_source_vulkan_rt); + RenderTargetKey host_depth_source_rt_key = + host_depth_source_vulkan_rt->key(); + TransferAddressConstant host_depth_address_constant; + host_depth_address_constant.dest_pitch = dest_pitch_tiles; + host_depth_address_constant.source_pitch = + host_depth_source_rt_key.GetPitchTiles(); + host_depth_address_constant.source_to_dest = + int32_t(dest_rt_key.base_tiles) - + int32_t(host_depth_source_rt_key.base_tiles); + if (last_host_depth_address_constant != host_depth_address_constant) { + last_host_depth_address_constant = host_depth_address_constant; + transfer_push_constants_set &= + ~kTransferUsedPushConstantDwordHostDepthAddressBit; + } + } + if 
(transfer_pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordAddressBit) { + RenderTargetKey source_rt_key = source_vulkan_rt.key(); + TransferAddressConstant address_constant; + address_constant.dest_pitch = dest_pitch_tiles; + address_constant.source_pitch = source_rt_key.GetPitchTiles(); + address_constant.source_to_dest = int32_t(dest_rt_key.base_tiles) - + int32_t(source_rt_key.base_tiles); + if (last_address_constant != address_constant) { + last_address_constant = address_constant; + transfer_push_constants_set &= + ~kTransferUsedPushConstantDwordAddressBit; + } + } + + // Apply the new bindings. + // TODO(Triang3l): Merge binding updates into spans. + VkPipelineLayout transfer_pipeline_layout = + transfer_pipeline_layouts_[size_t(transfer_pipeline_layout_index)]; + uint32_t transfer_descriptor_sets_unbound = + transfer_pipeline_layout_info.used_descriptor_sets & + ~transfer_descriptor_sets_bound; + if (transfer_descriptor_sets_unbound & + kTransferUsedDescriptorSetHostDepthBufferBit) { + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_GRAPHICS, transfer_pipeline_layout, + xe::bit_count(transfer_pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetHostDepthBufferBit - 1)), + 1, &edram_storage_buffer_descriptor_set_, 0, nullptr); + transfer_descriptor_sets_bound |= + kTransferUsedDescriptorSetHostDepthBufferBit; + } + if (transfer_descriptor_sets_unbound & + kTransferUsedDescriptorSetHostDepthStencilTexturesBit) { + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_GRAPHICS, transfer_pipeline_layout, + xe::bit_count( + transfer_pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetHostDepthStencilTexturesBit - 1)), + 1, &last_descriptor_set_host_depth_stencil_textures, 0, nullptr); + transfer_descriptor_sets_bound |= + kTransferUsedDescriptorSetHostDepthStencilTexturesBit; + } + if (transfer_descriptor_sets_unbound & + kTransferUsedDescriptorSetDepthStencilTexturesBit) { + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_GRAPHICS, transfer_pipeline_layout, + xe::bit_count( + transfer_pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetDepthStencilTexturesBit - 1)), + 1, &last_descriptor_set_depth_stencil_textures, 0, nullptr); + transfer_descriptor_sets_bound |= + kTransferUsedDescriptorSetDepthStencilTexturesBit; + } + if (transfer_descriptor_sets_unbound & + kTransferUsedDescriptorSetColorTextureBit) { + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_GRAPHICS, transfer_pipeline_layout, + xe::bit_count(transfer_pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetColorTextureBit - 1)), + 1, &last_descriptor_set_color_texture, 0, nullptr); + transfer_descriptor_sets_bound |= + kTransferUsedDescriptorSetColorTextureBit; + } + uint32_t transfer_push_constants_unset = + transfer_pipeline_layout_info.used_push_constant_dwords & + ~transfer_push_constants_set; + if (transfer_push_constants_unset & + kTransferUsedPushConstantDwordHostDepthAddressBit) { + command_buffer.CmdVkPushConstants( + transfer_pipeline_layout, VK_SHADER_STAGE_FRAGMENT_BIT, + sizeof(uint32_t) * + xe::bit_count( + transfer_pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordHostDepthAddressBit - 1)), + sizeof(uint32_t), &last_host_depth_address_constant); + transfer_push_constants_set |= + kTransferUsedPushConstantDwordHostDepthAddressBit; + } + if (transfer_push_constants_unset & + 
kTransferUsedPushConstantDwordAddressBit) { + command_buffer.CmdVkPushConstants( + transfer_pipeline_layout, VK_SHADER_STAGE_FRAGMENT_BIT, + sizeof(uint32_t) * + xe::bit_count( + transfer_pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordAddressBit - 1)), + sizeof(uint32_t), &last_address_constant); + transfer_push_constants_set |= + kTransferUsedPushConstantDwordAddressBit; + } + + for (uint32_t j = 0; j < transfer_sample_pipeline_count; ++j) { + if (j) { + command_processor_.BindExternalGraphicsPipeline( + transfer_pipelines[j]); + } + for (uint32_t k = 0; k < uint32_t(transfer_is_stencil_bit ? 8 : 1); + ++k) { + if (transfer_is_stencil_bit) { + uint32_t transfer_stencil_bit = uint32_t(1) << k; + command_buffer.CmdVkPushConstants( + transfer_pipeline_layout, VK_SHADER_STAGE_FRAGMENT_BIT, + sizeof(uint32_t) * + xe::bit_count( + transfer_pipeline_layout_info + .used_push_constant_dwords & + (kTransferUsedPushConstantDwordStencilMaskBit - 1)), + sizeof(uint32_t), &transfer_stencil_bit); + command_buffer.CmdVkSetStencilWriteMask( + VK_STENCIL_FACE_FRONT_AND_BACK, transfer_stencil_bit); + } + command_buffer.CmdVkDraw(transfer_vertex_count, 1, 0, 0); + } + } + } + } + + // Perform the clear. + if (resolve_clear_needed) { + command_processor_.SubmitBarriersAndEnterRenderTargetCacheRenderPass( + transfer_render_pass, transfer_framebuffer); + VkClearAttachment resolve_clear_attachment; + resolve_clear_attachment.colorAttachment = 0; + std::memset(&resolve_clear_attachment.clearValue, 0, + sizeof(resolve_clear_attachment.clearValue)); + uint64_t clear_value = render_target_resolve_clear_values[i]; + if (dest_rt_key.is_depth) { + resolve_clear_attachment.aspectMask = + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + uint32_t depth_guest_clear_value = + (uint32_t(clear_value) >> 8) & 0xFFFFFF; + switch (dest_rt_key.GetDepthFormat()) { + case xenos::DepthRenderTargetFormat::kD24S8: + resolve_clear_attachment.clearValue.depthStencil.depth = + xenos::UNorm24To32(depth_guest_clear_value); + break; + case xenos::DepthRenderTargetFormat::kD24FS8: + // Taking [0, 2) -> [0, 1) remapping into account. 
+ resolve_clear_attachment.clearValue.depthStencil.depth = + xenos::Float20e4To32(depth_guest_clear_value) * 0.5f; + break; + } + resolve_clear_attachment.clearValue.depthStencil.stencil = + uint32_t(clear_value) & 0xFF; + } else { + resolve_clear_attachment.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + switch (dest_rt_key.GetColorFormat()) { + case xenos::ColorRenderTargetFormat::k_8_8_8_8: + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { + for (uint32_t j = 0; j < 4; ++j) { + resolve_clear_attachment.clearValue.color.float32[j] = + ((clear_value >> (j * 8)) & 0xFF) * (1.0f / 0xFF); + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { + for (uint32_t j = 0; j < 3; ++j) { + resolve_clear_attachment.clearValue.color.float32[j] = + ((clear_value >> (j * 10)) & 0x3FF) * (1.0f / 0x3FF); + } + resolve_clear_attachment.clearValue.color.float32[3] = + ((clear_value >> 30) & 0x3) * (1.0f / 0x3); + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16: { + for (uint32_t j = 0; j < 3; ++j) { + resolve_clear_attachment.clearValue.color.float32[j] = + xenos::Float7e3To32((clear_value >> (j * 10)) & 0x3FF); + } + resolve_clear_attachment.clearValue.color.float32[3] = + ((clear_value >> 30) & 0x3) * (1.0f / 0x3); + } break; + case xenos::ColorRenderTargetFormat::k_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: { + // Using uint for transfers and clears of both. Disregarding the + // current -32...32 vs. -1...1 settings for consistency with color + // clear via depth aliasing. + // TODO(Triang3l): Handle cases of unsupported multisampled 16_UINT + // and completely unsupported 16_UNORM. + for (uint32_t j = 0; j < 2; ++j) { + resolve_clear_attachment.clearValue.color.uint32[j] = + uint32_t(clear_value >> (j * 16)) & 0xFFFF; + } + } break; + case xenos::ColorRenderTargetFormat::k_16_16_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT: { + // Using uint for transfers and clears of both. Disregarding the + // current -32...32 vs. -1...1 settings for consistency with color + // clear via depth aliasing. + // TODO(Triang3l): Handle cases of unsupported multisampled 16_UINT + // and completely unsupported 16_UNORM. + for (uint32_t j = 0; j < 4; ++j) { + resolve_clear_attachment.clearValue.color.uint32[j] = + uint32_t(clear_value >> (j * 16)) & 0xFFFF; + } + } break; + case xenos::ColorRenderTargetFormat::k_32_FLOAT: { + // Using uint for proper denormal and NaN handling. + resolve_clear_attachment.clearValue.color.uint32[0] = + uint32_t(clear_value); + } break; + case xenos::ColorRenderTargetFormat::k_32_32_FLOAT: { + // Using uint for proper denormal and NaN handling. 
+ resolve_clear_attachment.clearValue.color.uint32[0] = + uint32_t(clear_value); + resolve_clear_attachment.clearValue.color.uint32[1] = + uint32_t(clear_value >> 32); + } break; + } + } + command_buffer.CmdVkClearAttachments(1, &resolve_clear_attachment, 1, + &resolve_clear_rect); + } + } +} + } // namespace vulkan } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.h b/src/xenia/gpu/vulkan/vulkan_render_target_cache.h index 97bb690af..c98da4974 100644 --- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.h +++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.h @@ -10,13 +10,20 @@ #ifndef XENIA_GPU_VULKAN_VULKAN_RENDER_TARGET_CACHE_H_ #define XENIA_GPU_VULKAN_VULKAN_RENDER_TARGET_CACHE_H_ +#include #include #include +#include +#include #include #include "xenia/base/hash.h" +#include "xenia/base/xxhash.h" #include "xenia/gpu/render_target_cache.h" +#include "xenia/gpu/xenos.h" +#include "xenia/ui/vulkan/single_layout_descriptor_set_pool.h" #include "xenia/ui/vulkan/vulkan_provider.h" +#include "xenia/ui/vulkan/vulkan_upload_buffer_pool.h" namespace xe { namespace gpu { @@ -28,8 +35,12 @@ class VulkanRenderTargetCache final : public RenderTargetCache { public: union RenderPassKey { struct { - // If emulating 2x as 4x, set this to 4x for 2x not to create unnecessary - // render pass objects. + // If emulating 2x as 4x, this is still 2x for simplicity of using this + // field to make guest-related decisions. Render pass objects are not very + // expensive, and their dependencies can't be shared between 2x-as-4x and + // true 4x MSAA passes (framebuffers because render target cache render + // targets are different for 2x and 4x guest MSAA, pipelines because the + // sample mask will have 2 samples excluded for 2x-as-4x). xenos::MsaaSamples msaa_samples : xenos::kMsaaSamplesBits; // 2 // << 0 is depth, << 1...4 is color. uint32_t depth_and_color_used : 1 + xenos::kMaxColorRenderTargets; // 7 @@ -46,7 +57,8 @@ class VulkanRenderTargetCache final : public RenderTargetCache { xenos::ColorRenderTargetFormat color_2_view_format : xenos::kColorRenderTargetFormatBits; // 20 xenos::ColorRenderTargetFormat color_3_view_format - : xenos::kColorRenderTargetFormatBits; // 24 + : xenos::kColorRenderTargetFormatBits; // 24 + uint32_t color_rts_use_transfer_formats : 1; // 25 }; uint32_t key = 0; struct Hasher { @@ -60,6 +72,9 @@ class VulkanRenderTargetCache final : public RenderTargetCache { bool operator!=(const RenderPassKey& other_key) const { return !(*this == other_key); } + bool operator<(const RenderPassKey& other_key) const { + return key < other_key.key; + } }; static_assert_size(RenderPassKey, sizeof(uint32_t)); @@ -78,12 +93,14 @@ class VulkanRenderTargetCache final : public RenderTargetCache { void Shutdown(bool from_destructor = false); void ClearCache() override; - // TOOD(Triang3l): Fragment shader interlock. + void CompletedSubmissionUpdated(); + void EndSubmission(); + + // TODO(Triang3l): Fragment shader interlock. Path GetPath() const override { return Path::kHostRenderTargets; } - // TODO(Triang3l): Resolution scaling. 
- uint32_t GetResolutionScaleX() const override { return 1; } - uint32_t GetResolutionScaleY() const override { return 1; } + uint32_t GetResolutionScaleX() const override { return resolution_scale_x_; } + uint32_t GetResolutionScaleY() const override { return resolution_scale_y_; } bool Update(bool is_rasterization_done, uint32_t shader_writes_color_targets) override; @@ -98,6 +115,17 @@ class VulkanRenderTargetCache final : public RenderTargetCache { return last_update_framebuffer_; } + bool msaa_2x_attachments_supported() const { + return msaa_2x_attachments_supported_; + } + bool msaa_2x_no_attachments_supported() const { + return msaa_2x_no_attachments_supported_; + } + bool IsMsaa2xSupported(bool subpass_has_attachments) const { + return subpass_has_attachments ? msaa_2x_attachments_supported_ + : msaa_2x_no_attachments_supported_; + } + // Returns the render pass object, or VK_NULL_HANDLE if failed to create. // A render pass managed by the render target cache may be ended and resumed // at any time (to allow for things like copying and texture loading). @@ -110,6 +138,99 @@ class VulkanRenderTargetCache final : public RenderTargetCache { bool* is_integer_out = nullptr) const; protected: + uint32_t GetMaxRenderTargetWidth() const override; + uint32_t GetMaxRenderTargetHeight() const override; + + RenderTarget* CreateRenderTarget(RenderTargetKey key) override; + + // TODO(Triang3l): Check actual unorm24 support. + bool IsHostDepthEncodingDifferent( + xenos::DepthRenderTargetFormat format) const override { + return true; + } + + private: + enum class EdramBufferUsage { + // There's no need for combined fragment and compute usages. + // With host render targets, the usual usage sequence is as follows: + // - Optionally compute writes - host depth copy storing for EDRAM range + // ownership transfers. + // - Optionally fragment reads - host depth copy storing for EDRAM range + // ownership transfers. + // - Compute writes - copying from host render targets during resolving. + // - Compute reads - writing to the shared memory during resolving. + // With the render backend implementation based on fragment shader + // interlocks, it's: + // - Fragment reads and writes - depth / stencil and color operations. + // - Compute reads - writing to the shared memory during resolving. + // So, fragment reads and compute reads normally don't follow each other, + // and there's no need to amortize the cost of a read > read barrier in an + // exceptional situation by using a wider barrier in the normal scenario. + + // Host depth copy storing. + kFragmentRead, + // Fragment shader interlock depth / stencil and color operations. + kFragmentReadWrite, + // Resolve - copying to the shared memory. + kComputeRead, + // Resolve - copying from host render targets. + kComputeWrite, + // Trace recording. + kTransferRead, + // Trace playback. + kTransferWrite, + }; + enum class EdramBufferModificationStatus { + // The values are ordered by how strong the barrier conditions are. + // No uncommitted shader writes. + kUnmodified, + // Need to commit before the next fragment shader interlock usage with + // overlap. + kViaFragmentShaderInterlock, + // Need to commit before any next fragment shader interlock usage. 
+ kViaUnordered, + }; + static void GetEdramBufferUsageMasks(EdramBufferUsage usage, + VkPipelineStageFlags& stage_mask_out, + VkAccessFlags& access_mask_out); + void UseEdramBuffer(EdramBufferUsage new_usage); + void MarkEdramBufferModified( + EdramBufferModificationStatus modification_status = + EdramBufferModificationStatus::kViaUnordered); + void CommitEdramBufferShaderWrites( + EdramBufferModificationStatus commit_status = + EdramBufferModificationStatus::kViaFragmentShaderInterlock); + + VulkanCommandProcessor& command_processor_; + + uint32_t resolution_scale_x_ = 1; + uint32_t resolution_scale_y_ = 1; + + // Accessible in fragment and compute shaders. + VkDescriptorSetLayout descriptor_set_layout_storage_buffer_ = VK_NULL_HANDLE; + VkDescriptorSetLayout descriptor_set_layout_sampled_image_ = VK_NULL_HANDLE; + VkDescriptorSetLayout descriptor_set_layout_sampled_image_x2_ = + VK_NULL_HANDLE; + + std::unique_ptr + descriptor_set_pool_sampled_image_; + std::unique_ptr + descriptor_set_pool_sampled_image_x2_; + + VkDeviceMemory edram_buffer_memory_ = VK_NULL_HANDLE; + VkBuffer edram_buffer_ = VK_NULL_HANDLE; + EdramBufferUsage edram_buffer_usage_; + EdramBufferModificationStatus edram_buffer_modification_status_ = + EdramBufferModificationStatus::kUnmodified; + VkDescriptorPool edram_storage_buffer_descriptor_pool_ = VK_NULL_HANDLE; + VkDescriptorSet edram_storage_buffer_descriptor_set_; + + // RenderPassKey::key -> VkRenderPass. + // VK_NULL_HANDLE if failed to create. + std::unordered_map render_passes_; + + // For host render targets. + // Can only be destroyed when framebuffers referencing it are destroyed! class VulkanRenderTarget final : public RenderTarget { public: @@ -131,27 +252,45 @@ class VulkanRenderTargetCache final : public RenderTargetCache { // Takes ownership of the Vulkan objects passed to the constructor. VulkanRenderTarget(RenderTargetKey key, - const ui::vulkan::VulkanProvider& provider, + VulkanRenderTargetCache& render_target_cache, VkImage image, VkDeviceMemory memory, VkImageView view_depth_color, VkImageView view_depth_stencil, VkImageView view_stencil, VkImageView view_srgb, - VkImageView view_color_transfer_separate) + VkImageView view_color_transfer_separate, + size_t descriptor_set_index_transfer_source) : RenderTarget(key), - provider_(provider), + render_target_cache_(render_target_cache), image_(image), memory_(memory), view_depth_color_(view_depth_color), view_depth_stencil_(view_depth_stencil), view_stencil_(view_stencil), view_srgb_(view_srgb), - view_color_transfer_separate_(view_color_transfer_separate) {} + view_color_transfer_separate_(view_color_transfer_separate), + descriptor_set_index_transfer_source_( + descriptor_set_index_transfer_source) {} ~VulkanRenderTarget(); VkImage image() const { return image_; } VkImageView view_depth_color() const { return view_depth_color_; } VkImageView view_depth_stencil() const { return view_depth_stencil_; } + VkImageView view_color_transfer_separate() const { + return view_color_transfer_separate_; + } + VkImageView view_color_transfer() const { + return view_color_transfer_separate_ != VK_NULL_HANDLE + ? view_color_transfer_separate_ + : view_depth_color_; + } + VkDescriptorSet GetDescriptorSetTransferSource() const { + ui::vulkan::SingleLayoutDescriptorSetPool& descriptor_set_pool = + key().is_depth + ? 
*render_target_cache_.descriptor_set_pool_sampled_image_x2_ + : *render_target_cache_.descriptor_set_pool_sampled_image_; + return descriptor_set_pool.Get(descriptor_set_index_transfer_source_); + } static void GetDrawUsage(bool is_depth, VkPipelineStageFlags* stage_mask_out, @@ -185,8 +324,13 @@ class VulkanRenderTargetCache final : public RenderTargetCache { current_layout_ = layout; } + uint32_t temporary_sort_index() const { return temporary_sort_index_; } + void SetTemporarySortIndex(uint32_t index) { + temporary_sort_index_ = index; + } + private: - const ui::vulkan::VulkanProvider& provider_; + VulkanRenderTargetCache& render_target_cache_; VkImage image_; VkDeviceMemory memory_; @@ -200,30 +344,17 @@ class VulkanRenderTargetCache final : public RenderTargetCache { VkImageView view_srgb_; VkImageView view_color_transfer_separate_; + // 2 sampled images for depth / stencil, 1 sampled image for color. + size_t descriptor_set_index_transfer_source_; + VkPipelineStageFlags current_stage_mask_ = 0; VkAccessFlags current_access_mask_ = 0; VkImageLayout current_layout_ = VK_IMAGE_LAYOUT_UNDEFINED; + + // Temporary storage for indices in operations like transfers and dumps. + uint32_t temporary_sort_index_ = 0; }; - uint32_t GetMaxRenderTargetWidth() const override; - uint32_t GetMaxRenderTargetHeight() const override; - - RenderTarget* CreateRenderTarget(RenderTargetKey key) override; - - // TODO(Triang3l): Check actual unorm24 support. - bool IsHostDepthEncodingDifferent( - xenos::DepthRenderTargetFormat format) const override { - return true; - } - - private: - VulkanCommandProcessor& command_processor_; - - // RenderPassKey::key -> VkRenderPass. - std::unordered_map render_passes_; - - // For host render targets. - struct FramebufferKey { RenderPassKey render_pass_key; @@ -254,13 +385,276 @@ class VulkanRenderTargetCache final : public RenderTargetCache { void Reset() { std::memset(this, 0, sizeof(*this)); } }; + enum TransferUsedDescriptorSet : uint32_t { + // Ordered from the least to the most frequently changed. + kTransferUsedDescriptorSetHostDepthBuffer, + kTransferUsedDescriptorSetHostDepthStencilTextures, + kTransferUsedDescriptorSetDepthStencilTextures, + // Mutually exclusive with kTransferUsedDescriptorSetDepthStencilTextures. + kTransferUsedDescriptorSetColorTexture, + + kTransferUsedDescriptorSetCount, + + kTransferUsedDescriptorSetHostDepthBufferBit = + uint32_t(1) << kTransferUsedDescriptorSetHostDepthBuffer, + kTransferUsedDescriptorSetHostDepthStencilTexturesBit = + uint32_t(1) << kTransferUsedDescriptorSetHostDepthStencilTextures, + kTransferUsedDescriptorSetDepthStencilTexturesBit = + uint32_t(1) << kTransferUsedDescriptorSetDepthStencilTextures, + kTransferUsedDescriptorSetColorTextureBit = + uint32_t(1) << kTransferUsedDescriptorSetColorTexture, + }; + + // 32-bit push constants (for simplicity of size calculation and to avoid + // std140 packing issues). + enum TransferUsedPushConstantDword : uint32_t { + kTransferUsedPushConstantDwordHostDepthAddress, + kTransferUsedPushConstantDwordAddress, + // Changed 8 times per transfer. 
+ kTransferUsedPushConstantDwordStencilMask, + + kTransferUsedPushConstantDwordCount, + + kTransferUsedPushConstantDwordHostDepthAddressBit = + uint32_t(1) << kTransferUsedPushConstantDwordHostDepthAddress, + kTransferUsedPushConstantDwordAddressBit = + uint32_t(1) << kTransferUsedPushConstantDwordAddress, + kTransferUsedPushConstantDwordStencilMaskBit = + uint32_t(1) << kTransferUsedPushConstantDwordStencilMask, + }; + + enum class TransferPipelineLayoutIndex { + kColor, + kDepth, + kColorToStencilBit, + kDepthToStencilBit, + kColorAndHostDepthTexture, + kColorAndHostDepthBuffer, + kDepthAndHostDepthTexture, + kDepthAndHostDepthBuffer, + + kCount, + }; + + struct TransferPipelineLayoutInfo { + uint32_t used_descriptor_sets; + uint32_t used_push_constant_dwords; + }; + + static const TransferPipelineLayoutInfo + kTransferPipelineLayoutInfos[size_t(TransferPipelineLayoutIndex::kCount)]; + + enum class TransferMode : uint32_t { + kColorToDepth, + kColorToColor, + + kDepthToDepth, + kDepthToColor, + + kColorToStencilBit, + kDepthToStencilBit, + + // Two-source modes, using the host depth if it, when converted to the guest + // format, matches what's in the owner source (not modified, keep host + // precision), or the guest data otherwise (significantly modified, possibly + // cleared). Stencil for FragStencilRef is always taken from the guest + // source. + + kColorAndHostDepthToDepth, + // When using different source and destination depth formats. + kDepthAndHostDepthToDepth, + + // If host depth is fetched, but it's the same image as the destination, + // it's copied to the EDRAM buffer (but since it's just a scratch buffer, + // with tiles laid out linearly with the same pitch as in the original + // render target; also no swapping of 40-sample columns as opposed to the + // host render target - this is done only for the color source) and fetched + // from there instead of the host depth texture. + kColorAndHostDepthCopyToDepth, + kDepthAndHostDepthCopyToDepth, + + kCount, + }; + + enum class TransferOutput { + kColor, + kDepth, + kStencilBit, + }; + + struct TransferModeInfo { + TransferOutput output; + TransferPipelineLayoutIndex pipeline_layout; + }; + + static const TransferModeInfo kTransferModes[size_t(TransferMode::kCount)]; + + union TransferShaderKey { + uint32_t key; + struct { + xenos::MsaaSamples dest_msaa_samples : xenos::kMsaaSamplesBits; + uint32_t dest_color_rt_index : xenos::kColorRenderTargetIndexBits; + uint32_t dest_resource_format : xenos::kRenderTargetFormatBits; + xenos::MsaaSamples source_msaa_samples : xenos::kMsaaSamplesBits; + // Always 1x when the host depth is a copy from a buffer rather than an + // image, not to create the same pipeline for different MSAA sample counts + // as it doesn't matter in this case. + xenos::MsaaSamples host_depth_source_msaa_samples + : xenos::kMsaaSamplesBits; + uint32_t source_resource_format : xenos::kRenderTargetFormatBits; + + // Last bits because this affects the pipeline layout - after sorting, + // only change it as fewer times as possible. Depth buffers have an + // additional stencil texture. 
+ static_assert(size_t(TransferMode::kCount) <= (size_t(1) << 4)); + TransferMode mode : 4; + }; + + TransferShaderKey() : key(0) { static_assert_size(*this, sizeof(key)); } + + struct Hasher { + size_t operator()(const TransferShaderKey& key) const { + return std::hash{}(key.key); + } + }; + bool operator==(const TransferShaderKey& other_key) const { + return key == other_key.key; + } + bool operator!=(const TransferShaderKey& other_key) const { + return !(*this == other_key); + } + bool operator<(const TransferShaderKey& other_key) const { + return key < other_key.key; + } + }; + + struct TransferPipelineKey { + RenderPassKey render_pass_key; + TransferShaderKey shader_key; + + TransferPipelineKey(RenderPassKey render_pass_key, + TransferShaderKey shader_key) + : render_pass_key(render_pass_key), shader_key(shader_key) {} + + struct Hasher { + size_t operator()(const TransferPipelineKey& key) const { + XXH3_state_t hash_state; + XXH3_64bits_reset(&hash_state); + XXH3_64bits_update(&hash_state, &key.render_pass_key, + sizeof(key.render_pass_key)); + XXH3_64bits_update(&hash_state, &key.shader_key, + sizeof(key.shader_key)); + return static_cast(XXH3_64bits_digest(&hash_state)); + } + }; + bool operator==(const TransferPipelineKey& other_key) const { + return render_pass_key == other_key.render_pass_key && + shader_key == other_key.shader_key; + } + bool operator!=(const TransferPipelineKey& other_key) const { + return !(*this == other_key); + } + bool operator<(const TransferPipelineKey& other_key) const { + if (render_pass_key != other_key.render_pass_key) { + return render_pass_key < other_key.render_pass_key; + } + return shader_key < other_key.shader_key; + } + }; + + union TransferAddressConstant { + uint32_t constant; + struct { + // All in tiles. + uint32_t dest_pitch : xenos::kEdramPitchTilesBits; + uint32_t source_pitch : xenos::kEdramPitchTilesBits; + // Safe to use 12 bits for signed difference - no ownership transfer can + // ever occur between render targets with EDRAM base >= 2048 as this would + // result in 0-length spans. 10 + 10 + 12 is exactly 32, any more bits, + // and more root 32-bit constants will be used. + // Destination base in tiles minus source base in tiles (not vice versa + // because this is a transform of the coordinate system, not addresses + // themselves). + // 0 for host_depth_source_is_copy (ignored in this case anyway as + // destination == source anyway). + int32_t source_to_dest : xenos::kEdramBaseTilesBits; + }; + TransferAddressConstant() : constant(0) { + static_assert_size(*this, sizeof(constant)); + } + bool operator==(const TransferAddressConstant& other_constant) const { + return constant == other_constant.constant; + } + bool operator!=(const TransferAddressConstant& other_constant) const { + return !(*this == other_constant); + } + }; + + struct TransferInvocation { + Transfer transfer; + TransferShaderKey shader_key; + TransferInvocation(const Transfer& transfer, + const TransferShaderKey& shader_key) + : transfer(transfer), shader_key(shader_key) {} + bool operator<(const TransferInvocation& other_invocation) { + // TODO(Triang3l): See if it may be better to sort by the source in the + // first place, especially when reading the same data multiple times (like + // to write the stencil bits after depth) for better read locality. + // Sort by the shader key primarily to reduce pipeline state (context) + // switches. 
+ if (shader_key != other_invocation.shader_key) { + return shader_key < other_invocation.shader_key; + } + // Host depth render targets are changed rarely if they exist, won't save + // many binding changes, ignore them for simplicity (their existence is + // caught by the shader key change). + assert_not_null(transfer.source); + assert_not_null(other_invocation.transfer.source); + uint32_t source_index = + static_cast(transfer.source) + ->temporary_sort_index(); + uint32_t other_source_index = static_cast( + other_invocation.transfer.source) + ->temporary_sort_index(); + if (source_index != other_source_index) { + return source_index < other_source_index; + } + return transfer.start_tiles < other_invocation.transfer.start_tiles; + } + bool CanBeMergedIntoOneDraw( + const TransferInvocation& other_invocation) const { + return shader_key == other_invocation.shader_key && + transfer.AreSourcesSame(other_invocation.transfer); + } + }; + // Returns the framebuffer object, or VK_NULL_HANDLE if failed to create. const Framebuffer* GetFramebuffer( RenderPassKey render_pass_key, uint32_t pitch_tiles_at_32bpp, const RenderTarget* const* depth_and_color_render_targets); + VkShaderModule GetTransferShader(TransferShaderKey key); + // With sample-rate shading, returns a pointer to one pipeline. Without + // sample-rate shading, returns a pointer to as many pipelines as there are + // samples. If there was a failure to create a pipeline, returns nullptr. + VkPipeline const* GetTransferPipelines(TransferPipelineKey key); + + // Do ownership transfers for render targets - each render target / vector may + // be null / empty in case there's nothing to do for them. + // resolve_clear_rectangle is expected to be provided by + // PrepareHostRenderTargetsResolveClear which should do all the needed size + // bound checks. + void PerformTransfersAndResolveClears( + uint32_t render_target_count, RenderTarget* const* render_targets, + const std::vector* render_target_transfers, + const uint64_t* render_target_resolve_clear_values = nullptr, + const Transfer::Rectangle* resolve_clear_rectangle = nullptr); + bool gamma_render_target_as_srgb_ = false; + bool msaa_2x_attachments_supported_ = false; + bool msaa_2x_no_attachments_supported_ = false; + std::unordered_map framebuffers_; @@ -271,6 +665,32 @@ class VulkanRenderTargetCache final : public RenderTargetCache { last_update_framebuffer_attachments_[1 + xenos::kMaxColorRenderTargets] = {}; const Framebuffer* last_update_framebuffer_ = VK_NULL_HANDLE; + + // Set 0 - EDRAM storage buffer, set 1 - source depth sampled image (and + // unused stencil from the transfer descriptor set), HostDepthStoreConstants + // passed via push constants. + VkPipelineLayout host_depth_store_pipeline_layout_ = VK_NULL_HANDLE; + VkPipeline host_depth_store_pipelines_[size_t(xenos::MsaaSamples::k4X) + 1] = + {}; + + std::unique_ptr + transfer_vertex_buffer_pool_; + VkShaderModule transfer_passthrough_vertex_shader_ = VK_NULL_HANDLE; + VkPipelineLayout transfer_pipeline_layouts_[size_t( + TransferPipelineLayoutIndex::kCount)] = {}; + // VK_NULL_HANDLE if failed to create. + std::unordered_map + transfer_shaders_; + // With sample-rate shading, one pipeline per entry. Without sample-rate + // shading, one pipeline per sample per entry. VK_NULL_HANDLE if failed to + // create. + std::unordered_map, + TransferPipelineKey::Hasher> + transfer_pipelines_; + + // Temporary storage for PerformTransfersAndResolveClears. 
+ std::vector current_transfer_invocations_; }; } // namespace vulkan diff --git a/src/xenia/gpu/vulkan/vulkan_shared_memory.cc b/src/xenia/gpu/vulkan/vulkan_shared_memory.cc index 0d95189da..788b8166a 100644 --- a/src/xenia/gpu/vulkan/vulkan_shared_memory.cc +++ b/src/xenia/gpu/vulkan/vulkan_shared_memory.cc @@ -177,6 +177,10 @@ bool VulkanSharedMemory::Initialize() { } } + // The first usage will likely be uploading. + last_usage_ = Usage::kTransferDestination; + last_written_range_ = std::make_pair(0, 0); + upload_buffer_pool_ = std::make_unique( provider, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, xe::align(ui::vulkan::VulkanUploadBufferPool::kDefaultPageSize, @@ -190,9 +194,6 @@ void VulkanSharedMemory::Shutdown(bool from_destructor) { upload_buffer_pool_.reset(); - last_written_range_ = std::make_pair(0, 0); - last_usage_ = Usage::kTransferDestination; - const ui::vulkan::VulkanProvider& provider = command_processor_.GetVulkanProvider(); const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); @@ -226,8 +227,8 @@ void VulkanSharedMemory::Use(Usage usage, if (last_usage_ != usage || last_written_range_.second) { VkPipelineStageFlags src_stage_mask, dst_stage_mask; VkAccessFlags src_access_mask, dst_access_mask; - GetBarrier(last_usage_, src_stage_mask, src_access_mask); - GetBarrier(usage, dst_stage_mask, dst_access_mask); + GetUsageMasks(last_usage_, src_stage_mask, src_access_mask); + GetUsageMasks(usage, dst_stage_mask, dst_access_mask); VkDeviceSize offset, size; if (last_usage_ == usage) { // Committing the previous write, while not changing the access mask @@ -447,9 +448,9 @@ bool VulkanSharedMemory::UploadRanges( return successful; } -void VulkanSharedMemory::GetBarrier(Usage usage, - VkPipelineStageFlags& stage_mask, - VkAccessFlags& access_mask) const { +void VulkanSharedMemory::GetUsageMasks(Usage usage, + VkPipelineStageFlags& stage_mask, + VkAccessFlags& access_mask) const { switch (usage) { case Usage::kComputeWrite: stage_mask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; diff --git a/src/xenia/gpu/vulkan/vulkan_shared_memory.h b/src/xenia/gpu/vulkan/vulkan_shared_memory.h index 0d8e90813..b37949ec8 100644 --- a/src/xenia/gpu/vulkan/vulkan_shared_memory.h +++ b/src/xenia/gpu/vulkan/vulkan_shared_memory.h @@ -47,8 +47,8 @@ class VulkanSharedMemory : public SharedMemory { kComputeWrite, kTransferDestination, }; - // Places pipeline barrier for the target usage, also ensuring writes of - // adjacent are ordered with writes of each other and reads. + // Inserts a pipeline barrier for the target usage, also ensuring consecutive + // read-write accesses are ordered with each other. void Use(Usage usage, std::pair written_range = {}); VkBuffer buffer() const { return buffer_; } @@ -65,8 +65,8 @@ class VulkanSharedMemory : public SharedMemory { upload_page_ranges) override; private: - void GetBarrier(Usage usage, VkPipelineStageFlags& stage_mask, - VkAccessFlags& access_mask) const; + void GetUsageMasks(Usage usage, VkPipelineStageFlags& stage_mask, + VkAccessFlags& access_mask) const; VulkanCommandProcessor& command_processor_; TraceWriter& trace_writer_; @@ -76,9 +76,8 @@ class VulkanSharedMemory : public SharedMemory { // Single for non-sparse, every allocation so far for sparse. std::vector buffer_memory_; - // First usage will likely be uploading. 
- Usage last_usage_ = Usage::kTransferDestination; - std::pair last_written_range_ = {}; + Usage last_usage_; + std::pair last_written_range_; std::unique_ptr upload_buffer_pool_; std::vector upload_regions_; diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index d2279a7b8..2f88bc74c 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -248,6 +248,7 @@ enum class MsaaSamples : uint32_t { constexpr uint32_t kMsaaSamplesBits = 2; +constexpr uint32_t kColorRenderTargetIndexBits = 2; constexpr uint32_t kMaxColorRenderTargets = 4; enum class ColorRenderTargetFormat : uint32_t { diff --git a/src/xenia/ui/vulkan/functions/device_1_0.inc b/src/xenia/ui/vulkan/functions/device_1_0.inc index 2a979f55f..148d6dd52 100644 --- a/src/xenia/ui/vulkan/functions/device_1_0.inc +++ b/src/xenia/ui/vulkan/functions/device_1_0.inc @@ -15,6 +15,7 @@ XE_UI_VULKAN_FUNCTION(vkCmdClearColorImage) XE_UI_VULKAN_FUNCTION(vkCmdCopyBuffer) XE_UI_VULKAN_FUNCTION(vkCmdCopyBufferToImage) XE_UI_VULKAN_FUNCTION(vkCmdCopyImageToBuffer) +XE_UI_VULKAN_FUNCTION(vkCmdDispatch) XE_UI_VULKAN_FUNCTION(vkCmdDraw) XE_UI_VULKAN_FUNCTION(vkCmdDrawIndexed) XE_UI_VULKAN_FUNCTION(vkCmdEndRenderPass) @@ -29,6 +30,7 @@ XE_UI_VULKAN_FUNCTION(vkCmdSetStencilWriteMask) XE_UI_VULKAN_FUNCTION(vkCmdSetViewport) XE_UI_VULKAN_FUNCTION(vkCreateBuffer) XE_UI_VULKAN_FUNCTION(vkCreateCommandPool) +XE_UI_VULKAN_FUNCTION(vkCreateComputePipelines) XE_UI_VULKAN_FUNCTION(vkCreateDescriptorPool) XE_UI_VULKAN_FUNCTION(vkCreateDescriptorSetLayout) XE_UI_VULKAN_FUNCTION(vkCreateFence) diff --git a/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.cc b/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.cc new file mode 100644 index 000000000..8dfff2a3f --- /dev/null +++ b/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.cc @@ -0,0 +1,120 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2022 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/ui/vulkan/single_layout_descriptor_set_pool.h" + +#include "xenia/base/assert.h" +#include "xenia/base/logging.h" + +namespace xe { +namespace ui { +namespace vulkan { + +SingleLayoutDescriptorSetPool::SingleLayoutDescriptorSetPool( + const VulkanProvider& provider, uint32_t pool_set_count, + uint32_t set_layout_descriptor_counts_count, + const VkDescriptorPoolSize* set_layout_descriptor_counts, + VkDescriptorSetLayout set_layout) + : provider_(provider), + pool_set_count_(pool_set_count), + set_layout_(set_layout) { + assert_not_zero(pool_set_count); + pool_descriptor_counts_.resize(set_layout_descriptor_counts_count); + for (uint32_t i = 0; i < set_layout_descriptor_counts_count; ++i) { + VkDescriptorPoolSize& pool_descriptor_type_count = + pool_descriptor_counts_[i]; + const VkDescriptorPoolSize& set_layout_descriptor_type_count = + set_layout_descriptor_counts[i]; + pool_descriptor_type_count.type = set_layout_descriptor_type_count.type; + pool_descriptor_type_count.descriptorCount = + set_layout_descriptor_type_count.descriptorCount * pool_set_count; + } +} + +SingleLayoutDescriptorSetPool::~SingleLayoutDescriptorSetPool() { + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider_.dfn(); + VkDevice device = provider_.device(); + if (current_pool_ != VK_NULL_HANDLE) { + dfn.vkDestroyDescriptorPool(device, current_pool_, nullptr); + } + for (VkDescriptorPool pool : full_pools_) { + dfn.vkDestroyDescriptorPool(device, pool, nullptr); + } +} + +size_t SingleLayoutDescriptorSetPool::Allocate() { + if (!descriptor_sets_free_.empty()) { + size_t free_index = descriptor_sets_free_.back(); + descriptor_sets_free_.pop_back(); + return free_index; + } + + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider_.dfn(); + VkDevice device = provider_.device(); + + // Two iterations so if vkAllocateDescriptorSets fails even with a non-zero + // current_pool_sets_remaining_, another attempt will be made in a new pool. 
+  for (uint32_t i = 0; i < 2; ++i) {
+    if (current_pool_ != VK_NULL_HANDLE && !current_pool_sets_remaining_) {
+      full_pools_.push_back(current_pool_);
+      current_pool_ = VK_NULL_HANDLE;
+    }
+    if (current_pool_ == VK_NULL_HANDLE) {
+      VkDescriptorPoolCreateInfo pool_create_info;
+      pool_create_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
+      pool_create_info.pNext = nullptr;
+      pool_create_info.flags = 0;
+      pool_create_info.maxSets = pool_set_count_;
+      pool_create_info.poolSizeCount = uint32_t(pool_descriptor_counts_.size());
+      pool_create_info.pPoolSizes = pool_descriptor_counts_.data();
+      if (dfn.vkCreateDescriptorPool(device, &pool_create_info, nullptr,
+                                     &current_pool_) != VK_SUCCESS) {
+        XELOGE(
+            "SingleLayoutDescriptorSetPool: Failed to create a descriptor "
+            "pool");
+        return SIZE_MAX;
+      }
+      current_pool_sets_remaining_ = pool_set_count_;
+    }
+
+    VkDescriptorSetAllocateInfo descriptor_set_allocate_info;
+    descriptor_set_allocate_info.sType =
+        VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
+    descriptor_set_allocate_info.pNext = nullptr;
+    descriptor_set_allocate_info.descriptorPool = current_pool_;
+    descriptor_set_allocate_info.descriptorSetCount = 1;
+    descriptor_set_allocate_info.pSetLayouts = &set_layout_;
+    VkDescriptorSet descriptor_set;
+    if (dfn.vkAllocateDescriptorSets(device, &descriptor_set_allocate_info,
+                                     &descriptor_set) != VK_SUCCESS) {
+      XELOGE(
+          "SingleLayoutDescriptorSetPool: Failed to allocate a descriptor "
+          "set");
+      if (current_pool_sets_remaining_ >= pool_set_count_) {
+        // Failed to allocate in a new pool - something completely wrong, don't
+        // store empty pools as full.
+        dfn.vkDestroyDescriptorPool(device, current_pool_, nullptr);
+        current_pool_ = VK_NULL_HANDLE;
+        return SIZE_MAX;
+      }
+      full_pools_.push_back(current_pool_);
+      current_pool_ = VK_NULL_HANDLE;
+      // Retry in a newly created pool on the second iteration.
+      continue;
+    }
+    --current_pool_sets_remaining_;
+    descriptor_sets_.push_back(descriptor_set);
+    return descriptor_sets_.size() - 1;
+  }
+
+  // Both attempts have failed.
+  return SIZE_MAX;
+}
+
+}  // namespace vulkan
+}  // namespace ui
+}  // namespace xe
diff --git a/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.h b/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.h
new file mode 100644
index 000000000..c3f3eb080
--- /dev/null
+++ b/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.h
@@ -0,0 +1,63 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project *
+ ******************************************************************************
+ * Copyright 2022 Ben Vanik. All rights reserved. *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_UI_VULKAN_SINGLE_DESCRIPTOR_SET_POOL_H_
+#define XENIA_UI_VULKAN_SINGLE_DESCRIPTOR_SET_POOL_H_
+
+#include
+#include
+#include
+
+#include "xenia/base/assert.h"
+#include "xenia/ui/vulkan/vulkan_provider.h"
+
+namespace xe {
+namespace ui {
+namespace vulkan {
+
+class SingleLayoutDescriptorSetPool {
+ public:
+  // set_layout_descriptor_counts must contain the numbers of descriptors of
+  // each type in a single set with the layout (the multiplication by the pool
+  // set count will be done internally). The descriptor set layout must not be
+  // destroyed until this object is also destroyed.
+ SingleLayoutDescriptorSetPool( + const VulkanProvider& provider, uint32_t pool_set_count, + uint32_t set_layout_descriptor_counts_count, + const VkDescriptorPoolSize* set_layout_descriptor_counts, + VkDescriptorSetLayout set_layout); + ~SingleLayoutDescriptorSetPool(); + + // Returns SIZE_MAX in case of a failure. + size_t Allocate(); + void Free(size_t index) { + assert_true(index < descriptor_sets_.size()); + descriptor_sets_free_.push_back(index); + } + VkDescriptorSet Get(size_t index) const { return descriptor_sets_[index]; } + + private: + const VulkanProvider& provider_; + uint32_t pool_set_count_; + std::vector pool_descriptor_counts_; + VkDescriptorSetLayout set_layout_; + + std::vector full_pools_; + VkDescriptorPool current_pool_ = VK_NULL_HANDLE; + uint32_t current_pool_sets_remaining_ = 0; + + std::vector descriptor_sets_; + std::vector descriptor_sets_free_; +}; + +} // namespace vulkan +} // namespace ui +} // namespace xe + +#endif // XENIA_UI_VULKAN_SINGLE_DESCRIPTOR_SET_POOL_H_ diff --git a/src/xenia/ui/vulkan/vulkan_provider.cc b/src/xenia/ui/vulkan/vulkan_provider.cc index 2d93485ff..eb48cfa23 100644 --- a/src/xenia/ui/vulkan/vulkan_provider.cc +++ b/src/xenia/ui/vulkan/vulkan_provider.cc @@ -715,6 +715,8 @@ bool VulkanProvider::Initialize() { static const std::pair kUsedDeviceExtensions[] = { {"VK_EXT_fragment_shader_interlock", offsetof(DeviceExtensions, ext_fragment_shader_interlock)}, + {"VK_EXT_shader_stencil_export", + offsetof(DeviceExtensions, ext_shader_stencil_export)}, {"VK_KHR_dedicated_allocation", offsetof(DeviceExtensions, khr_dedicated_allocation)}, {"VK_KHR_image_format_list", @@ -946,6 +948,8 @@ bool VulkanProvider::Initialize() { XELOGVK("Vulkan device extensions:"); XELOGVK("* VK_EXT_fragment_shader_interlock: {}", device_extensions_.ext_fragment_shader_interlock ? "yes" : "no"); + XELOGVK("* VK_EXT_shader_stencil_export: {}", + device_extensions_.ext_shader_stencil_export ? "yes" : "no"); XELOGVK("* VK_KHR_dedicated_allocation: {}", device_extensions_.khr_dedicated_allocation ? "yes" : "no"); XELOGVK("* VK_KHR_image_format_list: {}", diff --git a/src/xenia/ui/vulkan/vulkan_provider.h b/src/xenia/ui/vulkan/vulkan_provider.h index 0887b88ac..83f4d587f 100644 --- a/src/xenia/ui/vulkan/vulkan_provider.h +++ b/src/xenia/ui/vulkan/vulkan_provider.h @@ -132,6 +132,7 @@ class VulkanProvider : public GraphicsProvider { } struct DeviceExtensions { bool ext_fragment_shader_interlock; + bool ext_shader_stencil_export; // Core since 1.1.0. bool khr_dedicated_allocation; // Core since 1.2.0. 
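To make the intended lifetime and call pattern of the new SingleLayoutDescriptorSetPool concrete, a minimal usage sketch follows. The provider, set layout, descriptor count per set and the pool capacity passed in are hypothetical values chosen for illustration; only the constructor, Allocate, Get and Free calls come from the class declared in this change.

#include <cstddef>
#include <memory>

#include "xenia/ui/vulkan/single_layout_descriptor_set_pool.h"

// Hypothetical caller: `provider` and `set_layout` are assumed to be created
// elsewhere; 2 sampled images per set and a 256-set pool capacity are
// illustrative, mirroring how descriptor_set_pool_sampled_image_x2_ is meant
// to serve depth + stencil transfer sources.
size_t AllocateTransferSourceDescriptorSet(
    const xe::ui::vulkan::VulkanProvider& provider,
    VkDescriptorSetLayout set_layout,
    std::unique_ptr<xe::ui::vulkan::SingleLayoutDescriptorSetPool>& pool) {
  if (!pool) {
    VkDescriptorPoolSize set_descriptor_count;
    set_descriptor_count.type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE;
    set_descriptor_count.descriptorCount = 2;
    pool = std::make_unique<xe::ui::vulkan::SingleLayoutDescriptorSetPool>(
        provider, 256, 1, &set_descriptor_count, set_layout);
  }
  // Allocate() hands out a stable index (SIZE_MAX on failure) rather than the
  // VkDescriptorSet itself, so owners such as VulkanRenderTarget can store a
  // small integer and resolve it with Get() only when recording.
  size_t index = pool->Allocate();
  if (index == SIZE_MAX) {
    return SIZE_MAX;
  }
  VkDescriptorSet set = pool->Get(index);
  (void)set;  // vkUpdateDescriptorSets with the render target's views here.
  // When the owning render target is destroyed, the slot is recycled with
  // pool->Free(index).
  return index;
}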
diff --git a/src/xenia/ui/vulkan/vulkan_util.cc b/src/xenia/ui/vulkan/vulkan_util.cc index f8dd5846e..b4eb02c3f 100644 --- a/src/xenia/ui/vulkan/vulkan_util.cc +++ b/src/xenia/ui/vulkan/vulkan_util.cc @@ -189,6 +189,53 @@ bool CreateDedicatedAllocationImage(const VulkanProvider& provider, return true; } +VkPipeline CreateComputePipeline( + const VulkanProvider& provider, VkPipelineLayout layout, + VkShaderModule shader, const VkSpecializationInfo* specialization_info, + const char* entry_point) { + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + VkComputePipelineCreateInfo pipeline_create_info; + pipeline_create_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + pipeline_create_info.pNext = nullptr; + pipeline_create_info.flags = 0; + pipeline_create_info.stage.sType = + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + pipeline_create_info.stage.pNext = nullptr; + pipeline_create_info.stage.flags = 0; + pipeline_create_info.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT; + pipeline_create_info.stage.module = shader; + pipeline_create_info.stage.pName = entry_point; + pipeline_create_info.stage.pSpecializationInfo = specialization_info; + pipeline_create_info.layout = layout; + pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE; + pipeline_create_info.basePipelineIndex = -1; + VkPipeline pipeline; + if (dfn.vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, + &pipeline_create_info, nullptr, + &pipeline) != VK_SUCCESS) { + return VK_NULL_HANDLE; + } + return pipeline; +} + +VkPipeline CreateComputePipeline( + const VulkanProvider& provider, VkPipelineLayout layout, + const uint32_t* shader_code, size_t shader_code_size_bytes, + const VkSpecializationInfo* specialization_info, const char* entry_point) { + VkShaderModule shader = + CreateShaderModule(provider, shader_code, shader_code_size_bytes); + if (shader == VK_NULL_HANDLE) { + return VK_NULL_HANDLE; + } + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + VkPipeline pipeline = CreateComputePipeline(provider, layout, shader, + specialization_info, entry_point); + dfn.vkDestroyShaderModule(device, shader, nullptr); + return pipeline; +} + } // namespace util } // namespace vulkan } // namespace ui diff --git a/src/xenia/ui/vulkan/vulkan_util.h b/src/xenia/ui/vulkan/vulkan_util.h index fda575305..7af10f65f 100644 --- a/src/xenia/ui/vulkan/vulkan_util.h +++ b/src/xenia/ui/vulkan/vulkan_util.h @@ -164,6 +164,17 @@ inline VkShaderModule CreateShaderModule(const VulkanProvider& provider, : VK_NULL_HANDLE; } +VkPipeline CreateComputePipeline( + const VulkanProvider& provider, VkPipelineLayout layout, + VkShaderModule shader, + const VkSpecializationInfo* specialization_info = nullptr, + const char* entry_point = "main"); +VkPipeline CreateComputePipeline( + const VulkanProvider& provider, VkPipelineLayout layout, + const uint32_t* shader_code, size_t shader_code_size_bytes, + const VkSpecializationInfo* specialization_info = nullptr, + const char* entry_point = "main"); + } // namespace util } // namespace vulkan } // namespace ui
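The new vkCreateComputePipelines / vkCmdDispatch imports and the util::CreateComputePipeline helpers are what the EDRAM host depth store and resolve dispatches build on. Below is a sketch of the expected call sequence; the command buffer, pipeline layout, descriptor set, SPIR-V blob and workgroup counts are assumed caller inputs, and the bind calls assume those entry points are already in the device function table as they are for the graphics path. A real caller would also cache the pipeline rather than create it per dispatch.

#include <cstddef>
#include <cstdint>

#include "xenia/ui/vulkan/vulkan_provider.h"
#include "xenia/ui/vulkan/vulkan_util.h"

// Hypothetical caller - every handle and count passed in is assumed; only the
// helper and the newly imported dispatch call reflect this change.
bool RecordComputeDispatch(const xe::ui::vulkan::VulkanProvider& provider,
                           VkCommandBuffer command_buffer,
                           VkPipelineLayout layout,
                           VkDescriptorSet storage_buffer_set,
                           const uint32_t* shader_code,
                           size_t shader_code_size_bytes,
                           uint32_t group_count_x, uint32_t group_count_y) {
  // The overload taking SPIR-V words creates the shader module, builds the
  // pipeline and destroys the module again internally.
  VkPipeline pipeline = xe::ui::vulkan::util::CreateComputePipeline(
      provider, layout, shader_code, shader_code_size_bytes);
  if (pipeline == VK_NULL_HANDLE) {
    return false;
  }
  const xe::ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn();
  dfn.vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
                        pipeline);
  dfn.vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
                              layout, 0, 1, &storage_buffer_set, 0, nullptr);
  dfn.vkCmdDispatch(command_buffer, group_count_x, group_count_y, 1);
  // A persistent caller would keep the pipeline (as host_depth_store_pipelines_
  // does) and destroy it at shutdown instead of letting it leak here.
  return true;
}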
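Finally, the EdramBufferUsage / GetEdramBufferUsageMasks pair declared in vulkan_render_target_cache.h earlier in this diff follows the same pattern as VulkanSharedMemory::GetUsageMasks: each usage resolves to one pipeline stage mask and one access mask for a global memory barrier. The mapping below is a plausible sketch of that translation based on the enum comments, not a copy of the implementation in this change.

#include "xenia/ui/vulkan/vulkan_provider.h"

// Illustrative only - the real GetEdramBufferUsageMasks lives in
// vulkan_render_target_cache.cc and may differ in details.
enum class EdramBufferUsageSketch {
  kFragmentRead,       // Host depth copy fetching in fragment shaders.
  kFragmentReadWrite,  // Fragment shader interlock depth/stencil and color.
  kComputeRead,        // Resolve - copying to the shared memory.
  kComputeWrite,       // Resolve - copying from host render targets.
  kTransferRead,       // Trace recording.
  kTransferWrite,      // Trace playback.
};

void GetEdramBufferUsageMasksSketch(EdramBufferUsageSketch usage,
                                    VkPipelineStageFlags& stage_mask_out,
                                    VkAccessFlags& access_mask_out) {
  switch (usage) {
    case EdramBufferUsageSketch::kFragmentRead:
      stage_mask_out = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
      access_mask_out = VK_ACCESS_SHADER_READ_BIT;
      break;
    case EdramBufferUsageSketch::kFragmentReadWrite:
      stage_mask_out = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
      access_mask_out = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
      break;
    case EdramBufferUsageSketch::kComputeRead:
      stage_mask_out = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
      access_mask_out = VK_ACCESS_SHADER_READ_BIT;
      break;
    case EdramBufferUsageSketch::kComputeWrite:
      stage_mask_out = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
      access_mask_out = VK_ACCESS_SHADER_WRITE_BIT;
      break;
    case EdramBufferUsageSketch::kTransferRead:
      stage_mask_out = VK_PIPELINE_STAGE_TRANSFER_BIT;
      access_mask_out = VK_ACCESS_TRANSFER_READ_BIT;
      break;
    case EdramBufferUsageSketch::kTransferWrite:
      stage_mask_out = VK_PIPELINE_STAGE_TRANSFER_BIT;
      access_mask_out = VK_ACCESS_TRANSFER_WRITE_BIT;
      break;
  }
}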