Enable native MSAA

Copy back EDRAM buffers in order by base offset.
This commit is contained in:
Dr. Chat 2016-04-01 21:52:39 -05:00
parent 2eca3ce9e6
commit 50f72b4e42
6 changed files with 333 additions and 167 deletions

View File

@ -187,6 +187,10 @@ PipelineCache::UpdateStatus PipelineCache::ConfigurePipeline(
VkCommandBuffer command_buffer, const RenderState* render_state,
VulkanShader* vertex_shader, VulkanShader* pixel_shader,
PrimitiveType primitive_type, VkPipeline* pipeline_out) {
#if FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // FINE_GRAINED_DRAW_SCOPES
assert_not_null(pipeline_out);
// Perform a pass over all registers and state updating our cached structures.
@ -323,6 +327,10 @@ VkShaderModule PipelineCache::GetGeometryShader(PrimitiveType primitive_type,
bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer,
bool full_update) {
#if FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // FINE_GRAINED_DRAW_SCOPES
auto& regs = set_dynamic_state_registers_;
bool window_offset_dirty = SetShadowRegister(&regs.pa_sc_window_offset,
@ -393,20 +401,25 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer,
auto surface_msaa =
static_cast<MsaaSamples>((regs.rb_surface_info >> 16) & 0x3);
// TODO(benvanik): ??
// FIXME: Some games depend on these for proper clears (e.g. only clearing
// half the size they actually want with 4x MSAA), but others don't.
// Figure out how these games are expecting clears to be done.
float window_width_scalar = 1;
float window_height_scalar = 1;
switch (surface_msaa) {
case MsaaSamples::k1X:
break;
case MsaaSamples::k2X:
window_width_scalar = 2;
// ??
window_width_scalar = window_height_scalar = 1.41421356f;
break;
case MsaaSamples::k4X:
window_width_scalar = 2;
window_height_scalar = 2;
window_width_scalar = window_height_scalar = 2;
break;
}
// window_width_scalar = window_height_scalar = 1;
// Whether each of the viewport settings are enabled.
// http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
bool vport_xscale_enable = (regs.pa_cl_vte_cntl & (1 << 0)) > 0;
@ -434,6 +447,7 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer,
float voy = vport_yoffset_enable ? regs.pa_cl_vport_yoffset : 0;
float vsx = vport_xscale_enable ? regs.pa_cl_vport_xscale : 1;
float vsy = vport_yscale_enable ? regs.pa_cl_vport_yscale : 1;
window_width_scalar = window_height_scalar = 1;
float vpw = 2 * window_width_scalar * vsx;
float vph = -2 * window_height_scalar * vsy;
@ -481,25 +495,25 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer,
vkCmdSetBlendConstants(command_buffer, regs.rb_blend_rgba);
}
// VK_DYNAMIC_STATE_LINE_WIDTH
vkCmdSetLineWidth(command_buffer, 1.0f);
if (full_update) {
// VK_DYNAMIC_STATE_LINE_WIDTH
vkCmdSetLineWidth(command_buffer, 1.0f);
// VK_DYNAMIC_STATE_DEPTH_BIAS
vkCmdSetDepthBias(command_buffer, 0.0f, 0.0f, 0.0f);
// VK_DYNAMIC_STATE_DEPTH_BIAS
vkCmdSetDepthBias(command_buffer, 0.0f, 0.0f, 0.0f);
// VK_DYNAMIC_STATE_DEPTH_BOUNDS
vkCmdSetDepthBounds(command_buffer, 0.0f, 1.0f);
// VK_DYNAMIC_STATE_DEPTH_BOUNDS
vkCmdSetDepthBounds(command_buffer, 0.0f, 1.0f);
// VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK
vkCmdSetStencilCompareMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0);
// VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK
vkCmdSetStencilCompareMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0);
// VK_DYNAMIC_STATE_STENCIL_REFERENCE
vkCmdSetStencilReference(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0);
// VK_DYNAMIC_STATE_STENCIL_REFERENCE
vkCmdSetStencilReference(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0);
// VK_DYNAMIC_STATE_STENCIL_WRITE_MASK
vkCmdSetStencilWriteMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0);
// TODO(benvanik): push constants.
// VK_DYNAMIC_STATE_STENCIL_WRITE_MASK
vkCmdSetStencilWriteMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0);
}
bool push_constants_dirty = full_update || viewport_state_dirty;
push_constants_dirty |=
@ -530,7 +544,7 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer,
push_constants.window_scale[1] = -1.0f;
} else {
push_constants.window_scale[0] = 1.0f / 2560.0f;
push_constants.window_scale[1] = -1.0f / 2560.0f;
push_constants.window_scale[1] = 1.0f / 2560.0f;
}
// http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
@ -756,7 +770,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateVertexInputState(
: VK_FORMAT_A2R10G10B10_UNORM_PACK32;
break;
case VertexFormat::k_10_11_11:
assert_always("unsupported?");
// assert_always("unsupported?");
vertex_attrib_descr.format = VK_FORMAT_B10G11R11_UFLOAT_PACK32;
break;
case VertexFormat::k_11_11_10:
@ -934,6 +948,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState(
XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR);
dirty |= SetShadowRegister(&regs.multi_prim_ib_reset_index,
XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX);
dirty |= SetShadowRegister(&regs.rb_modecontrol, XE_GPU_REG_RB_MODECONTROL);
regs.primitive_type = primitive_type;
XXH64_update(&hash_state_, &regs, sizeof(regs));
if (!dirty) {
@ -947,7 +962,13 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState(
// TODO(benvanik): right setting?
state_info.depthClampEnable = VK_FALSE;
// TODO(benvanik): use in depth-only mode?
// Discard rasterizer output in depth-only mode.
// TODO(DrChat): Figure out how to make this work properly.
/*
auto enable_mode = static_cast<xenos::ModeControl>(regs.rb_modecontrol & 0x7);
state_info.rasterizerDiscardEnable =
enable_mode == xenos::ModeControl::kColorDepth ? VK_FALSE : VK_TRUE;
//*/
state_info.rasterizerDiscardEnable = VK_FALSE;
bool poly_mode = ((regs.pa_su_sc_mode_cntl >> 3) & 0x3) != 0;
@ -1004,20 +1025,49 @@ PipelineCache::UpdateStatus PipelineCache::UpdateMultisampleState() {
auto& regs = update_multisample_state_regs_;
auto& state_info = update_multisample_state_info_;
bool dirty = false;
dirty |= SetShadowRegister(&regs.pa_sc_aa_config, XE_GPU_REG_PA_SC_AA_CONFIG);
dirty |= SetShadowRegister(&regs.pa_su_sc_mode_cntl,
XE_GPU_REG_PA_SU_SC_MODE_CNTL);
dirty |= SetShadowRegister(&regs.rb_surface_info, XE_GPU_REG_RB_SURFACE_INFO);
XXH64_update(&hash_state_, &regs, sizeof(regs));
if (!dirty) {
return UpdateStatus::kCompatible;
}
state_info.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO;
state_info.pNext = nullptr;
state_info.flags = 0;
// PA_SC_AA_CONFIG MSAA_NUM_SAMPLES
// PA_SU_SC_MODE_CNTL MSAA_ENABLE
state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT;
// state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT;
//*
auto msaa_num_samples =
static_cast<MsaaSamples>((regs.rb_surface_info >> 16) & 0x3);
switch (msaa_num_samples) {
case MsaaSamples::k1X:
state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT;
break;
case MsaaSamples::k2X:
state_info.rasterizationSamples = VK_SAMPLE_COUNT_2_BIT;
break;
case MsaaSamples::k4X:
state_info.rasterizationSamples = VK_SAMPLE_COUNT_4_BIT;
break;
default:
assert_unhandled_case(msaa_num_samples);
break;
}
//*/
state_info.sampleShadingEnable = VK_FALSE;
state_info.minSampleShading = 0;
state_info.pSampleMask = nullptr;
state_info.alphaToCoverageEnable = VK_FALSE;
state_info.alphaToOneEnable = VK_FALSE;
return UpdateStatus::kCompatible;
return UpdateStatus::kMismatch;
}
PipelineCache::UpdateStatus PipelineCache::UpdateDepthStencilState() {

View File

@ -211,6 +211,7 @@ class PipelineCache {
uint32_t pa_sc_screen_scissor_tl;
uint32_t pa_sc_screen_scissor_br;
uint32_t multi_prim_ib_reset_index;
uint32_t rb_modecontrol;
UpdateRasterizationStateRegisters() { Reset(); }
void Reset() { std::memset(this, 0, sizeof(*this)); }
@ -218,6 +219,10 @@ class PipelineCache {
VkPipelineRasterizationStateCreateInfo update_rasterization_state_info_;
// Shadow copies of the guest registers consumed by UpdateMultisampleState().
// That function folds the raw bytes of this struct into the pipeline hash
// (XXH64_update over sizeof(regs)), so the struct is kept as plain uint32_t
// fields and is cleared with memset rather than field-by-field.
struct UpdateMultisampleStateeRegisters {
// Shadow of XE_GPU_REG_PA_SC_AA_CONFIG.
uint32_t pa_sc_aa_config;
// Shadow of XE_GPU_REG_PA_SU_SC_MODE_CNTL.
uint32_t pa_su_sc_mode_cntl;
// Shadow of XE_GPU_REG_RB_SURFACE_INFO; UpdateMultisampleState() reads the
// MSAA sample count from bits 16-17 of this value.
uint32_t rb_surface_info;
// Starts with every shadowed register zeroed.
UpdateMultisampleStateeRegisters() { Reset(); }
// Zeroes every byte of the struct.
void Reset() { std::memset(this, 0, sizeof(*this)); }
} update_multisample_state_regs_;

View File

@ -165,8 +165,23 @@ CachedTileView::CachedTileView(ui::vulkan::VulkanDevice* device,
image_info.extent.depth = 1;
image_info.mipLevels = 1;
image_info.arrayLayers = 1;
image_info.samples =
static_cast<VkSampleCountFlagBits>(VK_SAMPLE_COUNT_1_BIT);
// image_info.samples = VK_SAMPLE_COUNT_1_BIT;
//*
auto msaa_samples = static_cast<MsaaSamples>(key.msaa_samples);
switch (msaa_samples) {
case MsaaSamples::k1X:
image_info.samples = VK_SAMPLE_COUNT_1_BIT;
break;
case MsaaSamples::k2X:
image_info.samples = VK_SAMPLE_COUNT_2_BIT;
break;
case MsaaSamples::k4X:
image_info.samples = VK_SAMPLE_COUNT_4_BIT;
break;
default:
assert_unhandled_case(msaa_samples);
}
//*/
image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
image_info.usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
VK_IMAGE_USAGE_TRANSFER_DST_BIT |
@ -322,13 +337,29 @@ CachedRenderPass::CachedRenderPass(VkDevice device,
: device_(device) {
std::memcpy(&config, &desired_config, sizeof(config));
VkSampleCountFlagBits sample_count;
switch (desired_config.surface_msaa) {
case MsaaSamples::k1X:
sample_count = VK_SAMPLE_COUNT_1_BIT;
break;
case MsaaSamples::k2X:
sample_count = VK_SAMPLE_COUNT_2_BIT;
break;
case MsaaSamples::k4X:
sample_count = VK_SAMPLE_COUNT_4_BIT;
break;
default:
assert_unhandled_case(desired_config.surface_msaa);
break;
}
// Initialize all attachments to default unused.
// As we set layout(location=RT) in shaders we must always provide 4.
VkAttachmentDescription attachments[5];
for (int i = 0; i < 4; ++i) {
attachments[i].flags = 0;
attachments[i].format = VK_FORMAT_UNDEFINED;
attachments[i].samples = VK_SAMPLE_COUNT_1_BIT;
attachments[i].samples = sample_count;
attachments[i].loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
attachments[i].storeOp = VK_ATTACHMENT_STORE_OP_STORE;
attachments[i].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
@ -339,7 +370,7 @@ CachedRenderPass::CachedRenderPass(VkDevice device,
auto& depth_stencil_attachment = attachments[4];
depth_stencil_attachment.flags = 0;
depth_stencil_attachment.format = VK_FORMAT_UNDEFINED;
depth_stencil_attachment.samples = VK_SAMPLE_COUNT_1_BIT;
depth_stencil_attachment.samples = sample_count;
depth_stencil_attachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
depth_stencil_attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE;
depth_stencil_attachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
@ -404,6 +435,10 @@ CachedRenderPass::~CachedRenderPass() {
bool CachedRenderPass::IsCompatible(
const RenderConfiguration& desired_config) const {
if (config.surface_msaa != desired_config.surface_msaa) {
return false;
}
for (int i = 0; i < 4; ++i) {
// TODO(benvanik): allow compatible vulkan formats.
if (config.color[i].format != desired_config.color[i].format) {
@ -503,12 +538,18 @@ bool RenderCache::dirty() const {
regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32;
dirty |= cur_regs.pa_sc_window_scissor_br !=
regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32;
dirty |= (cur_regs.rb_depthcontrol & (0x4 | 0x2)) !=
(regs[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2));
return dirty;
}
const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer,
VulkanShader* vertex_shader,
VulkanShader* pixel_shader) {
#if FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // FINE_GRAINED_DRAW_SCOPES
assert_null(current_command_buffer_);
current_command_buffer_ = command_buffer;
@ -520,6 +561,7 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer,
bool dirty = false;
dirty |= SetShadowRegister(&regs.rb_modecontrol, XE_GPU_REG_RB_MODECONTROL);
dirty |= SetShadowRegister(&regs.rb_surface_info, XE_GPU_REG_RB_SURFACE_INFO);
dirty |= SetShadowRegister(&regs.rb_color_mask, XE_GPU_REG_RB_COLOR_MASK);
dirty |= SetShadowRegister(&regs.rb_color_info, XE_GPU_REG_RB_COLOR_INFO);
dirty |= SetShadowRegister(&regs.rb_color1_info, XE_GPU_REG_RB_COLOR1_INFO);
dirty |= SetShadowRegister(&regs.rb_color2_info, XE_GPU_REG_RB_COLOR2_INFO);
@ -529,7 +571,11 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer,
XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL);
dirty |= SetShadowRegister(&regs.pa_sc_window_scissor_br,
XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR);
regs.rb_depthcontrol = register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32;
dirty |=
(regs.rb_depthcontrol & (0x4 | 0x2)) !=
(register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2));
regs.rb_depthcontrol =
register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2);
if (!dirty && current_state_.render_pass) {
// No registers have changed so we can reuse the previous render pass -
// just begin with what we had.
@ -549,7 +595,10 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer,
// Speculatively see if targets are actually used so we can skip copies
for (int i = 0; i < 4; i++) {
config->color[i].used = pixel_shader->writes_color_target(i);
uint32_t color_mask = (regs.rb_color_mask >> (i * 4)) & 0xF;
config->color[i].used =
config->mode_control == xenos::ModeControl::kColorDepth &&
color_mask != 0;
}
config->depth_stencil.used = !!(regs.rb_depthcontrol & (0x4 | 0x2));
@ -558,66 +607,20 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer,
current_state_.framebuffer = framebuffer;
current_state_.framebuffer_handle = framebuffer->handle;
VkBufferMemoryBarrier barrier;
barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
barrier.pNext = nullptr;
barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.buffer = edram_buffer_;
barrier.offset = 0;
barrier.size = 0;
// Copy EDRAM buffer into render targets with tight packing.
VkBufferImageCopy region;
region.bufferRowLength = 0;
region.bufferImageHeight = 0;
region.imageOffset = {0, 0, 0};
// Depth
auto depth_target = current_state_.framebuffer->depth_stencil_attachment;
if (depth_target && current_state_.config.depth_stencil.used) {
region.imageSubresource = {
VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1};
region.bufferOffset = depth_target->key.tile_offset * 5120;
// Wait for any potential copies to finish.
barrier.offset = region.bufferOffset;
barrier.size = depth_target->key.tile_width * 80 *
depth_target->key.tile_height * 16 * 4;
vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1,
&barrier, 0, nullptr);
region.imageExtent = {depth_target->key.tile_width * 80u,
depth_target->key.tile_height * 16u, 1};
vkCmdCopyBufferToImage(command_buffer, edram_buffer_, depth_target->image,
VK_IMAGE_LAYOUT_GENERAL, 1, &region);
UpdateTileView(command_buffer, depth_target, true);
}
// Color
region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
for (int i = 0; i < 4; i++) {
auto target = current_state_.framebuffer->color_attachments[i];
if (!target || !current_state_.config.color[i].used) {
continue;
}
region.bufferOffset = target->key.tile_offset * 5120;
// Wait for any potential copies to finish.
barrier.offset = region.bufferOffset;
barrier.size =
target->key.tile_width * 80 * target->key.tile_height * 16 * 4;
vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1,
&barrier, 0, nullptr);
region.imageExtent = {target->key.tile_width * 80u,
target->key.tile_height * 16u, 1};
vkCmdCopyBufferToImage(command_buffer, edram_buffer_, target->image,
VK_IMAGE_LAYOUT_GENERAL, 1, &region);
UpdateTileView(command_buffer, target, true);
}
}
if (!render_pass) {
@ -758,6 +761,7 @@ bool RenderCache::ConfigureRenderPass(VkCommandBuffer command_buffer,
color_key.tile_width = xe::round_up(config->surface_pitch_px, 80) / 80;
color_key.tile_height = xe::round_up(config->surface_height_px, 16) / 16;
color_key.color_or_depth = 1;
color_key.msaa_samples = static_cast<uint16_t>(config->surface_msaa);
color_key.edram_format = static_cast<uint16_t>(config->color[i].format);
target_color_attachments[i] =
FindOrCreateTileView(command_buffer, color_key);
@ -774,6 +778,8 @@ bool RenderCache::ConfigureRenderPass(VkCommandBuffer command_buffer,
depth_stencil_key.tile_height =
xe::round_up(config->surface_height_px, 16) / 16;
depth_stencil_key.color_or_depth = 0;
depth_stencil_key.msaa_samples =
static_cast<uint16_t>(config->surface_msaa);
depth_stencil_key.edram_format =
static_cast<uint16_t>(config->depth_stencil.format);
auto target_depth_stencil_attachment =
@ -810,6 +816,51 @@ CachedTileView* RenderCache::FindOrCreateTileView(
return tile_view;
}
// Transfers one tile view's contents between edram_buffer_ and the view's
// image.
//
// command_buffer: receives the optional barrier and the copy command.
// view: supplies the buffer offset, copy extent and image aspect through its
//     key, plus the image that is copied to or from.
// load: true records a buffer->image copy (edram_buffer_ into view->image);
//     false records an image->buffer copy (view->image into edram_buffer_).
// insert_barrier: when true, a buffer memory barrier covering the view's
//     range of edram_buffer_ is recorded ahead of the copy.
void RenderCache::UpdateTileView(VkCommandBuffer command_buffer,
                                 CachedTileView* view, bool load,
                                 bool insert_barrier) {
  if (insert_barrier) {
    VkBufferMemoryBarrier barrier;
    barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
    barrier.pNext = nullptr;
    if (load) {
      // The copy reads the buffer: make earlier transfer writes visible.
      barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
      barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
    } else {
      // The copy writes the buffer: wait for earlier transfer reads.
      barrier.srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
      barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
    }
    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.buffer = edram_buffer_;
    // tile_offset is scaled by 5120, which matches the 80 * 16 * 4 bytes
    // counted per tile in the size computation below.
    barrier.offset = view->key.tile_offset * 5120;
    barrier.size = view->key.tile_width * 80 * view->key.tile_height * 16 * 4;

    // The access masks above are transfer accesses, so both stage masks must
    // name the transfer stage. TOP_OF_PIPE supports no access types at all:
    // pairing it with these masks is invalid usage and does not order the
    // copies against each other.
    vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT,
                         VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 1,
                         &barrier, 0, nullptr);
  }

  VkBufferImageCopy region;
  region.bufferOffset = view->key.tile_offset * 5120;
  // Zero row length / image height: buffer data is tightly packed according
  // to imageExtent.
  region.bufferRowLength = 0;
  region.bufferImageHeight = 0;
  region.imageSubresource = {0, 0, 0, 1};
  // Color views copy the color aspect; depth views copy depth and stencil.
  region.imageSubresource.aspectMask =
      view->key.color_or_depth
          ? VK_IMAGE_ASPECT_COLOR_BIT
          : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
  region.imageOffset = {0, 0, 0};
  region.imageExtent = {view->key.tile_width * 80u, view->key.tile_height * 16u,
                        1};

  // Both directions pass the image as being in VK_IMAGE_LAYOUT_GENERAL.
  if (load) {
    vkCmdCopyBufferToImage(command_buffer, edram_buffer_, view->image,
                           VK_IMAGE_LAYOUT_GENERAL, 1, &region);
  } else {
    vkCmdCopyImageToBuffer(command_buffer, view->image, VK_IMAGE_LAYOUT_GENERAL,
                           edram_buffer_, 1, &region);
  }
}
CachedTileView* RenderCache::FindTileView(const TileViewKey& view_key) const {
// Check the cache.
// TODO(benvanik): better lookup.
@ -837,35 +888,31 @@ void RenderCache::EndRenderPass() {
// can't get the correct height atm) and we may end up overwriting the valid
// contents of another render target by mistake! Need to reorder copy commands
// to avoid this.
VkBufferImageCopy region;
region.bufferRowLength = 0;
region.bufferImageHeight = 0;
region.imageOffset = {0, 0, 0};
// Depth/stencil
std::vector<CachedTileView*> cached_views;
// Depth
auto depth_target = current_state_.framebuffer->depth_stencil_attachment;
if (depth_target && current_state_.config.depth_stencil.used) {
region.imageSubresource = {
VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1};
region.bufferOffset = depth_target->key.tile_offset * 5120;
region.imageExtent = {depth_target->key.tile_width * 80u,
depth_target->key.tile_height * 16u, 1};
vkCmdCopyImageToBuffer(current_command_buffer_, depth_target->image,
VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, &region);
cached_views.push_back(depth_target);
}
// Color
region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
for (int i = 0; i < 4; i++) {
auto target = current_state_.framebuffer->color_attachments[i];
if (!target || !current_state_.config.color[i].used) {
continue;
}
region.bufferOffset = target->key.tile_offset * 5120;
region.imageExtent = {target->key.tile_width * 80u,
target->key.tile_height * 16u, 1};
vkCmdCopyImageToBuffer(current_command_buffer_, target->image,
VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, &region);
cached_views.push_back(target);
}
std::sort(
cached_views.begin(), cached_views.end(),
[](CachedTileView const* a, CachedTileView const* b) { return *a < *b; });
for (auto view : cached_views) {
UpdateTileView(current_command_buffer_, view, false, false);
}
current_command_buffer_ = nullptr;
@ -920,6 +967,7 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer,
&buffer_barrier, 0, nullptr);
// Issue the copy command.
// TODO(DrChat): Stencil copies.
VkBufferImageCopy region;
region.bufferOffset = edram_base * 5120;
region.bufferImageHeight = 0;
@ -928,8 +976,7 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer,
region.imageExtent = extents;
region.imageSubresource = {0, 0, 0, 1};
region.imageSubresource.aspectMask =
color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT
: VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT : VK_IMAGE_ASPECT_DEPTH_BIT;
vkCmdCopyBufferToImage(command_buffer, edram_buffer_, image, image_layout, 1,
&region);
@ -947,13 +994,15 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer,
void RenderCache::BlitToImage(VkCommandBuffer command_buffer,
uint32_t edram_base, uint32_t pitch,
uint32_t height, VkImage image,
VkImageLayout image_layout, bool color_or_depth,
uint32_t format, VkFilter filter,
VkOffset3D offset, VkExtent3D extents) {
uint32_t height, MsaaSamples num_samples,
VkImage image, VkImageLayout image_layout,
bool color_or_depth, uint32_t format,
VkFilter filter, VkOffset3D offset,
VkExtent3D extents) {
// Grab a tile view that represents the source image.
TileViewKey key;
key.color_or_depth = color_or_depth ? 1 : 0;
key.msaa_samples = static_cast<uint16_t>(num_samples);
key.edram_format = format;
key.tile_offset = edram_base;
key.tile_width = xe::round_up(pitch, 80) / 80;
@ -979,14 +1028,14 @@ void RenderCache::BlitToImage(VkCommandBuffer command_buffer,
// Update the tile view with current EDRAM contents.
// TODO: Heuristics to determine if this copy is avoidable.
// TODO(DrChat): Stencil copies.
VkBufferImageCopy buffer_copy;
buffer_copy.bufferOffset = edram_base * 5120;
buffer_copy.bufferImageHeight = 0;
buffer_copy.bufferRowLength = 0;
buffer_copy.imageSubresource = {0, 0, 0, 1};
buffer_copy.imageSubresource.aspectMask =
color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT
: VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT : VK_IMAGE_ASPECT_DEPTH_BIT;
buffer_copy.imageExtent = {key.tile_width * 80u, key.tile_height * 16u, 1u};
buffer_copy.imageOffset = {0, 0, 0};
vkCmdCopyBufferToImage(command_buffer, edram_buffer_, tile_view->image,
@ -1018,26 +1067,48 @@ void RenderCache::BlitToImage(VkCommandBuffer command_buffer,
assert_true(extents.height <= key.tile_height * 16u);
// Now issue the blit to the destination.
// TODO: Resolve to destination if necessary.
VkImageBlit image_blit;
image_blit.srcSubresource = {0, 0, 0, 1};
image_blit.srcSubresource.aspectMask =
color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT
: VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
image_blit.srcOffsets[0] = {0, 0, 0};
image_blit.srcOffsets[1] = {int32_t(extents.width), int32_t(extents.height),
int32_t(extents.depth)};
if (num_samples == MsaaSamples::k1X) {
VkImageBlit image_blit;
image_blit.srcSubresource = {0, 0, 0, 1};
image_blit.srcSubresource.aspectMask =
color_or_depth
? VK_IMAGE_ASPECT_COLOR_BIT
: VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
image_blit.srcOffsets[0] = {0, 0, 0};
image_blit.srcOffsets[1] = {int32_t(extents.width), int32_t(extents.height),
int32_t(extents.depth)};
image_blit.dstSubresource = {0, 0, 0, 1};
image_blit.dstSubresource.aspectMask =
color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT
: VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
image_blit.dstOffsets[0] = offset;
image_blit.dstOffsets[1] = {offset.x + int32_t(extents.width),
offset.y + int32_t(extents.height),
offset.z + int32_t(extents.depth)};
vkCmdBlitImage(command_buffer, tile_view->image, VK_IMAGE_LAYOUT_GENERAL,
image, image_layout, 1, &image_blit, filter);
image_blit.dstSubresource = {0, 0, 0, 1};
image_blit.dstSubresource.aspectMask =
color_or_depth
? VK_IMAGE_ASPECT_COLOR_BIT
: VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
image_blit.dstOffsets[0] = offset;
image_blit.dstOffsets[1] = {offset.x + int32_t(extents.width),
offset.y + int32_t(extents.height),
offset.z + int32_t(extents.depth)};
vkCmdBlitImage(command_buffer, tile_view->image, VK_IMAGE_LAYOUT_GENERAL,
image, image_layout, 1, &image_blit, filter);
} else {
VkImageResolve image_resolve;
image_resolve.srcSubresource = {0, 0, 0, 1};
image_resolve.srcSubresource.aspectMask =
color_or_depth
? VK_IMAGE_ASPECT_COLOR_BIT
: VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
image_resolve.srcOffset = {0, 0, 0};
image_resolve.dstSubresource = {0, 0, 0, 1};
image_resolve.dstSubresource.aspectMask =
color_or_depth
? VK_IMAGE_ASPECT_COLOR_BIT
: VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
image_resolve.dstOffset = offset;
image_resolve.extent = extents;
vkCmdResolveImage(command_buffer, tile_view->image, VK_IMAGE_LAYOUT_GENERAL,
image, image_layout, 1, &image_resolve);
}
// Transition the image back into its previous layout.
image_barrier.srcAccessMask = image_barrier.dstAccessMask;
@ -1052,13 +1123,14 @@ void RenderCache::ClearEDRAMColor(VkCommandBuffer command_buffer,
uint32_t edram_base,
ColorRenderTargetFormat format,
uint32_t pitch, uint32_t height,
float* color) {
MsaaSamples num_samples, float* color) {
// TODO: For formats <= 4 bpp, we can directly fill the EDRAM buffer. Just
// need to detect this and calculate a value.
// Grab a tile view (as we need to clear an image first)
TileViewKey key;
key.color_or_depth = 1;
key.msaa_samples = static_cast<uint16_t>(num_samples);
key.edram_format = static_cast<uint16_t>(format);
key.tile_offset = edram_base;
key.tile_width = xe::round_up(pitch, 80) / 80;
@ -1091,13 +1163,15 @@ void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer,
uint32_t edram_base,
DepthRenderTargetFormat format,
uint32_t pitch, uint32_t height,
float depth, uint32_t stencil) {
MsaaSamples num_samples, float depth,
uint32_t stencil) {
// TODO: For formats <= 4 bpp, we can directly fill the EDRAM buffer. Just
// need to detect this and calculate a value.
// Grab a tile view (as we need to clear an image first)
TileViewKey key;
key.color_or_depth = 0;
key.msaa_samples = static_cast<uint16_t>(num_samples);
key.edram_format = static_cast<uint16_t>(format);
key.tile_offset = edram_base;
key.tile_width = xe::round_up(pitch, 80) / 80;
@ -1117,12 +1191,13 @@ void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer,
VK_IMAGE_LAYOUT_GENERAL, &clear_value, 1, &range);
// Copy image back into EDRAM buffer
// TODO(DrChat): Stencil copies.
VkBufferImageCopy copy_range;
copy_range.bufferOffset = edram_base * 5120;
copy_range.bufferImageHeight = 0;
copy_range.bufferRowLength = 0;
copy_range.imageSubresource = {
VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1,
VK_IMAGE_ASPECT_DEPTH_BIT, 0, 0, 1,
};
copy_range.imageExtent = {key.tile_width * 80u, key.tile_height * 16u, 1u};
copy_range.imageOffset = {0, 0, 0};
@ -1131,6 +1206,11 @@ void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer,
&copy_range);
}
// Records a fill of edram_buffer_ with `value`, starting at offset 0 and
// spanning kEdramBufferCapacity.
void RenderCache::FillEDRAM(VkCommandBuffer command_buffer, uint32_t value) {
  const uint32_t fill_pattern = value;
  vkCmdFillBuffer(command_buffer, edram_buffer_, /*dstOffset=*/0,
                  /*size=*/kEdramBufferCapacity, /*data=*/fill_pattern);
}
bool RenderCache::SetShadowRegister(uint32_t* dest, uint32_t register_name) {
uint32_t value = register_file_->values[register_name].u32;
if (*dest == value) {

View File

@ -38,9 +38,9 @@ struct TileViewKey {
// 1 if format is ColorRenderTargetFormat, else DepthRenderTargetFormat.
uint16_t color_or_depth : 1;
// Surface MSAA samples
// uint16_t msaa_samples : 2;
uint16_t msaa_samples : 2;
// Either ColorRenderTargetFormat or DepthRenderTargetFormat.
uint16_t edram_format : 15; // 13;
uint16_t edram_format : 13;
};
static_assert(sizeof(TileViewKey) == 8, "Key must be tightly packed");
@ -69,6 +69,10 @@ class CachedTileView {
return *a == *b;
}
// Orders tile views by ascending key.tile_offset; no other key field takes
// part in the comparison.
bool operator<(const CachedTileView& other) const {
  const CachedTileView& rhs = other;
  return rhs.key.tile_offset > key.tile_offset;
}
private:
VkDevice device_ = nullptr;
};
@ -278,22 +282,26 @@ class RenderCache {
// Queues commands to blit EDRAM contents into an image.
// The command buffer must not be inside of a render pass when calling this.
void BlitToImage(VkCommandBuffer command_buffer, uint32_t edram_base,
uint32_t pitch, uint32_t height, VkImage image,
VkImageLayout image_layout, bool color_or_depth,
uint32_t format, VkFilter filter, VkOffset3D offset,
VkExtent3D extents);
uint32_t pitch, uint32_t height, MsaaSamples num_samples,
VkImage image, VkImageLayout image_layout,
bool color_or_depth, uint32_t format, VkFilter filter,
VkOffset3D offset, VkExtent3D extents);
// Queues commands to clear EDRAM contents with a solid color.
// The command buffer must not be inside of a render pass when calling this.
void ClearEDRAMColor(VkCommandBuffer command_buffer, uint32_t edram_base,
ColorRenderTargetFormat format, uint32_t pitch,
uint32_t height, float* color);
uint32_t height, MsaaSamples num_samples, float* color);
// Queues commands to clear EDRAM contents with depth/stencil values.
// The command buffer must not be inside of a render pass when calling this.
void ClearEDRAMDepthStencil(VkCommandBuffer command_buffer,
uint32_t edram_base,
DepthRenderTargetFormat format, uint32_t pitch,
uint32_t height, float depth, uint32_t stencil);
uint32_t height, MsaaSamples num_samples,
float depth, uint32_t stencil);
// Queues commands to fill EDRAM contents with a constant value.
// The command buffer must not be inside of a render pass when calling this.
void FillEDRAM(VkCommandBuffer command_buffer, uint32_t value);
private:
// Parses the current state into a configuration object.
@ -306,6 +314,9 @@ class RenderCache {
CachedTileView* FindOrCreateTileView(VkCommandBuffer command_buffer,
const TileViewKey& view_key);
void UpdateTileView(VkCommandBuffer command_buffer, CachedTileView* view,
bool load, bool insert_barrier = true);
// Gets or creates a render pass and frame buffer for the given configuration.
// This attempts to reuse as much as possible across render passes and
// framebuffers.
@ -335,6 +346,7 @@ class RenderCache {
struct ShadowRegisters {
uint32_t rb_modecontrol;
uint32_t rb_surface_info;
uint32_t rb_color_mask;
uint32_t rb_color_info;
uint32_t rb_color1_info;
uint32_t rb_color2_info;

View File

@ -152,19 +152,8 @@ void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr,
// TODO(benvanik): move to CP or to host (trace dump, etc).
// This only needs to surround a vkQueueSubmit.
static uint32_t frame = 0;
if (device_->is_renderdoc_attached() &&
(FLAGS_vulkan_renderdoc_capture_all ||
trace_state_ == TraceState::kSingleFrame)) {
if (queue_mutex_) {
queue_mutex_->lock();
}
device_->BeginRenderDocFrameCapture();
if (queue_mutex_) {
queue_mutex_->unlock();
}
if (queue_mutex_) {
queue_mutex_->lock();
}
// TODO(DrChat): If setup buffer is empty, don't bother queueing it up.
@ -182,45 +171,37 @@ void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr,
submit_info.signalSemaphoreCount = 0;
submit_info.pSignalSemaphores = nullptr;
if (queue_mutex_) {
queue_mutex_->lock();
// queue_mutex_->lock();
}
status = vkQueueSubmit(queue_, 1, &submit_info, *current_batch_fence_);
if (queue_mutex_) {
queue_mutex_->unlock();
// queue_mutex_->unlock();
}
CheckResult(status, "vkQueueSubmit");
// TODO(DrChat): Disable this completely.
VkFence fences[] = {*current_batch_fence_};
status = vkWaitForFences(*device_, 1, fences, true, -1);
CheckResult(status, "vkWaitForFences");
// Capture condition: RenderDoc attached and either the capture-all flag is set
// or a single-frame trace was requested.
if (device_->is_renderdoc_attached() &&
(FLAGS_vulkan_renderdoc_capture_all ||
trace_state_ == TraceState::kSingleFrame)) {
if (queue_mutex_) {
queue_mutex_->lock();
}
// End the RenderDoc capture only while one is in progress (capturing_ set),
// then clear the flag.
if (device_->is_renderdoc_attached() && capturing_) {
device_->EndRenderDocFrameCapture();
capturing_ = false;
// HACK(DrChat): Used b/c I disabled trace saving code in the CP.
// Remove later.
// With no trace file open, tracing is switched back to disabled.
if (!trace_writer_.is_open()) {
trace_state_ = TraceState::kDisabled;
}
if (queue_mutex_) {
queue_mutex_->unlock();
}
}
if (queue_mutex_) {
queue_mutex_->unlock();
}
// Scavenging.
// Clear the current command/setup buffer handles before scavenging.
current_command_buffer_ = nullptr;
current_setup_buffer_ = nullptr;
// Scavenge repeatedly, yielding between attempts, for as long as the pool
// reports pending entries.
while (command_buffer_pool_->has_pending()) {
command_buffer_pool_->Scavenge();
xe::threading::MaybeYield();
}
command_buffer_pool_->Scavenge();
texture_cache_->Scavenge();
current_batch_fence_ = nullptr;
@ -331,6 +312,22 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type,
vkBeginCommandBuffer(current_setup_buffer_, &command_buffer_begin_info);
CheckResult(status, "vkBeginCommandBuffer");
static uint32_t frame = 0;
// Begin a RenderDoc frame capture when RenderDoc is attached, no capture is
// already running (capturing_ is false), and either the capture-all flag is
// set or a single-frame trace was requested.
if (device_->is_renderdoc_attached() && !capturing_ &&
(FLAGS_vulkan_renderdoc_capture_all ||
trace_state_ == TraceState::kSingleFrame)) {
// queue_mutex_ may be null; when set, it is held while the flag is raised
// and the capture is started.
if (queue_mutex_) {
queue_mutex_->lock();
}
capturing_ = true;
device_->BeginRenderDocFrameCapture();
if (queue_mutex_) {
queue_mutex_->unlock();
}
}
started_command_buffer = true;
}
auto command_buffer = current_command_buffer_;
@ -357,6 +354,10 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type,
current_render_state_ = render_cache_->BeginRenderPass(
command_buffer, vertex_shader, pixel_shader);
if (!current_render_state_) {
// No render state came back: cancel the batch, clear the current
// command/setup buffer and fence handles, and report failure.
command_buffer_pool_->CancelBatch();
current_command_buffer_ = nullptr;
current_setup_buffer_ = nullptr;
current_batch_fence_ = nullptr;
return false;
}
}
@ -378,18 +379,30 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type,
// Each of the three populate steps below shares one failure path: end the
// render pass, cancel the batch, clear the current command/setup buffer and
// fence handles, and return false.

// Pass registers to the shaders.
if (!PopulateConstants(command_buffer, vertex_shader, pixel_shader)) {
render_cache_->EndRenderPass();
command_buffer_pool_->CancelBatch();
current_command_buffer_ = nullptr;
current_setup_buffer_ = nullptr;
current_batch_fence_ = nullptr;
return false;
}
// Upload and bind index buffer data (if we have any).
if (!PopulateIndexBuffer(command_buffer, index_buffer_info)) {
render_cache_->EndRenderPass();
command_buffer_pool_->CancelBatch();
current_command_buffer_ = nullptr;
current_setup_buffer_ = nullptr;
current_batch_fence_ = nullptr;
return false;
}
// Upload and bind all vertex buffer data.
if (!PopulateVertexBuffers(command_buffer, vertex_shader)) {
render_cache_->EndRenderPass();
command_buffer_pool_->CancelBatch();
current_command_buffer_ = nullptr;
current_setup_buffer_ = nullptr;
current_batch_fence_ = nullptr;
return false;
}
@ -423,6 +436,10 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type,
// Uploads the constant registers for the given vertex and pixel shaders.
// The draw path treats a false return as a failure.
bool VulkanCommandProcessor::PopulateConstants(VkCommandBuffer command_buffer,
VulkanShader* vertex_shader,
VulkanShader* pixel_shader) {
// The CPU profiling scope is compiled in only when FINE_GRAINED_DRAW_SCOPES
// evaluates to nonzero.
#if FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // FINE_GRAINED_DRAW_SCOPES
// Upload the constants the shaders require.
// These are optional, and if none are defined 0 will be returned.
auto constant_offsets = buffer_cache_->UploadConstantRegisters(
@ -742,7 +759,7 @@ bool VulkanCommandProcessor::IssueCopy() {
tex_info.size_2d.input_height = dest_block_height;
// NOTE(review): the * 4 presumably assumes 4 bytes per pixel - confirm.
tex_info.size_2d.input_pitch = copy_dest_pitch * 4;
// Ask the texture cache for the resolve destination texture, using the
// texture format derived from the copy destination format.
auto texture = texture_cache_->DemandResolveTexture(
tex_info, ColorFormatToTextureFormat(copy_dest_format), nullptr, nullptr);
tex_info, ColorFormatToTextureFormat(copy_dest_format), nullptr);
// A texture whose layout is still undefined gets a layout transition first.
if (texture->image_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
// Transition the image to a general layout.
VkImageMemoryBarrier image_barrier;
@ -810,8 +827,9 @@ bool VulkanCommandProcessor::IssueCopy() {
case CopyCommand::kConvert:
// Blit into the resolve texture's image with linear filtering, limited to
// resolve_offset/resolve_extent.
// NOTE(review): copy_src_select <= 3 presumably selects a color (rather
// than depth) source - confirm against BlitToImage's signature.
render_cache_->BlitToImage(
command_buffer, edram_base, surface_pitch, resolve_extent.height,
texture->image, texture->image_layout, copy_src_select <= 3,
src_format, VK_FILTER_LINEAR, resolve_offset, resolve_extent);
surface_msaa, texture->image, texture->image_layout,
copy_src_select <= 3, src_format, VK_FILTER_LINEAR, resolve_offset,
resolve_extent);
break;
case CopyCommand::kConstantOne:
@ -839,7 +857,7 @@ bool VulkanCommandProcessor::IssueCopy() {
// Clear the color target at color_edram_base. The resolve extent's height is
// passed in the height argument.
// TODO(DrChat): Do we know the surface height at this point?
render_cache_->ClearEDRAMColor(command_buffer, color_edram_base,
color_format, surface_pitch,
resolve_extent.height, color);
resolve_extent.height, surface_msaa, color);
}
if (depth_clear_enabled) {
@ -850,7 +868,7 @@ bool VulkanCommandProcessor::IssueCopy() {
// Clear depth and stencil at depth_edram_base. The resolve extent's height is
// passed in the height argument.
// TODO(DrChat): Do we know the surface height at this point?
render_cache_->ClearEDRAMDepthStencil(
command_buffer, depth_edram_base, depth_format, surface_pitch,
resolve_extent.height, depth, stencil);
resolve_extent.height, surface_msaa, depth, stencil);
}
return true;

View File

@ -94,6 +94,7 @@ class VulkanCommandProcessor : public CommandProcessor {
// Last copy base address, for debugging only.
uint32_t last_copy_base_ = 0;
// Set when a RenderDoc frame capture is begun and cleared when it is ended.
bool capturing_ = false;
std::unique_ptr<BufferCache> buffer_cache_;
std::unique_ptr<PipelineCache> pipeline_cache_;