GS:HW: Add option to spin CPU during readbacks

This commit is contained in:
TellowKrinkle 2022-09-27 23:54:33 -07:00 committed by refractionpcsx2
parent 89dd7f19ef
commit ac1f31f2cd
20 changed files with 133 additions and 38 deletions

View File

@ -17,6 +17,7 @@
#include "common/D3D12/Context.h"
#include "common/Assertions.h"
#include "common/General.h"
#include "common/ScopedGuard.h"
#include "common/Console.h"
#include "D3D12MemAlloc.h"
@ -382,7 +383,7 @@ void Context::MoveToNextCommandList()
// We may have to wait if this command list hasn't finished on the GPU.
CommandListResources& res = m_command_lists[m_current_command_list];
WaitForFence(res.ready_fence_value);
WaitForFence(res.ready_fence_value, false);
res.ready_fence_value = m_current_fence_value;
res.init_command_list_used = false;
@ -445,7 +446,7 @@ ID3D12GraphicsCommandList4* Context::GetInitCommandList()
return res.command_lists[0].get();
}
void Context::ExecuteCommandList(bool wait_for_completion)
void Context::ExecuteCommandList(WaitType wait_for_completion)
{
CommandListResources& res = m_command_lists[m_current_command_list];
HRESULT hr;
@ -485,8 +486,8 @@ void Context::ExecuteCommandList(bool wait_for_completion)
pxAssertRel(SUCCEEDED(hr), "Signal fence");
MoveToNextCommandList();
if (wait_for_completion)
WaitForFence(res.ready_fence_value);
if (wait_for_completion != WaitType::None)
WaitForFence(res.ready_fence_value, wait_for_completion == WaitType::Spin);
}
void Context::InvalidateSamplerGroups()
@ -547,7 +548,7 @@ void Context::DestroyPendingResources(CommandListResources& cmdlist)
void Context::DestroyResources()
{
ExecuteCommandList(true);
ExecuteCommandList(WaitType::Sleep);
m_texture_stream_buffer.Destroy(false);
m_descriptor_heap_manager.Free(&m_null_srv_descriptor);
@ -573,20 +574,30 @@ void Context::DestroyResources()
m_device.reset();
}
void Context::WaitForFence(u64 fence)
void Context::WaitForFence(u64 fence, bool spin)
{
if (m_completed_fence_value >= fence)
return;
// Try non-blocking check.
m_completed_fence_value = m_fence->GetCompletedValue();
if (m_completed_fence_value < fence)
if (spin)
{
// Fall back to event.
HRESULT hr = m_fence->SetEventOnCompletion(fence, m_fence_event);
pxAssertRel(SUCCEEDED(hr), "Set fence event on completion");
WaitForSingleObject(m_fence_event, INFINITE);
u64 value;
while ((value = m_fence->GetCompletedValue()) < fence)
ShortSpin();
m_completed_fence_value = value;
}
else
{
// Try non-blocking check.
m_completed_fence_value = m_fence->GetCompletedValue();
if (m_completed_fence_value < fence)
{
// Fall back to event.
HRESULT hr = m_fence->SetEventOnCompletion(fence, m_fence_event);
pxAssertRel(SUCCEEDED(hr), "Set fence event on completion");
WaitForSingleObject(m_fence_event, INFINITE);
m_completed_fence_value = m_fence->GetCompletedValue();
}
}
// Release resources for as many command lists which have completed.
@ -607,7 +618,7 @@ void Context::WaitForGPUIdle()
u32 index = (m_current_command_list + 1) % NUM_COMMAND_LISTS;
for (u32 i = 0; i < (NUM_COMMAND_LISTS - 1); i++)
{
WaitForFence(m_command_lists[index].ready_fence_value);
WaitForFence(m_command_lists[index].ready_fence_value, false);
index = (index + 1) % NUM_COMMAND_LISTS;
}
}

View File

@ -122,11 +122,18 @@ namespace D3D12
/// Test for support for the specified texture format.
bool SupportsTextureFormat(DXGI_FORMAT format);
/// Selects how ExecuteCommandList() waits on the fence of the command list it just submitted.
enum class WaitType
{
None, ///< Don't wait (async): return without waiting on the fence
Sleep, ///< Wait normally: block on the fence event until the fence is reached
Spin, ///< Wait by spinning: poll the fence's completed value with ShortSpin() instead of blocking
};
/// Executes the current command list.
void ExecuteCommandList(bool wait_for_completion);
void ExecuteCommandList(WaitType wait_for_completion);
/// Waits for a specific fence.
void WaitForFence(u64 fence);
void WaitForFence(u64 fence, bool spin);
/// Waits for any in-flight command buffers to complete.
void WaitForGPUIdle();

View File

@ -273,7 +273,7 @@ bool StreamBuffer::WaitForClearSpace(u32 num_bytes)
return false;
// Wait until this fence is signaled. This will fire the callback, updating the GPU position.
g_d3d12_context->WaitForFence(iter->first);
g_d3d12_context->WaitForFence(iter->first, false);
m_tracked_fences.erase(m_tracked_fences.begin(), m_current_offset == iter->second ? m_tracked_fences.end() : ++iter);
m_current_offset = new_offset;
m_current_space = new_space;

View File

@ -293,7 +293,7 @@ ID3D12GraphicsCommandList* Texture::BeginStreamUpdate(ID3D12GraphicsCommandList*
{
DevCon.WriteLn("Executing command buffer while waiting for %u bytes (%ux%u) in upload buffer", upload_size, width,
height);
g_d3d12_context->ExecuteCommandList(false);
g_d3d12_context->ExecuteCommandList(Context::WaitType::None);
if (!g_d3d12_context->GetTextureStreamBuffer().ReserveMemory(upload_size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT))
{
Console.Error("Failed to reserve %u bytes for %ux%u upload", upload_size, width, height);

View File

@ -17,6 +17,7 @@
#include "common/Align.h"
#include "common/Assertions.h"
#include "common/Console.h"
#include "common/General.h"
#include "common/StringUtil.h"
#include "common/Vulkan/ShaderCompiler.h"
#include "common/Vulkan/SwapChain.h"
@ -1437,18 +1438,23 @@ namespace Vulkan
vmaSetCurrentFrameIndex(m_allocator, static_cast<u32>(m_next_fence_counter));
}
void Context::ExecuteCommandBuffer(bool wait_for_completion)
void Context::ExecuteCommandBuffer(WaitType wait_for_completion)
{
// If we're waiting for completion, don't bother waking the worker thread.
const u32 current_frame = m_current_frame;
SubmitCommandBuffer();
MoveToNextCommandBuffer();
if (wait_for_completion)
if (wait_for_completion != WaitType::None)
{
// Calibrate while we wait
if (m_wants_new_timestamp_calibration)
CalibrateSpinTimestamp();
if (wait_for_completion == WaitType::Spin)
{
while (vkGetFenceStatus(m_device, m_frame_resources[current_frame].fence) == VK_NOT_READY)
ShortSpin();
}
WaitForCommandBufferCompletion(current_frame);
}
}

View File

@ -198,7 +198,14 @@ namespace Vulkan
uint32_t present_image_index = 0xFFFFFFFF, bool submit_on_thread = false);
void MoveToNextCommandBuffer();
void ExecuteCommandBuffer(bool wait_for_completion);
/// Selects how ExecuteCommandBuffer() waits for the command buffer it just submitted.
enum class WaitType
{
None, ///< Don't wait for the submitted command buffer
Sleep, ///< Wait via WaitForCommandBufferCompletion()
Spin, ///< Poll the frame's fence with ShortSpin() while it is VK_NOT_READY, then finish as for Sleep
};
void ExecuteCommandBuffer(WaitType wait_for_completion);
void WaitForPresentComplete();
// Was the last present submitted to the queue a failure? If so, we must recreate our swapchain.

View File

@ -99,6 +99,7 @@ GraphicsSettingsWidget::GraphicsSettingsWidget(SettingsDialog* dialog, QWidget*
SettingWidgetBinder::BindWidgetToIntSetting(sif, m_ui.vsync, "EmuCore/GS", "VsyncEnable", 0);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.enableHWFixes, "EmuCore/GS", "UserHacks", false);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.spinGPUDuringReadbacks, "EmuCore/GS", "HWSpinGPUForReadbacks", false);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.spinCPUDuringReadbacks, "EmuCore/GS", "HWSpinCPUForReadbacks", false);
//////////////////////////////////////////////////////////////////////////
// Game Display Settings
@ -415,6 +416,10 @@ GraphicsSettingsWidget::GraphicsSettingsWidget(SettingsDialog* dialog, QWidget*
tr("Submits useless work to the GPU during readbacks to prevent it from going into powersave modes. "
"May improve performance but with a significant increase in power usage."));
// Help text for the CPU-spin checkbox. This must target spinCPUDuringReadbacks: registering it on the
// GPU checkbox would replace that widget's help and leave the CPU checkbox without any.
dialog->registerWidgetHelp(m_ui.spinCPUDuringReadbacks, tr("Spin CPU During Readbacks"), tr("Unchecked"),
	tr("Does useless work on the CPU during readbacks to prevent it from going into powersave modes. "
	   "May improve performance but with a significant increase in power usage."));
// Software
dialog->registerWidgetHelp(m_ui.extraSWThreads, tr("Extra Rendering Threads"), tr("2 threads"),
tr("Number of rendering threads: 0 for single thread, 2 or more for multithread (1 is for debugging). "

View File

@ -650,6 +650,13 @@
</property>
</widget>
</item>
<item row="1" column="1">
<widget class="QCheckBox" name="spinCPUDuringReadbacks">
<property name="text">
<string>Spin CPU During Readbacks</string>
</property>
</widget>
</item>
</layout>
</item>
</layout>

View File

@ -497,6 +497,7 @@ struct Pcsx2Config
bool
HWSpinGPUForReadbacks : 1,
HWSpinCPUForReadbacks : 1,
GPUPaletteConversion : 1,
AutoFlushSW : 1,
PreloadFrameWithGSData : 1,

View File

@ -333,7 +333,7 @@ bool D3D12HostDisplay::ChangeWindow(const WindowInfo& new_wi)
void D3D12HostDisplay::DestroySurface()
{
// For some reason if we don't execute the command list here, the swap chain is in use.. not sure where.
g_d3d12_context->ExecuteCommandList(true);
g_d3d12_context->ExecuteCommandList(D3D12::Context::WaitType::Sleep);
if (IsFullscreen())
SetFullscreen(false, 0, 0, 0.0f);
@ -438,7 +438,7 @@ void D3D12HostDisplay::ResizeWindow(s32 new_window_width, s32 new_window_height,
return;
// For some reason if we don't execute the command list here, the swap chain is in use.. not sure where.
g_d3d12_context->ExecuteCommandList(true);
g_d3d12_context->ExecuteCommandList(D3D12::Context::WaitType::Sleep);
DestroySwapChainRTVs();
@ -509,7 +509,7 @@ bool D3D12HostDisplay::SetFullscreen(bool fullscreen, u32 width, u32 height, flo
return true;
}
g_d3d12_context->ExecuteCommandList(true);
g_d3d12_context->ExecuteCommandList(D3D12::Context::WaitType::Sleep);
DestroySwapChainRTVs();
m_swap_chain.reset();
@ -585,7 +585,7 @@ void D3D12HostDisplay::EndPresent()
m_current_swap_chain_buffer = ((m_current_swap_chain_buffer + 1) % static_cast<u32>(m_swap_chain_buffers.size()));
swap_chain_buf.TransitionToState(g_d3d12_context->GetCommandList(), D3D12_RESOURCE_STATE_PRESENT);
g_d3d12_context->ExecuteCommandList(false);
g_d3d12_context->ExecuteCommandList(D3D12::Context::WaitType::None);
const bool vsync = static_cast<UINT>(m_vsync_mode != VsyncMode::Off);
if (!vsync && m_using_allow_tearing)

View File

@ -86,7 +86,7 @@ bool VulkanHostDisplay::ChangeWindow(const WindowInfo& new_wi)
if (new_wi.type == WindowInfo::Type::Surfaceless)
{
g_vulkan_context->ExecuteCommandBuffer(true);
g_vulkan_context->ExecuteCommandBuffer(Vulkan::Context::WaitType::Sleep);
m_swap_chain.reset();
m_window_info = new_wi;
return true;
@ -209,7 +209,7 @@ static bool UploadBufferToTexture(
if (!buf.ReserveMemory(upload_size, g_vulkan_context->GetBufferCopyOffsetAlignment()))
{
Console.WriteLn("Executing command buffer for UploadBufferToTexture()");
g_vulkan_context->ExecuteCommandBuffer(false);
g_vulkan_context->ExecuteCommandBuffer(Vulkan::Context::WaitType::None);
if (!buf.ReserveMemory(upload_size, g_vulkan_context->GetBufferCopyOffsetAlignment()))
{
Console.WriteLn("Failed to allocate %u bytes in stream buffer for UploadBufferToTexture()", upload_size);
@ -365,7 +365,7 @@ bool VulkanHostDisplay::BeginPresent(bool frame_skip)
if (!m_swap_chain->RecreateSurface(m_window_info))
{
Console.Error("Failed to recreate surface after loss");
g_vulkan_context->ExecuteCommandBuffer(false);
g_vulkan_context->ExecuteCommandBuffer(Vulkan::Context::WaitType::None);
return false;
}
@ -378,7 +378,7 @@ bool VulkanHostDisplay::BeginPresent(bool frame_skip)
{
// Still submit the command buffer, otherwise we'll end up with several frames waiting.
LOG_VULKAN_ERROR(res, "vkAcquireNextImageKHR() failed: ");
g_vulkan_context->ExecuteCommandBuffer(false);
g_vulkan_context->ExecuteCommandBuffer(Vulkan::Context::WaitType::None);
return false;
}
}

View File

@ -165,7 +165,7 @@ void ImGui_ImplDX12_RenderDrawData(ImDrawData* draw_data)
if (!bd->VertexStreamBuffer.ReserveMemory(needed_vb, sizeof(ImDrawVert)) ||
!bd->IndexStreamBuffer.ReserveMemory(needed_ib, sizeof(ImDrawIdx)))
{
g_d3d12_context->ExecuteCommandList(false);
g_d3d12_context->ExecuteCommandList(D3D12::Context::WaitType::None);
if (!bd->VertexStreamBuffer.ReserveMemory(needed_vb, sizeof(ImDrawVert)) ||
!bd->IndexStreamBuffer.ReserveMemory(needed_ib, sizeof(ImDrawIdx)))
{
@ -230,7 +230,7 @@ void ImGui_ImplDX12_RenderDrawData(ImDrawData* draw_data)
if (!g_d3d12_context->GetDescriptorAllocator().Allocate(1, &handle))
{
// ugh.
g_d3d12_context->ExecuteCommandList(false);
g_d3d12_context->ExecuteCommandList(D3D12::Context::WaitType::None);
ctx = g_d3d12_context->GetCommandList();
ImGui_ImplDX12_SetupRenderState(draw_data, ctx);
if (!g_d3d12_context->GetDescriptorAllocator().Allocate(1, &handle))

View File

@ -1446,6 +1446,7 @@ void GSApp::Init()
m_default_configuration["HWDownloadMode"] = std::to_string(static_cast<u8>(GSHardwareDownloadMode::Enabled));
m_default_configuration["GSDumpCompression"] = std::to_string(static_cast<u8>(GSDumpCompressionMethod::LZMA));
m_default_configuration["HWSpinGPUForReadbacks"] = "0";
m_default_configuration["HWSpinCPUForReadbacks"] = "0";
m_default_configuration["pcrtc_antiblur"] = "1";
m_default_configuration["disable_interlace_offset"] = "0";
m_default_configuration["pcrtc_offsets"] = "0";

View File

@ -1445,7 +1445,7 @@ void GSDevice12::DestroyStagingBuffer()
void GSDevice12::DestroyResources()
{
g_d3d12_context->ExecuteCommandList(true);
g_d3d12_context->ExecuteCommandList(D3D12::Context::WaitType::Sleep);
for (auto& it : m_tfx_pipelines)
g_d3d12_context->DeferObjectDestruction(it.second.get());
@ -1734,10 +1734,20 @@ void GSDevice12::InitializeSamplers()
pxFailRel("Failed to initialize samplers");
}
/// Translates a (wait, spin) flag pair into the D3D12 context's wait mode.
/// @param wait Whether the caller wants to wait for completion at all.
/// @param spin When waiting, whether to spin rather than sleep. Ignored if `wait` is false.
/// @return None if not waiting, otherwise Spin or Sleep according to `spin`.
static D3D12::Context::WaitType GetWaitType(bool wait, bool spin)
{
	using WaitType = D3D12::Context::WaitType;

	if (wait)
		return spin ? WaitType::Spin : WaitType::Sleep;

	return WaitType::None;
}
void GSDevice12::ExecuteCommandList(bool wait_for_completion)
{
EndRenderPass();
g_d3d12_context->ExecuteCommandList(wait_for_completion);
g_d3d12_context->ExecuteCommandList(GetWaitType(wait_for_completion, GSConfig.HWSpinCPUForReadbacks));
InvalidateCachedState();
}
@ -1758,7 +1768,7 @@ void GSDevice12::ExecuteCommandListAndRestartRenderPass(const char* reason)
const bool was_in_render_pass = m_in_render_pass;
EndRenderPass();
g_d3d12_context->ExecuteCommandList(false);
g_d3d12_context->ExecuteCommandList(D3D12::Context::WaitType::None);
InvalidateCachedState();
if (was_in_render_pass)

View File

@ -26,6 +26,21 @@
#ifdef __APPLE__
#include "GSMTLSharedHeader.h"
/// Reports whether a Metal command buffer status is a finished one.
/// @param status Status value read from the command buffer.
/// @return true for Completed and Error, false for every status that precedes them.
static constexpr bool IsCommandBufferCompleted(MTLCommandBufferStatus status)
{
	// Every enumerator is listed explicitly and there is deliberately no default label.
	switch (status)
	{
		// Finished: the buffer will not progress any further.
		case MTLCommandBufferStatusError:
		case MTLCommandBufferStatusCompleted:
			return true;

		// Not finished yet.
		case MTLCommandBufferStatusScheduled:
		case MTLCommandBufferStatusCommitted:
		case MTLCommandBufferStatusEnqueued:
		case MTLCommandBufferStatusNotEnqueued:
			return false;
	}
}
GSDevice* MakeGSDeviceMTL()
{
return new GSDeviceMTL();
@ -1055,7 +1070,15 @@ bool GSDeviceMTL::DownloadTexture(GSTexture* src, const GSVector4i& rect, GSText
m_spin_timer = 30;
}
}
[cmdbuf waitUntilCompleted];
if (GSConfig.HWSpinCPUForReadbacks)
{
while (!IsCommandBufferCompleted([cmdbuf status]))
ShortSpin();
}
else
{
[cmdbuf waitUntilCompleted];
}
out_map.bits = static_cast<u8*>([m_texture_download_buf contents]);
g_perfmon.Put(GSPerfMon::Readbacks, 1);

View File

@ -1822,7 +1822,7 @@ void GSDeviceVK::DestroyStagingBuffer()
void GSDeviceVK::DestroyResources()
{
g_vulkan_context->ExecuteCommandBuffer(true);
g_vulkan_context->ExecuteCommandBuffer(Vulkan::Context::WaitType::Sleep);
if (m_tfx_descriptor_sets[0] != VK_NULL_HANDLE)
g_vulkan_context->FreeGlobalDescriptorSet(m_tfx_descriptor_sets[0]);
@ -2202,10 +2202,20 @@ bool GSDeviceVK::CreatePersistentDescriptorSets()
return true;
}
/// Translates a (wait, spin) flag pair into the Vulkan context's wait mode.
/// @param wait Whether the caller wants to wait for completion at all.
/// @param spin When waiting, whether to spin rather than sleep. Ignored if `wait` is false.
/// @return None if not waiting, otherwise Spin or Sleep according to `spin`.
static Vulkan::Context::WaitType GetWaitType(bool wait, bool spin)
{
	using WaitType = Vulkan::Context::WaitType;

	if (wait)
		return spin ? WaitType::Spin : WaitType::Sleep;

	return WaitType::None;
}
void GSDeviceVK::ExecuteCommandBuffer(bool wait_for_completion)
{
EndRenderPass();
g_vulkan_context->ExecuteCommandBuffer(wait_for_completion);
g_vulkan_context->ExecuteCommandBuffer(GetWaitType(wait_for_completion, GSConfig.HWSpinCPUForReadbacks));
InvalidateCachedState();
}
@ -2227,7 +2237,7 @@ void GSDeviceVK::ExecuteCommandBufferAndRestartRenderPass(const char* reason)
const VkRenderPass render_pass = m_current_render_pass;
const GSVector4i render_pass_area(m_current_render_pass_area);
EndRenderPass();
g_vulkan_context->ExecuteCommandBuffer(false);
g_vulkan_context->ExecuteCommandBuffer(Vulkan::Context::WaitType::None);
InvalidateCachedState();
if (render_pass != VK_NULL_HANDLE)

View File

@ -167,6 +167,9 @@ const char* dialog_message(int ID, bool* updateText)
case IDC_SPIN_GPU:
return cvtString("Submits useless work to the GPU during readbacks to prevent it from going into powersave modes.\n"
"May improve performance but with a significant increase in power usage.");
case IDC_SPIN_CPU:
	// Tooltip for the "Spin CPU During Readbacks" checkbox.
	return cvtString("Does useless work on the CPU during readbacks to prevent it from going into powersave modes.\n"
		"May improve performance but with a significant increase in power usage.");
case IDC_LINEAR_PRESENT:
return cvtString("Use bilinear filtering when Upscaling/Downscaling the image to the screen. Disable it if you want a sharper/pixelated output.");
// Exclusive for Hardware Renderer

View File

@ -88,6 +88,7 @@ enum
// OpenGL Advanced Settings
IDC_GEOMETRY_SHADER_OVERRIDE,
IDC_SPIN_GPU,
IDC_SPIN_CPU,
// On-screen Display
IDC_OSD_LOG,
IDC_OSD_MONITOR,

View File

@ -285,6 +285,7 @@ RendererTab::RendererTab(wxWindow* parent)
auto* paltex_prereq = m_ui.addCheckBox(hw_checks_box, "GPU Palette Conversion", "paltex", IDC_PALTEX, hw_prereq);
m_ui.addCheckBox(hw_checks_box, "Spin GPU During Readbacks", "HWSpinGPUForReadbacks", IDC_SPIN_GPU);
m_ui.addCheckBox(hw_checks_box, "Spin CPU During Readbacks", "HWSpinCPUForReadbacks", IDC_SPIN_CPU);
auto aniso_prereq = [this, paltex_prereq]{ return m_is_hardware && paltex_prereq->GetValue() == false; };
auto* hw_choice_grid = new wxFlexGridSizer(2, space, space);

View File

@ -326,6 +326,7 @@ Pcsx2Config::GSOptions::GSOptions()
HWDownloadMode = GSHardwareDownloadMode::Enabled;
HWSpinGPUForReadbacks = false;
HWSpinCPUForReadbacks = false;
GPUPaletteConversion = false;
AutoFlushSW = true;
PreloadFrameWithGSData = false;
@ -550,6 +551,7 @@ void Pcsx2Config::GSOptions::ReloadIniSettings()
GSSettingBool(OsdShowInputs);
GSSettingBool(HWSpinGPUForReadbacks);
GSSettingBool(HWSpinCPUForReadbacks);
GSSettingBoolEx(GPUPaletteConversion, "paltex");
GSSettingBoolEx(AutoFlushSW, "autoflush_sw");
GSSettingBoolEx(PreloadFrameWithGSData, "preload_frame_with_gs_data");