GS:MTL: Add option to spin GPU during readbacks

This commit is contained in:
TellowKrinkle 2022-09-17 20:46:27 -05:00 committed by refractionpcsx2
parent ea35619a78
commit 73044dffed
13 changed files with 173 additions and 11 deletions

View File

@ -98,6 +98,7 @@ GraphicsSettingsWidget::GraphicsSettingsWidget(SettingsDialog* dialog, QWidget*
SettingWidgetBinder::BindWidgetToStringSetting(sif, m_ui.adapter, "EmuCore/GS", "Adapter");
SettingWidgetBinder::BindWidgetToIntSetting(sif, m_ui.vsync, "EmuCore/GS", "VsyncEnable", 0);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.enableHWFixes, "EmuCore/GS", "UserHacks", false);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.spinGPUDuringReadbacks, "EmuCore/GS", "HWSpinGPUForReadbacks", false);
//////////////////////////////////////////////////////////////////////////
// Game Display Settings
@ -410,6 +411,10 @@ GraphicsSettingsWidget::GraphicsSettingsWidget(SettingsDialog* dialog, QWidget*
"to your games. However IF you have ENABLED this, you WILL DISABLE AUTOMATIC "
"SETTINGS and you can re-enable automatic settings by unchecking this option."));
dialog->registerWidgetHelp(m_ui.spinGPUDuringReadbacks, tr("Spin GPU During Readbacks"), tr("Unchecked"),
tr("Submits useless work to the GPU during readbacks to prevent it from going into powersave modes. "
"May improve performance but with a significant increase in power usage."));
// Software
dialog->registerWidgetHelp(m_ui.extraSWThreads, tr("Extra Rendering Threads"), tr("2 threads"),
tr("Number of rendering threads: 0 for single thread, 2 or more for multithread (1 is for debugging). "

View File

@ -643,6 +643,13 @@
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QCheckBox" name="spinGPUDuringReadbacks">
<property name="text">
<string>Spin GPU During Readbacks</string>
</property>
</widget>
</item>
</layout>
</item>
</layout>

View File

@ -796,6 +796,7 @@ set(pcsx2GSMetalShaders
GS/Renderers/Metal/convert.metal
GS/Renderers/Metal/present.metal
GS/Renderers/Metal/merge.metal
GS/Renderers/Metal/misc.metal
GS/Renderers/Metal/interlace.metal
GS/Renderers/Metal/tfx.metal
GS/Renderers/Metal/fxaa.metal

View File

@ -496,6 +496,7 @@ struct Pcsx2Config
OsdShowInputs : 1;
bool
HWSpinGPUForReadbacks : 1,
GPUPaletteConversion : 1,
AutoFlushSW : 1,
PreloadFrameWithGSData : 1,

View File

@ -298,6 +298,7 @@ void MetalHostDisplay::EndPresent()
[drawable present];
}];
dev->FlushEncoders();
dev->FrameCompleted();
m_current_drawable = nullptr;
if (m_capture_start_frame)
{

View File

@ -1445,6 +1445,7 @@ void GSApp::Init()
m_default_configuration["fxaa"] = "0";
m_default_configuration["HWDownloadMode"] = std::to_string(static_cast<u8>(GSHardwareDownloadMode::Enabled));
m_default_configuration["GSDumpCompression"] = std::to_string(static_cast<u8>(GSDumpCompressionMethod::LZMA));
m_default_configuration["HWSpinGPUForReadbacks"] = "0";
m_default_configuration["pcrtc_antiblur"] = "1";
m_default_configuration["disable_interlace_offset"] = "0";
m_default_configuration["pcrtc_offsets"] = "0";

View File

@ -25,6 +25,7 @@
#include "common/HashCombine.h"
#include "common/MRCHelpers.h"
#include "common/ReadbackSpinManager.h"
#include "GS/GS.h"
#include "GSMTLDeviceInfo.h"
#include "GSMTLSharedHeader.h"
@ -227,6 +228,14 @@ public:
u64 m_current_draw = 1;
std::atomic<u64> m_last_finished_draw{0};
// Spinning
ReadbackSpinManager m_spin_manager;
u32 m_encoders_in_current_cmdbuf;
u32 m_spin_timer;
MRCOwned<id<MTLComputePipelineState>> m_spin_pipeline;
MRCOwned<id<MTLBuffer>> m_spin_buffer;
MRCOwned<id<MTLFence>> m_spin_fence;
// Functions and Pipeline States
MRCOwned<id<MTLRenderPipelineState>> m_convert_pipeline[static_cast<int>(ShaderConvert::Count)];
MRCOwned<id<MTLRenderPipelineState>> m_present_pipeline[static_cast<int>(PresentShader::Count)];
@ -332,6 +341,8 @@ public:
void EndRenderPass();
/// Begin a new render pass (may reuse existing)
void BeginRenderPass(NSString* name, GSTexture* color, MTLLoadAction color_load, GSTexture* depth, MTLLoadAction depth_load, GSTexture* stencil = nullptr, MTLLoadAction stencil_load = MTLLoadActionDontCare);
/// Call at the end of each frame
void FrameCompleted();
GSTexture* CreateSurface(GSTexture::Type type, int width, int height, int levels, GSTexture::Format format) override;

View File

@ -214,6 +214,7 @@ id<MTLCommandBuffer> GSDeviceMTL::GetRenderCmdBuf()
{
if (!m_current_render_cmdbuf)
{
m_encoders_in_current_cmdbuf = 0;
m_current_render_cmdbuf = MRCRetain([m_queue commandBuffer]);
pxAssertRel(m_current_render_cmdbuf, "Failed to create draw command buffer!");
[m_current_render_cmdbuf setLabel:@"Draw"];
@ -258,15 +259,78 @@ void GSDeviceMTL::FlushEncoders()
[m_late_texture_upload_encoder endEncoding];
m_late_texture_upload_encoder = nil;
}
[m_current_render_cmdbuf addCompletedHandler:[backref = m_backref, draw = m_current_draw](id<MTLCommandBuffer> buf)
u32 spin_cycles = 0;
constexpr double s_to_ns = 1000000000;
if (m_spin_timer)
{
std::lock_guard<std::mutex> guard(backref->first);
if (GSDeviceMTL* dev = backref->second)
dev->DrawCommandBufferFinished(draw, buf);
}];
u32 spin_id;
{
std::lock_guard<std::mutex> guard(m_backref->first);
auto draw = m_spin_manager.DrawSubmitted(m_encoders_in_current_cmdbuf);
u32 constant_offset = 200000 * m_spin_manager.SpinsPerUnitTime(); // 200µs
u32 minimum_spin = 2 * constant_offset; // 400µs (200µs after subtracting constant_offset)
u32 maximum_spin = std::max<u32>(1024, 16000000 * m_spin_manager.SpinsPerUnitTime()); // 16ms
if (draw.recommended_spin > minimum_spin)
spin_cycles = std::min(draw.recommended_spin - constant_offset, maximum_spin);
spin_id = draw.id;
}
[m_current_render_cmdbuf addCompletedHandler:[backref = m_backref, draw = m_current_draw, spin_id](id<MTLCommandBuffer> buf)
{
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunguarded-availability"
// Starting from kernelStartTime includes time the command buffer spent waiting to execute
// This is useful for avoiding issues on GPUs without async compute (Intel) where spinning
// delays the next command buffer start, which then makes the spin manager think it should spin more
// (If a command buffer contains multiple encoders, the GPU will start before the kernel finishes,
// so we choose kernelStartTime over kernelEndTime)
u64 begin = [buf kernelStartTime] * s_to_ns;
u64 end = [buf GPUEndTime] * s_to_ns;
#pragma clang diagnostic pop
std::lock_guard<std::mutex> guard(backref->first);
if (GSDeviceMTL* dev = backref->second)
{
dev->DrawCommandBufferFinished(draw, buf);
dev->m_spin_manager.DrawCompleted(spin_id, begin, end);
}
}];
}
else
{
[m_current_render_cmdbuf addCompletedHandler:[backref = m_backref, draw = m_current_draw](id<MTLCommandBuffer> buf)
{
std::lock_guard<std::mutex> guard(backref->first);
if (GSDeviceMTL* dev = backref->second)
dev->DrawCommandBufferFinished(draw, buf);
}];
}
[m_current_render_cmdbuf commit];
m_current_render_cmdbuf = nil;
m_current_draw++;
if (spin_cycles)
{
id<MTLCommandBuffer> spinCmdBuf = [m_queue commandBuffer];
[spinCmdBuf setLabel:@"Spin"];
id<MTLComputeCommandEncoder> spinCmdEncoder = [spinCmdBuf computeCommandEncoder];
[spinCmdEncoder setLabel:@"Spin"];
[spinCmdEncoder waitForFence:m_spin_fence];
[spinCmdEncoder setComputePipelineState:m_spin_pipeline];
[spinCmdEncoder setBytes:&spin_cycles length:sizeof(spin_cycles) atIndex:0];
[spinCmdEncoder setBuffer:m_spin_buffer offset:0 atIndex:1];
[spinCmdEncoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
[spinCmdEncoder endEncoding];
[spinCmdBuf addCompletedHandler:[backref = m_backref, spin_cycles](id<MTLCommandBuffer> buf)
{
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunguarded-availability"
u64 begin = [buf GPUStartTime] * s_to_ns;
u64 end = [buf GPUEndTime] * s_to_ns;
#pragma clang diagnostic pop
std::lock_guard<std::mutex> guard(backref->first);
if (GSDeviceMTL* dev = backref->second)
dev->m_spin_manager.SpinCompleted(spin_cycles, begin, end);
}];
[spinCmdBuf commit];
}
}
void GSDeviceMTL::EndRenderPass()
@ -274,6 +338,8 @@ void GSDeviceMTL::EndRenderPass()
if (m_current_render.encoder)
{
EndDebugGroup(m_current_render.encoder);
if (m_spin_timer)
[m_current_render.encoder updateFence:m_spin_fence afterStages:MTLRenderStageFragment];
[m_current_render.encoder endEncoding];
m_current_render.encoder = nil;
memset(&m_current_render, 0, offsetof(MainRenderEncoder, depth_sel));
@ -315,6 +381,8 @@ void GSDeviceMTL::BeginRenderPass(NSString* name, GSTexture* color, MTLLoadActio
return;
}
m_encoders_in_current_cmdbuf++;
if (m_late_texture_upload_encoder)
{
[m_late_texture_upload_encoder endEncoding];
@ -365,6 +433,13 @@ void GSDeviceMTL::BeginRenderPass(NSString* name, GSTexture* color, MTLLoadActio
pxAssertRel(m_current_render.encoder, "Failed to create render encoder!");
}
void GSDeviceMTL::FrameCompleted()
{
if (m_spin_timer)
m_spin_timer--;
m_spin_manager.NextFrame();
}
static constexpr MTLPixelFormat ConvertPixelFormat(GSTexture::Format format)
{
switch (format)
@ -608,13 +683,31 @@ bool GSDeviceMTL::Create()
try
{
// Init metal stuff
m_draw_sync_fence = MRCTransfer([m_dev.dev newFence]);
m_fn_constants = MRCTransfer([MTLFunctionConstantValues new]);
vector_float2 upscale2 = vector2(GSConfig.UpscaleMultiplier, GSConfig.UpscaleMultiplier);
[m_fn_constants setConstantValue:&upscale2 type:MTLDataTypeFloat2 atIndex:GSMTLConstantIndex_SCALING_FACTOR];
setFnConstantB(m_fn_constants, m_dev.features.framebuffer_fetch, GSMTLConstantIndex_FRAMEBUFFER_FETCH);
m_draw_sync_fence = MRCTransfer([m_dev.dev newFence]);
[m_draw_sync_fence setLabel:@"Draw Sync Fence"];
m_spin_fence = MRCTransfer([m_dev.dev newFence]);
[m_spin_fence setLabel:@"Spin Fence"];
constexpr MTLResourceOptions spin_opts = MTLResourceStorageModePrivate | MTLResourceHazardTrackingModeUntracked;
m_spin_buffer = MRCTransfer([m_dev.dev newBufferWithLength:4 options:spin_opts]);
[m_spin_buffer setLabel:@"Spin Buffer"];
id<MTLCommandBuffer> initCommands = [m_queue commandBuffer];
id<MTLBlitCommandEncoder> clearSpinBuffer = [initCommands blitCommandEncoder];
[clearSpinBuffer fillBuffer:m_spin_buffer range:NSMakeRange(0, 4) value:0];
[clearSpinBuffer updateFence:m_spin_fence];
[clearSpinBuffer endEncoding];
NSError* err = nullptr;
m_spin_pipeline = MRCTransfer([m_dev.dev newComputePipelineStateWithFunction:LoadShader(@"waste_time") error:&err]);
if (err)
{
Console.Error("Failed to create spin pipeline: %s", [[err localizedDescription] UTF8String]);
return false;
}
m_hw_vertex = MRCTransfer([MTLVertexDescriptor new]);
[[[m_hw_vertex layouts] objectAtIndexedSubscript:GSMTLBufferIndexHWVertices] setStride:sizeof(GSVertex)];
applyAttribute(m_hw_vertex, GSMTLAttributeIndexST, MTLVertexFormatFloat2, offsetof(GSVertex, ST), GSMTLBufferIndexHWVertices);
@ -891,6 +984,8 @@ bool GSDeviceMTL::Create()
m_imgui_pipeline = MakePipeline(pdesc, LoadShader(@"vs_imgui"), LoadShader(@"ps_imgui"), @"imgui");
if (!m_dev.features.texture_swizzle)
m_imgui_pipeline_a8 = MakePipeline(pdesc, LoadShader(@"vs_imgui"), LoadShader(@"ps_imgui_a8"), @"imgui_a8");
[initCommands commit];
}
catch (GSRecoverableError&)
{
@ -946,10 +1041,20 @@ bool GSDeviceMTL::DownloadTexture(GSTexture* src, const GSVector4i& rect, GSText
destinationOffset:0
destinationBytesPerRow:out_map.pitch
destinationBytesPerImage:size];
if (m_spin_timer)
[encoder updateFence:m_spin_fence];
[encoder endEncoding];
[cmdbuf popDebugGroup];
FlushEncoders();
if (@available(macOS 10.15, iOS 10.3, *))
{
if (GSConfig.HWSpinGPUForReadbacks)
{
m_spin_manager.ReadbackRequested();
m_spin_timer = 30;
}
}
[cmdbuf waitUntilCompleted];
out_map.bits = static_cast<u8*>([m_texture_download_buf contents]);

View File

@ -0,0 +1,24 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
kernel void waste_time(constant uint& cycles [[buffer(0)]], device uint* spin [[buffer(1)]])
{
uint value = spin[0];
// The compiler doesn't know, but spin[0] == 0, so this loop won't actually go anywhere
for (uint i = 0; i < cycles; i++)
value = spin[value];
// Store the result back to the buffer so the compiler can't optimize it away
spin[0] = value;
}

View File

@ -164,6 +164,9 @@ const char* dialog_message(int ID, bool* updateText)
case IDC_GEOMETRY_SHADER_OVERRIDE:
return cvtString("Allows the GPU instead of just the CPU to transform lines into sprites. This reduces CPU load and bandwidth requirement, but it is heavier on the GPU.\n"
"Automatic detection is recommended.");
case IDC_SPIN_GPU:
return cvtString("Submits useless work to the GPU during readbacks to prevent it from going into powersave modes.\n"
"May improve performance but with a significant increase in power usage.");
case IDC_LINEAR_PRESENT:
return cvtString("Use bilinear filtering when Upscaling/Downscaling the image to the screen. Disable it if you want a sharper/pixelated output.");
// Exclusive for Hardware Renderer

View File

@ -87,6 +87,7 @@ enum
IDC_SWTHREADS_EDIT,
// OpenGL Advanced Settings
IDC_GEOMETRY_SHADER_OVERRIDE,
IDC_SPIN_GPU,
// On-screen Display
IDC_OSD_LOG,
IDC_OSD_MONITOR,

View File

@ -284,6 +284,7 @@ RendererTab::RendererTab(wxWindow* parent)
auto* hw_checks_box = new wxWrapSizer(wxHORIZONTAL);
auto* paltex_prereq = m_ui.addCheckBox(hw_checks_box, "GPU Palette Conversion", "paltex", IDC_PALTEX, hw_prereq);
m_ui.addCheckBox(hw_checks_box, "Spin GPU During Readbacks", "HWSpinGPUForReadbacks", IDC_SPIN_GPU);
auto aniso_prereq = [this, paltex_prereq]{ return m_is_hardware && paltex_prereq->GetValue() == false; };
auto* hw_choice_grid = new wxFlexGridSizer(2, space, space);
@ -473,10 +474,9 @@ PostTab::PostTab(wxWindow* parent)
auto* shader_boost_grid = new wxFlexGridSizer(2, space, space);
shader_boost_grid->AddGrowableCol(1);
auto shader_boost_prereq = [shade_boost_check, this] { return shade_boost_check.box->GetValue(); };
m_ui.addSliderAndLabel(shader_boost_grid, "Brightness:", "ShadeBoost_Brightness", 0, 100, 50, -1, shader_boost_prereq);
m_ui.addSliderAndLabel(shader_boost_grid, "Contrast:", "ShadeBoost_Contrast", 0, 100, 50, -1, shader_boost_prereq);
m_ui.addSliderAndLabel(shader_boost_grid, "Saturation:", "ShadeBoost_Saturation", 0, 100, 50, -1, shader_boost_prereq);
m_ui.addSliderAndLabel(shader_boost_grid, "Brightness:", "ShadeBoost_Brightness", 0, 100, 50, -1, shade_boost_check);
m_ui.addSliderAndLabel(shader_boost_grid, "Contrast:", "ShadeBoost_Contrast", 0, 100, 50, -1, shade_boost_check);
m_ui.addSliderAndLabel(shader_boost_grid, "Saturation:", "ShadeBoost_Saturation", 0, 100, 50, -1, shade_boost_check);
shade_boost_box->Add(shader_boost_grid, wxSizerFlags().Expand());
shader_box->Add(shade_boost_box.outer, wxSizerFlags().Expand());

View File

@ -325,6 +325,7 @@ Pcsx2Config::GSOptions::GSOptions()
OsdShowInputs = false;
HWDownloadMode = GSHardwareDownloadMode::Enabled;
HWSpinGPUForReadbacks = false;
GPUPaletteConversion = false;
AutoFlushSW = true;
PreloadFrameWithGSData = false;
@ -548,6 +549,7 @@ void Pcsx2Config::GSOptions::ReloadIniSettings()
GSSettingBool(OsdShowSettings);
GSSettingBool(OsdShowInputs);
GSSettingBool(HWSpinGPUForReadbacks);
GSSettingBoolEx(GPUPaletteConversion, "paltex");
GSSettingBoolEx(AutoFlushSW, "autoflush_sw");
GSSettingBoolEx(PreloadFrameWithGSData, "preload_frame_with_gs_data");