GS:HW: Make HDR algorithm more float-precision-friendly

- Use whole numbers instead of 255ths
- Use range -128 - 127 instead of 0 - 255 for accumulation blends
This commit is contained in:
TellowKrinkle 2022-10-03 22:01:18 -05:00 committed by lightningterror
parent 7ea8b983d4
commit 73ae9f8879
16 changed files with 129 additions and 90 deletions

View File

@ -121,17 +121,19 @@ PS_OUTPUT ps_datm0(PS_INPUT input)
return output;
}
PS_OUTPUT ps_mod256(PS_INPUT input)
PS_OUTPUT ps_hdr_init(PS_INPUT input)
{
PS_OUTPUT output;
float4 value = sample_c(input.t);
output.c = float4(round(value.rgb * 255), value.a);
return output;
}
float4 c = round(sample_c(input.t) * 255);
// We use 2 fmod to avoid negative value.
float4 fmod1 = fmod(c, 256) + 256;
float4 fmod2 = fmod(fmod1, 256);
output.c = fmod2 / 255.0f;
// Resolve pass for the HDR render target: converts the whole-number colour
// stored in the float target back into a normalized 0-1 colour.
// Alpha is passed through untouched.
PS_OUTPUT ps_hdr_resolve(PS_INPUT input)
{
	float4 accumulated = sample_c(input.t);

	// Truncate to integers and keep only the low 8 bits, so anything outside
	// 0-255 (including negative values) wraps back into that range.
	int3 wrapped = int3(accumulated.rgb) & 255;

	PS_OUTPUT output;
	output.c = float4(float3(wrapped) / 255, accumulated.a);
	return output;
}

View File

@ -736,9 +736,18 @@ void ps_dither(inout float3 C, float2 pos_xy)
void ps_color_clamp_wrap(inout float3 C)
{
if (PS_HDR && PS_COLCLIP) // COLCLIP flag indicates accumulation blend under HDR
{
int3 color = int3(C);
if (PS_DFMT == FMT_16)
color &= (int3)0xF8;
// -128 to 127 gives us longer before we run out of float precision
// Especially for games that mainly use 1 and 255 (sly), since that maps to 1 and -1
C = float3((color << 24) >> 24);
}
// When dithering the bottom 3 bits become meaningless and cause lines in the picture
// so we need to limit the color depth on dithered items
if (SW_BLEND || PS_DITHER || PS_FBMASK)
else if (SW_BLEND || PS_DITHER || PS_FBMASK)
{
// Standard Clamp
if (PS_COLCLIP == 0 && PS_HDR == 0)
@ -943,7 +952,7 @@ PS_OUTPUT ps_main(PS_INPUT input)
ps_fbmask(C, input.p.xy);
#if !PS_NO_COLOR
output.c0 = C / 255.0f;
output.c0 = PS_HDR ? float4(C.rgb, C.a / 255.0f) : C / 255.0f;
#if !PS_NO_COLOR1
output.c1 = (float4)(alpha_blend);
#endif

View File

@ -318,10 +318,19 @@ void ps_datm0()
}
#endif
#ifdef ps_mod256
void ps_mod256()
#ifdef ps_hdr_init
void ps_hdr_init()
{
SV_Target0 = mod(round(sample_c() * 255.0f), 256.0f) / 255.0f;
vec4 value = sample_c();
SV_Target0 = vec4(round(value.rgb * 255.0f), value.a);
}
#endif
#ifdef ps_hdr_resolve
// Resolve pass for the HDR render target: converts the whole-number colour
// stored in the float target back into a normalized 0-1 colour.
// Alpha is passed through untouched.
void ps_hdr_resolve()
{
	vec4 accumulated = sample_c();

	// Truncate to integers and keep only the low 8 bits, so anything outside
	// 0-255 (including negative values) wraps back into that range.
	ivec3 wrapped = ivec3(accumulated.rgb) & 255;

	SV_Target0 = vec4(vec3(wrapped) / 255.0f, accumulated.a);
}
#endif

View File

@ -638,9 +638,18 @@ void ps_dither(inout vec3 C)
void ps_color_clamp_wrap(inout vec3 C)
{
#if PS_HDR && PS_COLCLIP // COLCLIP flag indicates accumulation blend under HDR
ivec3 color = ivec3(C);
#if PS_DFMT == FMT_16
color &= 0xF8;
#endif
// -128 to 127 gives us longer before we run out of float precision
// Especially for games that mainly use 1 and 255 (sly), since that maps to 1 and -1
C = vec3((color << 24) >> 24);
// When dithering the bottom 3 bits become meaningless and cause lines in the picture
// so we need to limit the color depth on dithered items
#if SW_BLEND || PS_DITHER || PS_FBMASK
#elif SW_BLEND || PS_DITHER || PS_FBMASK
// Correct the Color value based on the output format
#if PS_COLCLIP == 0 && PS_HDR == 0
@ -925,7 +934,11 @@ void ps_main()
ps_fbmask(C);
#if !PS_NO_COLOR
#if PS_HDR
SV_Target0 = vec4(C.rgb, C.a / 255.0f);
#else
SV_Target0 = C / 255.0f;
#endif
#if !defined(DISABLE_DUAL_SOURCE) && !PS_NO_COLOR1
SV_Target1 = vec4(alpha_blend);
#endif

View File

@ -90,15 +90,19 @@ void ps_datm0()
}
#endif
#ifdef ps_mod256
void ps_mod256()
#ifdef ps_hdr_init
void ps_hdr_init()
{
vec4 c = roundEven(sample_c(v_tex) * 255);
// We use 2 fmod to avoid negative value.
vec4 fmod1 = mod(c, 256) + 256;
vec4 fmod2 = mod(fmod1, 256);
vec4 value = sample_c(v_tex);
o_col0 = vec4(roundEven(value.rgb * 255.0f), value.a);
}
#endif
o_col0 = fmod2 / 255.0f;
#ifdef ps_hdr_resolve
// Resolve pass for the HDR render target: converts the whole-number colour
// stored in the float target back into a normalized 0-1 colour.
// Alpha is passed through untouched.
void ps_hdr_resolve()
{
	vec4 accumulated = sample_c(v_tex);

	// Truncate to integers and keep only the low 8 bits, so anything outside
	// 0-255 (including negative values) wraps back into that range.
	ivec3 wrapped = ivec3(accumulated.rgb) & 255;

	o_col0 = vec4(vec3(wrapped) / 255.0f, accumulated.a);
}
#endif

View File

@ -967,9 +967,18 @@ void ps_dither(inout vec3 C)
void ps_color_clamp_wrap(inout vec3 C)
{
#if PS_HDR && PS_COLCLIP // COLCLIP flag indicates accumulation blend under HDR
ivec3 color = ivec3(C);
#if PS_DFMT == FMT_16
color &= 0xF8;
#endif
// -128 to 127 gives us longer before we run out of float precision
// Especially for games that mainly use 1 and 255 (sly), since that maps to 1 and -1
C = vec3((color << 24) >> 24);
// When dithering the bottom 3 bits become meaningless and cause lines in the picture
// so we need to limit the color depth on dithered items
#if SW_BLEND || PS_DITHER || PS_FBMASK
#elif SW_BLEND || PS_DITHER || PS_FBMASK
// Correct the Color value based on the output format
#if PS_COLCLIP == 0 && PS_HDR == 0
@ -1226,7 +1235,11 @@ void main()
ps_fbmask(C);
#if !PS_NO_COLOR
#if PS_HDR
o_col0 = vec4(C.rgb, C.a / 255.0f);
#else
o_col0 = C / 255.0f;
#endif
#if !defined(DISABLE_DUAL_SOURCE) && !PS_NO_COLOR1
o_col1 = vec4(alpha_blend);
#endif

View File

@ -27,7 +27,8 @@ const char* shaderName(ShaderConvert value)
case ShaderConvert::RGBA8_TO_16_BITS: return "ps_convert_rgba8_16bits";
case ShaderConvert::DATM_1: return "ps_datm1";
case ShaderConvert::DATM_0: return "ps_datm0";
case ShaderConvert::MOD_256: return "ps_mod256";
case ShaderConvert::HDR_INIT: return "ps_hdr_init";
case ShaderConvert::HDR_RESOLVE: return "ps_hdr_resolve";
case ShaderConvert::TRANSPARENCY_FILTER: return "ps_filter_transparency";
case ShaderConvert::FLOAT32_TO_16_BITS: return "ps_convert_float32_32bits";
case ShaderConvert::FLOAT32_TO_32_BITS: return "ps_convert_float32_32bits";

View File

@ -35,7 +35,8 @@ enum class ShaderConvert
RGBA8_TO_16_BITS,
DATM_1,
DATM_0,
MOD_256,
HDR_INIT,
HDR_RESOLVE,
TRANSPARENCY_FILTER,
FLOAT32_TO_16_BITS,
FLOAT32_TO_32_BITS,

View File

@ -1385,7 +1385,7 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
hdr_rt = CreateRenderTarget(rtsize.x, rtsize.y, GSTexture::Format::FloatColor);
// Warning: StretchRect must be called before BeginScene otherwise
// vertices will be overwritten. Trust me you don't want to do that.
StretchRect(config.rt, sRect, hdr_rt, dRect, ShaderConvert::COPY, false);
StretchRect(config.rt, sRect, hdr_rt, dRect, ShaderConvert::HDR_INIT, false);
}
BeginScene();
@ -1519,7 +1519,7 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
const GSVector2i size = config.rt->GetSize();
const GSVector4 dRect(config.drawarea);
const GSVector4 sRect = dRect / GSVector4(size.x, size.y).xyxy();
StretchRect(hdr_rt, sRect, config.rt, dRect, ShaderConvert::MOD_256, false);
StretchRect(hdr_rt, sRect, config.rt, dRect, ShaderConvert::HDR_RESOLVE, false);
Recycle(hdr_rt);
}
}

View File

@ -1133,18 +1133,6 @@ bool GSDevice12::CompileConvertPipelines()
if (i == ShaderConvert::COPY)
{
// compile the variant for setting up hdr rendering
for (u32 ds = 0; ds < 2; ds++)
{
gpb.SetRenderTarget(0, DXGI_FORMAT_R32G32B32A32_FLOAT);
gpb.SetDepthStencilFormat(ds ? DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS : DXGI_FORMAT_UNKNOWN);
m_hdr_setup_pipelines[ds] = gpb.Create(g_d3d12_context->GetDevice(), m_shader_cache, false);
if (!m_hdr_setup_pipelines[ds])
return false;
D3D12::SetObjectNameFormatted(m_hdr_setup_pipelines[ds].get(), "HDR setup/copy pipeline (ds=%u)", i, ds);
}
// compile color copy pipelines
gpb.SetRenderTarget(0, DXGI_FORMAT_R8G8B8A8_UNORM);
gpb.SetDepthStencilFormat(DXGI_FORMAT_UNKNOWN);
@ -1162,18 +1150,21 @@ bool GSDevice12::CompileConvertPipelines()
(i >> 3) & 1u);
}
}
else if (i == ShaderConvert::MOD_256)
else if (i == ShaderConvert::HDR_INIT || i == ShaderConvert::HDR_RESOLVE)
{
const bool is_setup = i == ShaderConvert::HDR_INIT;
std::array<ComPtr<ID3D12PipelineState>, 2>& arr = is_setup ? m_hdr_setup_pipelines : m_hdr_finish_pipelines;
for (u32 ds = 0; ds < 2; ds++)
{
pxAssert(!m_hdr_finish_pipelines[ds]);
pxAssert(!arr[ds]);
gpb.SetRenderTarget(0, is_setup ? DXGI_FORMAT_R32G32B32A32_FLOAT : DXGI_FORMAT_R8G8B8A8_UNORM);
gpb.SetDepthStencilFormat(ds ? DXGI_FORMAT_D32_FLOAT_S8X24_UINT : DXGI_FORMAT_UNKNOWN);
m_hdr_finish_pipelines[ds] = gpb.Create(g_d3d12_context->GetDevice(), m_shader_cache, false);
if (!m_hdr_finish_pipelines[ds])
arr[ds] = gpb.Create(g_d3d12_context->GetDevice(), m_shader_cache, false);
if (!arr[ds])
return false;
D3D12::SetObjectNameFormatted(m_hdr_setup_pipelines[ds].get(), "HDR finish/copy pipeline (ds=%u)", ds);
D3D12::SetObjectNameFormatted(arr[ds].get(), "HDR %s/copy pipeline (ds=%u)", is_setup ? "setup" : "finish", ds);
}
}
}

View File

@ -2761,8 +2761,9 @@ void GSRendererHW::EmulateBlending(bool& DATE_PRIMID, bool& DATE_BARRIER, bool&
// A fast algo that requires 2 passes
GL_INS("COLCLIP Fast HDR mode ENABLED");
m_conf.ps.hdr = 1;
m_conf.ps.colclip = accumulation_blend; // reuse as a flag for accumulation blend
blend_mix = false;
sw_blending = true; // Enable sw blending for the HDR algo
sw_blending = true; // Enable sw blending for the HDR algo
}
else if (sw_blending)
{

View File

@ -769,11 +769,11 @@ bool GSDeviceMTL::Create()
auto pdesc = [[MTLRenderPipelineDescriptor new] autorelease];
// FS Triangle Pipelines
pdesc.colorAttachments[0].pixelFormat = ConvertPixelFormat(GSTexture::Format::Color);
m_hdr_resolve_pipeline = MakePipeline(pdesc, fs_triangle, LoadShader(@"ps_mod256"), @"HDR Resolve");
m_hdr_resolve_pipeline = MakePipeline(pdesc, fs_triangle, LoadShader(@"ps_hdr_resolve"), @"HDR Resolve");
m_fxaa_pipeline = MakePipeline(pdesc, fs_triangle, LoadShader(@"ps_fxaa"), @"fxaa");
m_shadeboost_pipeline = MakePipeline(pdesc, fs_triangle, LoadShader(@"ps_shadeboost"), @"shadeboost");
pdesc.colorAttachments[0].pixelFormat = ConvertPixelFormat(GSTexture::Format::FloatColor);
m_hdr_init_pipeline = MakePipeline(pdesc, fs_triangle, LoadShader(@"ps_copy_fs"), @"HDR Init");
m_hdr_init_pipeline = MakePipeline(pdesc, fs_triangle, LoadShader(@"ps_hdr_init"), @"HDR Init");
pdesc.colorAttachments[0].pixelFormat = MTLPixelFormatInvalid;
pdesc.stencilAttachmentPixelFormat = MTLPixelFormatDepth32Float_Stencil8;
m_datm_pipeline[0] = MakePipeline(pdesc, fs_triangle, LoadShader(@"ps_datm0"), @"datm0");
@ -808,7 +808,8 @@ bool GSDeviceMTL::Create()
case ShaderConvert::Count:
case ShaderConvert::DATM_0:
case ShaderConvert::DATM_1:
case ShaderConvert::MOD_256:
case ShaderConvert::HDR_INIT:
case ShaderConvert::HDR_RESOLVE:
continue;
case ShaderConvert::FLOAT32_TO_32_BITS:
pdesc.colorAttachments[0].pixelFormat = ConvertPixelFormat(GSTexture::Format::UInt32);

View File

@ -111,10 +111,16 @@ fragment float4 ps_primid_init_datm1(float4 p [[position]], DirectReadTextureIn<
return tex.read(p).a < (127.5f / 255.f) ? -1 : FLT_MAX;
}
fragment float4 ps_mod256(float4 p [[position]], DirectReadTextureIn<float> tex)
fragment half4 ps_hdr_init(float4 p [[position]], DirectReadTextureIn<half> tex)
{
float4 c = round(tex.read(p) * 255.f);
return (c - 256.f * floor(c / 256.f)) / 255.f;
half4 in = tex.read(p);
return half4(round(in.rgb * 255.h), in.a);
}
// Resolve pass for the HDR render target: reads the whole-number colour at the
// fragment position and converts it back into a normalized 0-1 colour.
// Alpha is passed through untouched.
fragment float4 ps_hdr_resolve(float4 p [[position]], DirectReadTextureIn<float> tex)
{
	const float4 accumulated = tex.read(p);

	// Truncate to integers and keep only the low 8 bits, so anything outside
	// 0-255 (including negative values) wraps back into that range.
	const int3 wrapped = int3(accumulated.rgb) & 255;

	return float4(float3(wrapped) / 255.f, accumulated.a);
}
fragment float4 ps_filter_transparency(ConvertShaderData data [[stage_in]], ConvertPSRes res)

View File

@ -784,8 +784,19 @@ struct PSMain
void ps_color_clamp_wrap(thread float4& C)
{
if (PS_HDR && PS_COLCLIP) // COLCLIP flag indicates accumulation blend under HDR
{
int3 color = int3(C.rgb);
if (PS_DFMT == FMT_16)
color &= 0xF8;
// -128 to 127 gives us longer before we run out of float precision
// Especially for games that mainly use 1 and 255 (sly), since that maps to 1 and -1
C.rgb = float3(char3(color));
return;
}
// When dithering the bottom 3 bits become meaningless and cause lines in the picture so we need to limit the color depth on dithered items
if (!SW_BLEND && !PS_DITHER)
if (!SW_BLEND && !PS_DITHER && !PS_FBMASK)
return;
// Correct the Color value based on the output format
@ -978,7 +989,7 @@ struct PSMain
ps_fbmask(C);
if (PS_COLOR0)
out.c0 = C / 255.f;
out.c0 = PS_HDR ? float4(C.rgb, C.a / 255.f) : C / 255.f;
if (PS_COLOR0 && PS_ONLY_ALPHA)
out.c0.rgb = 0;
if (PS_COLOR1)

View File

@ -1866,14 +1866,9 @@ void GSDeviceOGL::RenderHW(GSHWDrawConfig& config)
hdr_rt = CreateRenderTarget(rtsize.x, rtsize.y, GSTexture::Format::FloatColor, false);
OMSetRenderTargets(hdr_rt, config.ds, &config.scissor);
// save blend state, since BlitRect destroys it
const bool old_blend = GLState::blend;
BlitRect(config.rt, config.drawarea, config.rt->GetSize(), false, false);
if (old_blend)
{
GLState::blend = old_blend;
glEnable(GL_BLEND);
}
GSVector4 dRect(config.drawarea);
const GSVector4 sRect = dRect / GSVector4(rtsize.x, rtsize.y).xyxy();
StretchRect(config.rt, sRect, hdr_rt, dRect, ShaderConvert::HDR_INIT, false);
}
else if (config.require_one_barrier && !m_features.texture_barrier)
{
@ -2041,7 +2036,7 @@ void GSDeviceOGL::RenderHW(GSHWDrawConfig& config)
GSVector2i size = config.rt->GetSize();
GSVector4 dRect(config.drawarea);
const GSVector4 sRect = dRect / GSVector4(size.x, size.y).xyxy();
StretchRect(hdr_rt, sRect, config.rt, dRect, ShaderConvert::MOD_256, false);
StretchRect(hdr_rt, sRect, config.rt, dRect, ShaderConvert::HDR_RESOLVE, false);
Recycle(hdr_rt);
}

View File

@ -1413,26 +1413,6 @@ bool GSDeviceVK::CompileConvertPipelines()
if (i == ShaderConvert::COPY)
{
// compile the variant for setting up hdr rendering
for (u32 ds = 0; ds < 2; ds++)
{
for (u32 fbl = 0; fbl < 2; fbl++)
{
pxAssert(!m_hdr_setup_pipelines[ds][fbl]);
gpb.SetRenderPass(GetTFXRenderPass(true, ds != 0, true, DATE_RENDER_PASS_NONE, fbl != 0,
VK_ATTACHMENT_LOAD_OP_DONT_CARE, VK_ATTACHMENT_LOAD_OP_DONT_CARE),
0);
m_hdr_setup_pipelines[ds][fbl] =
gpb.Create(g_vulkan_context->GetDevice(), g_vulkan_shader_cache->GetPipelineCache(true), false);
if (!m_hdr_setup_pipelines[ds][fbl])
return false;
Vulkan::Util::SetObjectName(g_vulkan_context->GetDevice(), m_hdr_setup_pipelines[ds][fbl],
"HDR setup/copy pipeline (ds=%u, fbl=%u)", i, ds, fbl);
}
}
// compile color copy pipelines
gpb.SetRenderPass(m_utility_color_render_pass_discard, 0);
for (u32 i = 0; i < 16; i++)
@ -1451,24 +1431,26 @@ bool GSDeviceVK::CompileConvertPipelines()
(i >> 3) & 1u);
}
}
else if (i == ShaderConvert::MOD_256)
else if (i == ShaderConvert::HDR_INIT || i == ShaderConvert::HDR_RESOLVE)
{
const bool is_setup = i == ShaderConvert::HDR_INIT;
VkPipeline (&arr)[2][2] = *(is_setup ? &m_hdr_setup_pipelines : &m_hdr_finish_pipelines);
for (u32 ds = 0; ds < 2; ds++)
{
for (u32 fbl = 0; fbl < 2; fbl++)
{
pxAssert(!m_hdr_finish_pipelines[ds][fbl]);
pxAssert(!arr[ds][fbl]);
gpb.SetRenderPass(GetTFXRenderPass(true, ds != 0, false, DATE_RENDER_PASS_NONE, fbl != 0,
VK_ATTACHMENT_LOAD_OP_DONT_CARE, VK_ATTACHMENT_LOAD_OP_DONT_CARE),
gpb.SetRenderPass(
GetTFXRenderPass(true, ds != 0, is_setup, DATE_RENDER_PASS_NONE, fbl != 0,
VK_ATTACHMENT_LOAD_OP_DONT_CARE, VK_ATTACHMENT_LOAD_OP_DONT_CARE),
0);
m_hdr_finish_pipelines[ds][fbl] =
gpb.Create(g_vulkan_context->GetDevice(), g_vulkan_shader_cache->GetPipelineCache(true), false);
if (!m_hdr_finish_pipelines[ds][fbl])
arr[ds][fbl] = gpb.Create(g_vulkan_context->GetDevice(), g_vulkan_shader_cache->GetPipelineCache(true), false);
if (!arr[ds][fbl])
return false;
Vulkan::Util::SetObjectName(g_vulkan_context->GetDevice(), m_hdr_setup_pipelines[ds][fbl],
"HDR finish/copy pipeline (ds=%u, fbl=%u)", i, ds, fbl);
// Label the pipeline for debugging. The format string consumes exactly three
// arguments (%s, %u, %u): the pipeline kind, ds and fbl. The ShaderConvert loop
// value must not be passed too, or it shifts every field (ds would show the
// shader index and fbl would show ds).
Vulkan::Util::SetObjectName(g_vulkan_context->GetDevice(), arr[ds][fbl],
"HDR %s/copy pipeline (ds=%u, fbl=%u)", is_setup ? "setup" : "finish", ds, fbl);
}
}
}