GPU/ShaderGen: Use lower precision where possible

This commit is contained in:
Connor McLaughlin 2021-06-13 20:53:23 +10:00
parent aafd20fb3c
commit ca28381ddd
2 changed files with 77 additions and 47 deletions

View File

@ -20,12 +20,12 @@ void GPU_HW_ShaderGen::WriteCommonFunctions(std::stringstream& ss)
{ {
DefineMacro(ss, "MULTISAMPLING", UsingMSAA()); DefineMacro(ss, "MULTISAMPLING", UsingMSAA());
ss << "CONSTANT uint RESOLUTION_SCALE = " << m_resolution_scale << "u;\n"; ss << "CONSTANT min16uint RESOLUTION_SCALE = " << m_resolution_scale << "u;\n";
ss << "CONSTANT uint2 VRAM_SIZE = uint2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ") * RESOLUTION_SCALE;\n"; ss << "CONSTANT min16uint2 VRAM_SIZE = uint2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ") * RESOLUTION_SCALE;\n";
ss << "CONSTANT float2 RCP_VRAM_SIZE = float2(1.0, 1.0) / float2(VRAM_SIZE);\n"; ss << "CONSTANT min16float2 RCP_VRAM_SIZE = float2(1.0, 1.0) / float2(VRAM_SIZE);\n";
ss << "CONSTANT uint2 NATIVE_VRAM_SIZE = uint2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ");\n"; ss << "CONSTANT min16uint2 NATIVE_VRAM_SIZE = uint2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ");\n";
ss << "CONSTANT float2 RCP_NATIVE_VRAM_SIZE = float2(1.0, 1.0) / float2(NATIVE_VRAM_SIZE);\n"; ss << "CONSTANT min16float2 RCP_NATIVE_VRAM_SIZE = float2(1.0, 1.0) / float2(NATIVE_VRAM_SIZE);\n";
ss << "CONSTANT uint MULTISAMPLES = " << m_multisamples << "u;\n"; ss << "CONSTANT min16uint MULTISAMPLES = " << m_multisamples << "u;\n";
ss << "CONSTANT bool PER_SAMPLE_SHADING = " << (m_per_sample_shading ? "true" : "false") << ";\n"; ss << "CONSTANT bool PER_SAMPLE_SHADING = " << (m_per_sample_shading ? "true" : "false") << ";\n";
ss << R"( ss << R"(
@ -47,7 +47,7 @@ uint fixYCoord(uint y)
#endif #endif
} }
uint fixNativeYCoord(uint y) min16uint fixNativeYCoord(min16uint y)
{ {
#if API_OPENGL || API_OPENGL_ES #if API_OPENGL || API_OPENGL_ES
return NATIVE_VRAM_SIZE.y - y - 1u; return NATIVE_VRAM_SIZE.y - y - 1u;
@ -56,7 +56,7 @@ uint fixNativeYCoord(uint y)
#endif #endif
} }
uint RGBA8ToRGBA5551(float4 v) uint RGBA8ToRGBA5551(min16float4 v)
{ {
uint r = uint(roundEven(v.r * 31.0)); uint r = uint(roundEven(v.r * 31.0));
uint g = uint(roundEven(v.g * 31.0)); uint g = uint(roundEven(v.g * 31.0));
@ -65,7 +65,7 @@ uint RGBA8ToRGBA5551(float4 v)
return (r) | (g << 5) | (b << 10) | (a << 15); return (r) | (g << 5) | (b << 10) | (a << 15);
} }
float4 RGBA5551ToRGBA8(uint v) min16float4 RGBA5551ToRGBA8(uint v)
{ {
uint r = (v & 31u); uint r = (v & 31u);
uint g = ((v >> 5) & 31u); uint g = ((v >> 5) & 31u);
@ -748,14 +748,14 @@ std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(GPU_HW::BatchRenderMod
ss << "};\n"; ss << "};\n";
ss << R"( ss << R"(
uint3 ApplyDithering(uint2 coord, uint3 icol) min16uint3 ApplyDithering(min16uint2 coord, min16uint3 icol)
{ {
#if DITHERING_SCALED #if DITHERING_SCALED
uint2 fc = coord & uint2(3u, 3u); min16uint2 fc = coord & uint2(3u, 3u);
#else #else
uint2 fc = (coord / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE)) & uint2(3u, 3u); min16uint2 fc = (coord / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE)) & uint2(3u, 3u);
#endif #endif
int offset = s_dither_values[fc.y * 4u + fc.x]; min16int offset = s_dither_values[fc.y * 4u + fc.x];
#if !TRUE_COLOR #if !TRUE_COLOR
return uint3(clamp((int3(icol) + int3(offset, offset, offset)) >> 3, 0, 31)); return uint3(clamp((int3(icol) + int3(offset, offset, offset)) >> 3, 0, 31));
@ -767,32 +767,32 @@ uint3 ApplyDithering(uint2 coord, uint3 icol)
#if TEXTURED #if TEXTURED
CONSTANT float4 TRANSPARENT_PIXEL_COLOR = float4(0.0, 0.0, 0.0, 0.0); CONSTANT float4 TRANSPARENT_PIXEL_COLOR = float4(0.0, 0.0, 0.0, 0.0);
uint2 ApplyTextureWindow(uint2 coords) min16uint2 ApplyTextureWindow(min16uint2 coords)
{ {
uint x = (uint(coords.x) & u_texture_window_and.x) | u_texture_window_or.x; min16uint x = (uint(coords.x) & u_texture_window_and.x) | u_texture_window_or.x;
uint y = (uint(coords.y) & u_texture_window_and.y) | u_texture_window_or.y; min16uint y = (uint(coords.y) & u_texture_window_and.y) | u_texture_window_or.y;
return uint2(x, y); return uint2(x, y);
} }
uint2 ApplyUpscaledTextureWindow(uint2 coords) min16uint2 ApplyUpscaledTextureWindow(min16uint2 coords)
{ {
uint2 native_coords = coords / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE); min16uint2 native_coords = coords / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE);
uint2 coords_offset = coords % uint2(RESOLUTION_SCALE, RESOLUTION_SCALE); min16uint2 coords_offset = coords % uint2(RESOLUTION_SCALE, RESOLUTION_SCALE);
return (ApplyTextureWindow(native_coords) * uint2(RESOLUTION_SCALE, RESOLUTION_SCALE)) + coords_offset; return (ApplyTextureWindow(native_coords) * uint2(RESOLUTION_SCALE, RESOLUTION_SCALE)) + coords_offset;
} }
uint2 FloatToIntegerCoords(float2 coords) min16uint2 FloatToIntegerCoords(min16float2 coords)
{ {
// With the vertex offset applied at 1x resolution scale, we want to round the texture coordinates. // With the vertex offset applied at 1x resolution scale, we want to round the texture coordinates.
// Floor them otherwise, as it currently breaks when upscaling as the vertex offset is not applied. // Floor them otherwise, as it currently breaks when upscaling as the vertex offset is not applied.
return uint2((RESOLUTION_SCALE == 1u) ? roundEven(coords) : floor(coords)); return uint2((RESOLUTION_SCALE == 1u) ? roundEven(coords) : floor(coords));
} }
float4 SampleFromVRAM(uint4 texpage, float2 coords) min16float4 SampleFromVRAM(min16uint4 texpage, min16float2 coords)
{ {
#if PALETTE #if PALETTE
uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords)); min16uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords));
uint2 index_coord = icoord; min16uint2 index_coord = icoord;
#if PALETTE_4_BIT #if PALETTE_4_BIT
index_coord.x /= 4u; index_coord.x /= 4u;
#elif PALETTE_8_BIT #elif PALETTE_8_BIT
@ -800,8 +800,9 @@ float4 SampleFromVRAM(uint4 texpage, float2 coords)
#endif #endif
// load palette index // load palette index
uint2 vicoord = uint2(texpage.x + index_coord.x, fixNativeYCoord(texpage.y + index_coord.y)); min16uint2 vicoord = uint2(texpage.x + index_coord.x, fixNativeYCoord(texpage.y + index_coord.y));
float4 texel = SAMPLE_TEXTURE(samp0, float2(vicoord) * RCP_NATIVE_VRAM_SIZE); min16float2 vncoord = float2(vicoord) * RCP_NATIVE_VRAM_SIZE;
min16float4 texel = SAMPLE_TEXTURE(samp0, vncoord);
uint vram_value = RGBA8ToRGBA5551(texel); uint vram_value = RGBA8ToRGBA5551(texel);
// apply palette // apply palette
@ -814,7 +815,7 @@ float4 SampleFromVRAM(uint4 texpage, float2 coords)
#endif #endif
// sample palette // sample palette
uint2 palette_icoord = uint2(texpage.z + palette_index, fixNativeYCoord(texpage.w)); min16uint2 palette_icoord = uint2(texpage.z + palette_index, fixNativeYCoord(texpage.w));
return SAMPLE_TEXTURE(samp0, float2(palette_icoord) * RCP_NATIVE_VRAM_SIZE); return SAMPLE_TEXTURE(samp0, float2(palette_icoord) * RCP_NATIVE_VRAM_SIZE);
#else #else
// Direct texturing. Render-to-texture effects. Use upscaled coordinates. // Direct texturing. Render-to-texture effects. Use upscaled coordinates.
@ -852,12 +853,12 @@ float4 SampleFromVRAM(uint4 texpage, float2 coords)
ss << R"( ss << R"(
{ {
uint3 vertcol = uint3(v_col0.rgb); min16uint3 vertcol = uint3(v_col0.rgb);
bool semitransparent; bool semitransparent;
uint3 icolor; min16uint3 icolor;
float ialpha; min16float ialpha;
float oalpha; min16float oalpha;
#if INTERLACING #if INTERLACING
if ((fixYCoord(uint(v_pos.y)) & 1u) == u_interlaced_displayed_field) if ((fixYCoord(uint(v_pos.y)) & 1u) == u_interlaced_displayed_field)
@ -865,7 +866,7 @@ float4 SampleFromVRAM(uint4 texpage, float2 coords)
#endif #endif
#if TEXTURED #if TEXTURED
float4 texcol; min16float4 texcol;
#if TEXTURE_FILTERING #if TEXTURE_FILTERING
FilteredSampleFromVRAM(v_texpage, v_tex0, v_uv_limits, texcol, ialpha); FilteredSampleFromVRAM(v_texpage, v_tex0, v_uv_limits, texcol, ialpha);
if (ialpha < 0.5) if (ialpha < 0.5)
@ -928,12 +929,12 @@ float4 SampleFromVRAM(uint4 texpage, float2 coords)
#endif #endif
// Premultiply alpha so we don't need to use a colour output for it. // Premultiply alpha so we don't need to use a colour output for it.
float premultiply_alpha = ialpha; min16float premultiply_alpha = ialpha;
#if TRANSPARENCY #if TRANSPARENCY
premultiply_alpha = ialpha * (semitransparent ? u_src_alpha_factor : 1.0); premultiply_alpha = ialpha * (semitransparent ? u_src_alpha_factor : 1.0);
#endif #endif
float3 color; min16float3 color;
#if !TRUE_COLOR #if !TRUE_COLOR
// We want to apply the alpha before the truncation to 16-bit, otherwise we'll be passing a 32-bit precision color // We want to apply the alpha before the truncation to 16-bit, otherwise we'll be passing a 32-bit precision color
// into the blend unit, which can cause a small amount of error to accumulate. // into the blend unit, which can cause a small amount of error to accumulate.
@ -1302,7 +1303,7 @@ std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader()
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, true, false, false, msaa); DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, true, false, false, msaa);
ss << R"( ss << R"(
{ {
uint2 dst_coords = uint2(v_pos.xy); min16uint2 dst_coords = uint2(v_pos.xy);
// make sure it's not oversized and out of range // make sure it's not oversized and out of range
if ((dst_coords.x < u_dst_coords.x && dst_coords.x >= u_end_coords.x) || if ((dst_coords.x < u_dst_coords.x && dst_coords.x >= u_end_coords.x) ||
@ -1312,18 +1313,18 @@ std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader()
} }
// find offset from the start of the row/column // find offset from the start of the row/column
uint2 offset; min16uint2 offset;
offset.x = (dst_coords.x < u_dst_coords.x) ? (VRAM_SIZE.x - u_dst_coords.x + dst_coords.x) : (dst_coords.x - u_dst_coords.x); offset.x = (dst_coords.x < u_dst_coords.x) ? (VRAM_SIZE.x - u_dst_coords.x + dst_coords.x) : (dst_coords.x - u_dst_coords.x);
offset.y = (dst_coords.y < u_dst_coords.y) ? (VRAM_SIZE.y - u_dst_coords.y + dst_coords.y) : (dst_coords.y - u_dst_coords.y); offset.y = (dst_coords.y < u_dst_coords.y) ? (VRAM_SIZE.y - u_dst_coords.y + dst_coords.y) : (dst_coords.y - u_dst_coords.y);
// find the source coordinates to copy from // find the source coordinates to copy from
uint2 src_coords = (u_src_coords + offset) % VRAM_SIZE; min16uint2 src_coords = (u_src_coords + offset) % VRAM_SIZE;
// sample and apply mask bit // sample and apply mask bit
#if MSAA_COPY #if MSAA_COPY
float4 color = LOAD_TEXTURE_MS(samp0, int2(src_coords), f_sample_index); min16float4 color = LOAD_TEXTURE_MS(samp0, int2(src_coords), f_sample_index);
#else #else
float4 color = LOAD_TEXTURE(samp0, int2(src_coords), 0); min16float4 color = LOAD_TEXTURE(samp0, int2(src_coords), 0);
#endif #endif
o_col0 = float4(color.xyz, u_set_mask_bit ? 1.0 : color.a); o_col0 = float4(color.xyz, u_set_mask_bit ? 1.0 : color.a);
#if !PGXP_DEPTH #if !PGXP_DEPTH

View File

@ -139,13 +139,13 @@ void ShaderGen::WriteHeader(std::stringstream& ss)
{ {
ss << "precision highp float;\n"; ss << "precision highp float;\n";
ss << "precision highp int;\n"; ss << "precision highp int;\n";
ss << "precision highp sampler2D;\n"; ss << "precision mediump sampler2D;\n";
if (GLAD_GL_ES_VERSION_3_1) if (GLAD_GL_ES_VERSION_3_1)
ss << "precision highp sampler2DMS;\n"; ss << "precision mediump sampler2DMS;\n";
if (GLAD_GL_ES_VERSION_3_2) if (GLAD_GL_ES_VERSION_3_2)
ss << "precision highp usamplerBuffer;\n"; ss << "precision mediump usamplerBuffer;\n";
ss << "\n"; ss << "\n";
} }
@ -170,6 +170,19 @@ void ShaderGen::WriteHeader(std::stringstream& ss)
ss << "#define frac fract\n"; ss << "#define frac fract\n";
ss << "#define lerp mix\n"; ss << "#define lerp mix\n";
ss << "#define min16int mediump int\n";
ss << "#define min16uint mediump uint\n";
ss << "#define min16float mediump float\n";
ss << "#define min16float2 mediump vec2\n";
ss << "#define min16float3 mediump vec3\n";
ss << "#define min16float4 mediump vec4\n";
ss << "#define min16int2 mediump ivec2\n";
ss << "#define min16int3 mediump ivec3\n";
ss << "#define min16int4 mediump ivec4\n";
ss << "#define min16uint2 mediump uvec2\n";
ss << "#define min16uint3 mediump uvec3\n";
ss << "#define min16uint4 mediump uvec4\n";
ss << "#define CONSTANT const\n"; ss << "#define CONSTANT const\n";
ss << "#define GLOBAL\n"; ss << "#define GLOBAL\n";
ss << "#define VECTOR_EQ(a, b) ((a) == (b))\n"; ss << "#define VECTOR_EQ(a, b) ((a) == (b))\n";
@ -211,6 +224,22 @@ void ShaderGen::WriteHeader(std::stringstream& ss)
ss << "#define mat2 float2x2\n"; ss << "#define mat2 float2x2\n";
ss << "#define mat3 float3x3\n"; ss << "#define mat3 float3x3\n";
ss << "#define mat4 float4x4\n"; ss << "#define mat4 float4x4\n";
#if 0
ss << "#define min16int int\n";
ss << "#define min16uint uint\n";
ss << "#define min16float float\n";
ss << "#define min16float2 float2\n";
ss << "#define min16float3 float3\n";
ss << "#define min16float4 float4\n";
ss << "#define min16int2 ivec2\n";
ss << "#define min16int3 ivec3\n";
ss << "#define min16int4 ivec4\n";
ss << "#define min16uint2 uint2\n";
ss << "#define min16uint3 uint3\n";
ss << "#define min16uint4 uint4\n";
#endif
ss << "#define CONSTANT static const\n"; ss << "#define CONSTANT static const\n";
ss << "#define GLOBAL static\n"; ss << "#define GLOBAL static\n";
ss << "#define VECTOR_EQ(a, b) (all((a) == (b)))\n"; ss << "#define VECTOR_EQ(a, b) (all((a) == (b)))\n";
@ -348,7 +377,7 @@ void ShaderGen::DeclareVertexEntryPoint(
ss << "out VertexData" << output_block_suffix << " {\n"; ss << "out VertexData" << output_block_suffix << " {\n";
for (u32 i = 0; i < num_color_outputs; i++) for (u32 i = 0; i < num_color_outputs; i++)
ss << " " << qualifier << "float4 v_col" << i << ";\n"; ss << " " << qualifier << "min16float4 v_col" << i << ";\n";
for (u32 i = 0; i < num_texcoord_outputs; i++) for (u32 i = 0; i < num_texcoord_outputs; i++)
ss << " " << qualifier << "float2 v_tex" << i << ";\n"; ss << " " << qualifier << "float2 v_tex" << i << ";\n";
@ -365,7 +394,7 @@ void ShaderGen::DeclareVertexEntryPoint(
const char* qualifier = GetInterpolationQualifier(false, centroid_interpolation, sample_interpolation, true); const char* qualifier = GetInterpolationQualifier(false, centroid_interpolation, sample_interpolation, true);
for (u32 i = 0; i < num_color_outputs; i++) for (u32 i = 0; i < num_color_outputs; i++)
ss << qualifier << "out float4 v_col" << i << ";\n"; ss << qualifier << "out min16float4 v_col" << i << ";\n";
for (u32 i = 0; i < num_texcoord_outputs; i++) for (u32 i = 0; i < num_texcoord_outputs; i++)
ss << qualifier << "out float2 v_tex" << i << ";\n"; ss << qualifier << "out float2 v_tex" << i << ";\n";
@ -406,7 +435,7 @@ void ShaderGen::DeclareVertexEntryPoint(
} }
for (u32 i = 0; i < num_color_outputs; i++) for (u32 i = 0; i < num_color_outputs; i++)
ss << " " << qualifier << "out float4 v_col" << i << " : COLOR" << i << ",\n"; ss << " " << qualifier << "out min16float4 v_col" << i << " : COLOR" << i << ",\n";
for (u32 i = 0; i < num_texcoord_outputs; i++) for (u32 i = 0; i < num_texcoord_outputs; i++)
ss << " " << qualifier << "out float2 v_tex" << i << " : TEXCOORD" << i << ",\n"; ss << " " << qualifier << "out float2 v_tex" << i << " : TEXCOORD" << i << ",\n";
@ -441,7 +470,7 @@ void ShaderGen::DeclareFragmentEntryPoint(
ss << "in VertexData {\n"; ss << "in VertexData {\n";
for (u32 i = 0; i < num_color_inputs; i++) for (u32 i = 0; i < num_color_inputs; i++)
ss << " " << qualifier << "float4 v_col" << i << ";\n"; ss << " " << qualifier << "min16float4 v_col" << i << ";\n";
for (u32 i = 0; i < num_texcoord_inputs; i++) for (u32 i = 0; i < num_texcoord_inputs; i++)
ss << " " << qualifier << "float2 v_tex" << i << ";\n"; ss << " " << qualifier << "float2 v_tex" << i << ";\n";
@ -458,7 +487,7 @@ void ShaderGen::DeclareFragmentEntryPoint(
const char* qualifier = GetInterpolationQualifier(false, centroid_interpolation, sample_interpolation, false); const char* qualifier = GetInterpolationQualifier(false, centroid_interpolation, sample_interpolation, false);
for (u32 i = 0; i < num_color_inputs; i++) for (u32 i = 0; i < num_color_inputs; i++)
ss << qualifier << "in float4 v_col" << i << ";\n"; ss << qualifier << "in min16float4 v_col" << i << ";\n";
for (u32 i = 0; i < num_texcoord_inputs; i++) for (u32 i = 0; i < num_texcoord_inputs; i++)
ss << qualifier << "in float2 v_tex" << i << ";\n"; ss << qualifier << "in float2 v_tex" << i << ";\n";
@ -510,7 +539,7 @@ void ShaderGen::DeclareFragmentEntryPoint(
ss << "void main(\n"; ss << "void main(\n";
for (u32 i = 0; i < num_color_inputs; i++) for (u32 i = 0; i < num_color_inputs; i++)
ss << " " << qualifier << "in float4 v_col" << i << " : COLOR" << i << ",\n"; ss << " " << qualifier << "in min16float4 v_col" << i << " : COLOR" << i << ",\n";
for (u32 i = 0; i < num_texcoord_inputs; i++) for (u32 i = 0; i < num_texcoord_inputs; i++)
ss << " " << qualifier << "in float2 v_tex" << i << " : TEXCOORD" << i << ",\n"; ss << " " << qualifier << "in float2 v_tex" << i << " : TEXCOORD" << i << ",\n";