Merge pull request #11703 from degasus/asciiart

Shaders: Add error message for asciiart shader if shader_subgroup is missing.
This commit is contained in:
Mai 2023-03-29 23:34:30 -04:00 committed by GitHub
commit 806ea59d77
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 148 additions and 82 deletions

View File

@ -14,10 +14,10 @@ DefaultValue = false
[/configuration]
*/
const uint MAX_CHARS = 96u; // max 96, must be a multiple of 32
const bool HAVE_FULL_FEATURE_FALLBACK = true;
const uint MAX_CHARS = 96u; // max 96, must be a multiple of 32
// Selects the CalcCharFallback() path in main() when true. Left disabled because that path is
// terribly slow and can easily soft-lock the GPU.
const bool HAVE_FULL_FEATURE_FALLBACK = false;
const uint UNROLL_FALLBACK = 4;
const uint UNROLL_SIMD = 3; // max MAX_CHARS / 32
const uint UNROLL_SIMD = 3; // max MAX_CHARS / 32
// #undef SUPPORTS_SUBGROUP_REDUCTION
@ -95,34 +95,37 @@ const uint rasters[char_count][(char_pixels + 31) / 32] = {
// Precalculated sum of all pixels per character
const uint raster_active_pixels[char_count] = {
96, 18, 16, 40, 56, 42, 46, 10, 22, 22, 32, 28, 10, 16, 6, 24,
52, 29, 36, 44, 35, 42, 50, 28, 58, 51, 12, 16, 22, 32, 22, 26,
41, 46, 57, 38, 52, 38, 32, 46, 48, 30, 31, 43, 28, 56, 64, 52,
42, 52, 52, 44, 28, 48, 42, 58, 42, 32, 38, 26, 24, 26, 14, 8,
10, 34, 40, 26, 40, 32, 30, 33, 39, 16, 20, 37, 28, 43, 30, 30,
34, 34, 20, 28, 27, 30, 26, 36, 26, 24, 26, 30, 24, 30, 14, 0};
96, 18, 16, 40, 56, 42, 46, 10, 22, 22, 32, 28, 10, 16, 6, 24, 52, 29, 36, 44, 35, 42, 50, 28,
58, 51, 12, 16, 22, 32, 22, 26, 41, 46, 57, 38, 52, 38, 32, 46, 48, 30, 31, 43, 28, 56, 64, 52,
42, 52, 52, 44, 28, 48, 42, 58, 42, 32, 38, 26, 24, 26, 14, 8, 10, 34, 40, 26, 40, 32, 30, 33,
39, 16, 20, 37, 28, 43, 30, 30, 34, 34, 20, 28, 27, 30, 26, 36, 26, 24, 26, 30, 24, 30, 14, 0};
// Get one sample of the font: (pixel index, character index)
float SampleFont(uint2 pos) {
// Reads one bit of the font. pos.x is the pixel index inside the glyph and pos.y is the
// character index into rasters. Every uint of rasters packs 32 pixels, with pixel 0 of each
// group stored in the least significant bit.
// Returns 1 if the pixel is set and 0 otherwise (converted to float by the return type).
float SampleFont(uint2 pos)
{
  uint packed_pixels = rasters[pos.y][pos.x / 32];
  uint bit_index = pos.x % 32;
  return (packed_pixels >> bit_index) & uint(1);
}
// Get one sample of the framebuffer: (character position in screen space, pixel index)
float3 SampleTex(uint2 char_pos, uint pixel) {
float2 inv_resoltion = OptionEnabled(USE_WINDOW_RES) ? GetInvWindowResolution() : GetInvResolution();
// Fetches one framebuffer sample for a character cell.
//   char_pos: position of the character cell, in units of characters
//   pixel:    linear pixel index inside the cell (row-major, char_width pixels per row)
// Returns the xyz components of SampleLocation() at the center of that pixel.
float3 SampleTex(uint2 char_pos, uint pixel)
{
  // Pick the inverse resolution that matches the configured coordinate space.
  float2 inv_resolution;
  if (OptionEnabled(USE_WINDOW_RES))
    inv_resolution = GetInvWindowResolution();
  else
    inv_resolution = GetInvResolution();

  // Column and row of the pixel inside the cell.
  float2 pixel_in_char = float2(pixel % char_width, pixel / char_width);

  // The 0.5 offset moves the sample point to the pixel center.
  float2 tex_pos = char_pos * char_dim + pixel_in_char + 0.5;
  return SampleLocation(tex_pos * inv_resolution).xyz;
}
struct CharResults {
float3 fg; // font color
float3 bg; // background color
float err; // MSE of this configuration
uint c; // character index
// Result of matching one character against one character cell. The CalcChar*() functions keep
// the candidate with the smallest err.
struct CharResults
{
float3 fg;  // font color: used for pixels whose font bit is set
float3 bg;  // background color: used for pixels whose font bit is clear
float err;  // error of this configuration; lower is better
uint c;     // character index (row of the font table)
};
// Calculate the font and background color and the MSE for a given character
CharResults CalcCharRes(uint c, float3 t, float3 ft) {
CharResults CalcCharRes(uint c, float3 t, float3 ft)
{
CharResults o;
o.c = c;
@ -140,7 +143,8 @@ CharResults CalcCharRes(uint c, float3 t, float3 ft) {
// The calculation isn't stable if the font is all-one. Return max err
// instead.
if (f == char_pixels) {
if (f == char_pixels)
{
o.err = char_pixels * char_pixels;
return o;
}
@ -184,11 +188,10 @@ CharResults CalcCharRes(uint c, float3 t, float3 ft) {
// solution.
float3 a = (ft * (f - float(char_pixels)) + t * (f - ff)) / (f * f - ff * float(char_pixels));
float3 b = (ft * f - t * ff) / (f * f - ff * float(char_pixels));
float3 b = (ft * f - t * ff) / (f * f - ff * float(char_pixels));
float3 e = a * a * ff + 2.0 * a * b * (f - ff) - 2.0 * a * ft +
b * b * (-2.0 * f + ff + float(char_pixels)) + 2.0 * b * ft -
2.0 * b * t + tt;
b * b * (-2.0 * f + ff + float(char_pixels)) + 2.0 * b * ft - 2.0 * b * t + tt;
o.err = dot(e, float3(1.0, 1.0, 1.0));
o.fg = a;
@ -199,12 +202,13 @@ CharResults CalcCharRes(uint c, float3 t, float3 ft) {
}
// Get the color of the pixel of this invocation based on the character details
float3 GetFinalPixel(CharResults char_out) {
float2 resolution = OptionEnabled(USE_WINDOW_RES) ? GetWindowResolution() : GetResolution();
uint2 char_pos = uint2(floor(GetCoordinates() * resolution / char_dim));
uint2 pixel_offset = uint2(floor(GetCoordinates() * resolution) - char_pos * char_dim);
float font = SampleFont(int2(pixel_offset.x + char_width * pixel_offset.y, char_out.c));
return char_out.fg * font + char_out.bg * (1.0 - font);
// Computes the output color of the current invocation for an already chosen character.
// Looks up this pixel's bit in glyph char_out.c and returns char_out.fg where the bit is set
// and char_out.bg where it is clear.
float3 GetFinalPixel(CharResults char_out)
{
  float2 resolution;
  if (OptionEnabled(USE_WINDOW_RES))
    resolution = GetWindowResolution();
  else
    resolution = GetResolution();

  // Position of this invocation in pixels and the character cell that contains it.
  float2 pixel_coords = GetCoordinates() * resolution;
  uint2 char_pos = uint2(floor(pixel_coords / char_dim));

  // Pixel position relative to the origin of that cell, then flattened to a row-major index.
  uint2 pixel_offset = uint2(floor(pixel_coords) - char_pos * char_dim);
  uint glyph_pixel = pixel_offset.x + char_width * pixel_offset.y;

  float font = SampleFont(int2(glyph_pixel, char_out.c));
  return char_out.fg * font + char_out.bg * (1.0 - font);
}
/*
@ -218,18 +222,22 @@ float3 GetFinalPixel(CharResults char_out) {
Terrible in performance, only for reference.
*/
CharResults CalcCharTrivial(uint2 char_pos) {
CharResults CalcCharTrivial(uint2 char_pos)
{
float3 t;
CharResults char_out;
char_out.err = char_pixels * char_pixels;
for (uint c = 0; c < MAX_CHARS; c += 1) {
for (uint c = 0; c < MAX_CHARS; c += 1)
{
float3 ft = float3(0.0, 0.0, 0.0);
for (uint pixel = 0; pixel < char_pixels; pixel += 1) {
for (uint pixel = 0; pixel < char_pixels; pixel += 1)
{
float3 tex = SampleTex(char_pos, pixel);
float font = SampleFont(uint2(pixel, c));
ft += font * tex;
}
if (c == 0) t = ft;
if (c == 0)
t = ft;
CharResults res = CalcCharRes(c, t, ft);
if (res.err < char_out.err)
char_out = res;
@ -238,43 +246,52 @@ CharResults CalcCharTrivial(uint2 char_pos) {
}
/*
However for better performance, some characters are tested at once. This saves some expensive texture() calls.
Also split the loop over the pixels in groups of 32 for only fetching the uint32 of the font once.
For better performance, however, several characters are tested at once, which saves some
expensive texture() calls. The loop over the pixels is also split into groups of 32 so that
each uint32 of the font is fetched only once.
*/
CharResults CalcCharFallback(uint2 char_pos) {
CharResults CalcCharFallback(uint2 char_pos)
{
float3 t;
CharResults char_out;
char_out.err = char_pixels * char_pixels;
for (uint c = 0; c < MAX_CHARS; c += UNROLL_FALLBACK) {
for (uint c = 0; c < MAX_CHARS; c += UNROLL_FALLBACK)
{
// Declare ft
float3 ft[UNROLL_FALLBACK];
for (uint i = 0; i < UNROLL_FALLBACK; i++)
ft[i] = float3(0.0, 0.0, 0.0);
// Split `for p : pixels` in groups of 32. This makes accessing the texture (bit in uint32) easier.
for (uint pixel = 0; pixel < char_pixels; pixel += 32) {
// Split `for p : pixels` in groups of 32. This makes accessing the texture (bit in uint32)
// easier.
for (uint pixel = 0; pixel < char_pixels; pixel += 32)
{
uint font_i[UNROLL_FALLBACK];
for (uint i = 0; i < UNROLL_FALLBACK; i++)
font_i[i] = rasters[c + i][pixel / 32];
for (uint pixel_offset = 0; pixel_offset < 32; pixel_offset += 1) {
for (uint pixel_offset = 0; pixel_offset < 32; pixel_offset += 1)
{
float3 tex = SampleTex(char_pos, pixel + pixel_offset);
// Inner kernel of `ft += font * tex`. Most time is spend in here.
for (uint i = 0; i < UNROLL_FALLBACK; i++) {
for (uint i = 0; i < UNROLL_FALLBACK; i++)
{
float font = (font_i[i] >> pixel_offset) & uint(1);
ft[i] += font * tex;
}
}
}
if (c == 0) {
if (c == 0)
{
// First char has font := 1, so t = ft. Cache this value for the next iterations.
t = ft[0];
}
// Check if this character fits better than the last one.
for (uint i = 0; i < UNROLL_FALLBACK; i++) {
for (uint i = 0; i < UNROLL_FALLBACK; i++)
{
CharResults res = CalcCharRes(c + i, t, ft[i]);
if (res.err < char_out.err)
char_out = res;
@ -289,58 +306,63 @@ CharResults CalcCharFallback(uint2 char_pos) {
- distribute all characters over the lanes and check for them in parallel
- distribute the uniform texture access and broadcast each back to each lane
*/
CharResults CalcCharSIMD(uint2 char_pos, uint simd_width) {
CharResults CalcCharSIMD(uint2 char_pos, uint simd_width)
{
// Font color, bg color, character, error -- of character with minimum error
CharResults char_out;
char_out.err = char_pixels * char_pixels;
float3 t;
#ifdef SUPPORTS_SUBGROUP_REDUCTION
#ifdef SUPPORTS_SUBGROUP_REDUCTION
// Hack: Work in hard-codeded fixed SIMD mode
if (gl_SubgroupInvocationID < simd_width) {
if (gl_SubgroupInvocationID < simd_width)
{
// Loop over all characters
for (uint c = 0; c < MAX_CHARS; c += UNROLL_SIMD * simd_width) {
for (uint c = 0; c < MAX_CHARS; c += UNROLL_SIMD * simd_width)
{
// registers for "sum of font * texture"
float3 ft[UNROLL_SIMD];
for (uint i = 0; i < UNROLL_SIMD; i++)
ft[i] = float3(0.0, 0.0, 0.0);
for (uint pixel = 0; pixel < char_pixels; pixel += 32) {
for (uint pixel = 0; pixel < char_pixels; pixel += 32)
{
// Preload the font uint32 for the next 32 pixels
uint font_i[UNROLL_SIMD];
for (uint i = 0; i < UNROLL_SIMD; i++)
font_i[i] = rasters[c + UNROLL_SIMD*gl_SubgroupInvocationID + i][pixel / 32];
font_i[i] = rasters[c + UNROLL_SIMD * gl_SubgroupInvocationID + i][pixel / 32];
for (uint pixel_offset = 0; pixel_offset < 32; pixel_offset += simd_width) {
// Copy one full WRAP of textures into registers and shuffle them around
// for later usage. This avoids one memory transaction per tested pixel
// & character.
for (uint pixel_offset = 0; pixel_offset < 32; pixel_offset += simd_width)
{
// Copy one full WRAP of textures into registers and shuffle them around for later usage.
// This avoids one memory transaction per tested pixel & character.
float3 tex_simd = SampleTex(char_pos, pixel + pixel_offset + gl_SubgroupInvocationID);
for (uint k = 0; k < simd_width; k += 1) {
for (uint k = 0; k < simd_width; k += 1)
{
float3 tex = subgroupBroadcast(tex_simd, k);
// Note: As pixel iterates based on power-of-two gl_SubgroupSize, the
// const memory access to rasters is CSE'd and the inner loop
// Note: As pixel iterates based on power-of-two gl_SubgroupSize,
// the const memory access to rasters is CSE'd and the inner loop
// after unrolling only contains: testing one bit + shuffle +
// conditional add
for (uint i = 0; i < UNROLL_SIMD; i++) {
for (uint i = 0; i < UNROLL_SIMD; i++)
{
float font = (font_i[i] >> (k + pixel_offset % 32)) & uint(1);
ft[i] += font * tex;
}
}
}
}
if (c == 0) {
if (c == 0)
{
// font[0] is a hardcoded 1 font, so t = ft
t = subgroupBroadcast(ft[0], 0);
}
for (uint i = 0; i < UNROLL_SIMD; i++) {
CharResults res = CalcCharRes(c + UNROLL_SIMD*gl_SubgroupInvocationID + i, t, ft[i]);
for (uint i = 0; i < UNROLL_SIMD; i++)
{
CharResults res = CalcCharRes(c + UNROLL_SIMD * gl_SubgroupInvocationID + i, t, ft[i]);
if (res.err < char_out.err)
char_out = res;
}
@ -355,63 +377,107 @@ CharResults CalcCharSIMD(uint2 char_pos, uint simd_width) {
char_out.c = subgroupBroadcast(char_out.c, smallest);
char_out.err = err_min;
#endif
#endif
return char_out;
}
bool supportsSIMD(uint simd_width) {
#ifdef SUPPORTS_SUBGROUP_REDUCTION
// Reports whether the subgroup can be used as a fixed-width SIMD unit of simd_width lanes:
// true only if the ballot bits of invocations 0 .. simd_width-1 are all set.
// Always returns false when SUPPORTS_SUBGROUP_REDUCTION is not defined.
bool supportsSIMD(uint simd_width)
{
#ifdef SUPPORTS_SUBGROUP_REDUCTION
// Mask with the lowest simd_width bits set. A width of 32 is special-cased instead of
// computing (1u << 32) - 1.
const uint mask = simd_width == 32u ? 0xFFFFFFFFu : (1u << simd_width) - 1;
// Only component 0 of the ballot result is inspected; per the subgroup extension it should
// hold the bits of the first 32 invocations (confirm against the spec).
return (subgroupBallot(true)[0] & mask) == mask;
#else
#else
return false;
#endif
#endif
}
void main() {
// On-screen warning, stored as glyph indices into the font table (ASCII code - 32, with
// spaces written as the empty glyph 95):
// "Error: The AsciiArt shader requires the missing GPU extension KHR_shader_subgroup."
const uint missing_subgroup_warning_len = 82;
const uint missing_subgroup_warning[missing_subgroup_warning_len] = {
    37, 82, 82, 79, 82, 26, 95, 52, 72, 69, 95, 33, 83, 67, 73, 73, 33, 82, 84, 95, 83,
    72, 65, 68, 69, 82, 95, 82, 69, 81, 85, 73, 82, 69, 83, 95, 84, 72, 69, 95, 77, 73,
    83, 83, 73, 78, 71, 95, 39, 48, 53, 95, 69, 88, 84, 69, 78, 83, 73, 79, 78, 95, 43,
    40, 50, 63, 83, 72, 65, 68, 69, 82, 63, 83, 85, 66, 71, 82, 79, 85, 80, 14};
// Renders the missing-subgroup warning as white text on black.
//   char_pos: character cell of the current invocation
// Cells on row 0 that fall inside the message show the matching glyph; every other cell
// shows glyph 95, which draws only the background color.
float3 ShowWarning(uint2 char_pos)
{
  bool outside_message = char_pos.y != 0u || char_pos.x >= missing_subgroup_warning_len;

  // The table is only indexed when the cell lies inside the message.
  uint glyph = 95u;
  if (!outside_message)
    glyph = missing_subgroup_warning[char_pos.x];

  CharResults warning_char;
  warning_char.c = glyph;
  warning_char.bg = float3(0.0, 0.0, 0.0);
  warning_char.fg = float3(1.0, 1.0, 1.0);
  return GetFinalPixel(warning_char);
}
void main()
{
// Calculate the character position of this pixel
float2 resolution = OptionEnabled(USE_WINDOW_RES) ? GetWindowResolution() : GetResolution();
uint2 char_pos_self = uint2(floor(GetCoordinates() * resolution / char_dim));
float3 color_out;
#ifdef SUPPORTS_SUBGROUP_REDUCTION
if (supportsSIMD(8)) {
#ifdef SUPPORTS_SUBGROUP_REDUCTION
if (supportsSIMD(8))
{
// Loop over all character positions covered by this wave
bool pixel_active = !gl_HelperInvocation;
CharResults char_out;
while (true) {
while (true)
{
// Fetch the next active character position
uint4 active_lanes = subgroupBallot(pixel_active);
if (active_lanes == uint4(0, 0, 0, 0)) {
if (active_lanes == uint4(0, 0, 0, 0))
{
break;
}
uint2 char_pos = subgroupBroadcast(char_pos_self, subgroupBallotFindLSB(active_lanes));
// And calculate everything for this character position
if (supportsSIMD(32)) {
if (supportsSIMD(32))
{
char_out = CalcCharSIMD(char_pos, 32);
} else if (supportsSIMD(16)) {
}
else if (supportsSIMD(16))
{
char_out = CalcCharSIMD(char_pos, 16);
} else if (supportsSIMD(8)) {
}
else if (supportsSIMD(8))
{
char_out = CalcCharSIMD(char_pos, 8);
}
// Draw the character on screen
if (char_pos == char_pos_self) {
if (char_pos == char_pos_self)
{
color_out = GetFinalPixel(char_out);
pixel_active = false;
}
if (OptionEnabled(DEBUG_ONLY_ONE_CHAR)) {
if (OptionEnabled(DEBUG_ONLY_ONE_CHAR))
{
break;
}
}
} else
#endif
if (HAVE_FULL_FEATURE_FALLBACK) {
}
else
#else
if (char_pos_self.y <= 1u)
{
color_out = ShowWarning(char_pos_self);
}
else
#endif
if (HAVE_FULL_FEATURE_FALLBACK)
{
color_out = GetFinalPixel(CalcCharFallback(char_pos_self));
} else {
}
else
{
color_out = Sample().xyz;
}