gl: Fix D24X8 accelerated encode/decode

- PS3 D24X8 is swapped as a full word, unlike PC.
- Add missing paths to handle custom swap behavior.
This commit is contained in:
kd-11 2022-09-21 23:15:40 +03:00 committed by kd-11
parent 81fa3da101
commit 362a26a404
5 changed files with 76 additions and 33 deletions

View File

@ -207,7 +207,8 @@ namespace gl
compute_task::run(cmd, num_invocations);
}
cs_shuffle_d32fx8_to_x8d24f::cs_shuffle_d32fx8_to_x8d24f()
template <bool SwapBytes>
cs_shuffle_d32fx8_to_x8d24f<SwapBytes>::cs_shuffle_d32fx8_to_x8d24f()
{
uniforms = "uniform uint in_ptr, out_ptr;\n";
@ -223,15 +224,22 @@ namespace gl
" value |= stencil;\n"
" data[index + out_ptr] = bswap_u32(value);\n";
if constexpr (!SwapBytes)
{
work_kernel = fmt::replace_all(work_kernel, "bswap_u32(value)", "value", 1);
}
cs_shuffle_base::build("");
}
void cs_shuffle_d32fx8_to_x8d24f::bind_resources()
// Binds m_data as an SSBO at compute buffer slot 0, passing m_data_offset and
// m_ssbo_length as the range arguments.
template <bool SwapBytes>
void cs_shuffle_d32fx8_to_x8d24f<SwapBytes>::bind_resources()
{
	const auto ssbo_slot = GL_COMPUTE_BUFFER_SLOT(0);
	m_data->bind_range(gl::buffer::target::ssbo, ssbo_slot, m_data_offset, m_ssbo_length);
}
void cs_shuffle_d32fx8_to_x8d24f::run(gl::command_context& cmd, const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
template <bool SwapBytes>
void cs_shuffle_d32fx8_to_x8d24f<SwapBytes>::run(gl::command_context& cmd, const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
{
u32 data_offset;
if (src_offset > dst_offset)
@ -250,7 +258,11 @@ namespace gl
cs_shuffle_base::run(cmd, data, num_texels * 4, data_offset);
}
cs_shuffle_x8d24f_to_d32fx8::cs_shuffle_x8d24f_to_d32fx8()
template cs_shuffle_d32fx8_to_x8d24f<true>;
template cs_shuffle_d32fx8_to_x8d24f<false>;
template <bool SwapBytes>
cs_shuffle_x8d24f_to_d32fx8<SwapBytes>::cs_shuffle_x8d24f_to_d32fx8()
{
uniforms = "uniform uint texel_count, in_ptr, out_ptr;\n";
@ -267,15 +279,22 @@ namespace gl
" data[index * 2 + out_offset] = d24f_to_f32(depth);\n"
" data[index * 2 + (out_offset + 1)] = stencil;\n";
if constexpr (!SwapBytes)
{
work_kernel = fmt::replace_all(work_kernel, "value = bswap_u32(value)", "// value = bswap_u32(value)", 1);
}
cs_shuffle_base::build("");
}
void cs_shuffle_x8d24f_to_d32fx8::bind_resources()
// Binds m_data as an SSBO at compute buffer slot 0, passing m_data_offset and
// m_ssbo_length as the range arguments.
template <bool SwapBytes>
void cs_shuffle_x8d24f_to_d32fx8<SwapBytes>::bind_resources()
{
	const auto ssbo_slot = GL_COMPUTE_BUFFER_SLOT(0);
	m_data->bind_range(gl::buffer::target::ssbo, ssbo_slot, m_data_offset, m_ssbo_length);
}
void cs_shuffle_x8d24f_to_d32fx8::run(gl::command_context& cmd, const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
template <bool SwapBytes>
void cs_shuffle_x8d24f_to_d32fx8<SwapBytes>::run(gl::command_context& cmd, const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
{
u32 data_offset;
if (src_offset > dst_offset)
@ -294,6 +313,9 @@ namespace gl
cs_shuffle_base::run(cmd, data, num_texels * 4, data_offset);
}
template cs_shuffle_x8d24f_to_d32fx8<true>;
template cs_shuffle_x8d24f_to_d32fx8<false>;
cs_d24x8_to_ssbo::cs_d24x8_to_ssbo()
{
initialize();
@ -332,11 +354,11 @@ namespace gl
}
// This method is callable in sensitive code and must restore the GL state on exit
gl::saved_sampler_state save_0(GL_COMPUTE_BUFFER_SLOT(0), m_sampler);
gl::saved_sampler_state save_1(GL_COMPUTE_BUFFER_SLOT(1), m_sampler);
gl::saved_sampler_state save_sampler0(GL_COMPUTE_BUFFER_SLOT(0), m_sampler);
gl::saved_sampler_state save_sampler1(GL_COMPUTE_BUFFER_SLOT(1), m_sampler);
gl::bind_image_view_safe(cmd, GL_COMPUTE_BUFFER_SLOT(0), depth_view);
gl::bind_image_view_safe(cmd, GL_COMPUTE_BUFFER_SLOT(1), stencil_view);
gl::bind_image_view_safe save_image1(cmd, GL_COMPUTE_BUFFER_SLOT(0), depth_view);
gl::bind_image_view_safe save_image2(cmd, GL_COMPUTE_BUFFER_SLOT(1), stencil_view);
dst->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(2), out_offset, row_pitch * 4 * region.height);
@ -383,8 +405,8 @@ namespace gl
}
// This method is callable in sensitive code and must restore the GL state on exit
gl::saved_sampler_state save(GL_COMPUTE_BUFFER_SLOT(0), m_sampler);
gl::bind_image_view_safe(cmd, GL_COMPUTE_BUFFER_SLOT(0), data_view);
gl::saved_sampler_state save_sampler(GL_COMPUTE_BUFFER_SLOT(0), m_sampler);
gl::bind_image_view_safe save_image(cmd, GL_COMPUTE_BUFFER_SLOT(0), data_view);
dst->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(1), out_offset, row_pitch * 4 * region.height);

View File

@ -78,6 +78,7 @@ namespace gl
}
};
template <bool SwapBytes>
struct cs_shuffle_d32fx8_to_x8d24f : cs_shuffle_base
{
u32 m_ssbo_length = 0;
@ -89,6 +90,7 @@ namespace gl
void run(gl::command_context& cmd, const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels);
};
template <bool SwapBytes>
struct cs_shuffle_x8d24f_to_d32fx8 : cs_shuffle_base
{
u32 m_ssbo_length = 0;

View File

@ -289,10 +289,10 @@ namespace gl
}
if (auto as_vi = dynamic_cast<const gl::viewable_image*>(src);
gl::get_driver_caps().vendor_AMD &&
src->get_target() == gl::texture::target::texture2D &&
as_vi)
{
// RGBA8 <-> D24X8 bitcasts are some very common conversions due to some PS3 coding hacks & workarounds.
switch (src->get_internal_format())
{
case gl::texture::internal_format::depth24_stencil8:
@ -337,8 +337,16 @@ namespace gl
mem_info->memory_required = (mem_info->image_size_in_texels * 6);
ensure(!initialize_scratch_mem());
get_compute_task<cs_fconvert_task<f32, f16, false, true>>()->run(cmd, dst, dst_offset,
static_cast<u32>(mem_info->image_size_in_bytes), static_cast<u32>(mem_info->image_size_in_bytes));
if (pack_info.swap_bytes) [[ likely ]]
{
get_compute_task<cs_fconvert_task<f32, f16, false, true>>()->run(cmd, dst, dst_offset,
static_cast<u32>(mem_info->image_size_in_bytes), static_cast<u32>(mem_info->image_size_in_bytes));
}
else
{
get_compute_task<cs_fconvert_task<f32, f16, false, false>>()->run(cmd, dst, dst_offset,
static_cast<u32>(mem_info->image_size_in_bytes), static_cast<u32>(mem_info->image_size_in_bytes));
}
result = reinterpret_cast<void*>(mem_info->image_size_in_bytes + dst_offset);
}
else if (pack_info.type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV)
@ -347,8 +355,16 @@ namespace gl
mem_info->memory_required = (mem_info->image_size_in_texels * 12);
ensure(!initialize_scratch_mem());
get_compute_task<cs_shuffle_d32fx8_to_x8d24f>()->run(cmd, dst, dst_offset,
static_cast<u32>(mem_info->image_size_in_bytes), static_cast<u32>(mem_info->image_size_in_texels));
if (pack_info.swap_bytes)
{
get_compute_task<cs_shuffle_d32fx8_to_x8d24f<true>>()->run(cmd, dst, dst_offset,
static_cast<u32>(mem_info->image_size_in_bytes), static_cast<u32>(mem_info->image_size_in_texels));
}
else
{
get_compute_task<cs_shuffle_d32fx8_to_x8d24f<false>>()->run(cmd, dst, dst_offset,
static_cast<u32>(mem_info->image_size_in_bytes), static_cast<u32>(mem_info->image_size_in_texels));
}
result = reinterpret_cast<void*>(mem_info->image_size_in_bytes + dst_offset);
}
else
@ -501,7 +517,6 @@ namespace gl
else
{
// Stencil format on NV. Use driver upload path
if (unpack_info.type == GL_UNSIGNED_INT_24_8)
{
if (auto job = get_trivial_transform_job(unpack_info))
@ -517,7 +532,15 @@ namespace gl
{
mem_info->memory_required = (mem_info->image_size_in_texels * 8);
initialize_scratch_mem();
get_compute_task<cs_shuffle_x8d24f_to_d32fx8>()->run(cmd, transfer_buf, in_offset, out_offset, static_cast<u32>(mem_info->image_size_in_texels));
if (unpack_info.swap_bytes)
{
get_compute_task<cs_shuffle_x8d24f_to_d32fx8<true>>()->run(cmd, transfer_buf, in_offset, out_offset, static_cast<u32>(mem_info->image_size_in_texels));
}
else
{
get_compute_task<cs_shuffle_x8d24f_to_d32fx8<false>>()->run(cmd, transfer_buf, in_offset, out_offset, static_cast<u32>(mem_info->image_size_in_texels));
}
}
else
{

View File

@ -14,8 +14,8 @@ R"(
#define FMT_GL_BGR5_A1 0x99F0
#define FMT_GL_RGBA4 0x8056
#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8
#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24
// Byte-swap helpers. The argument and the whole expansion are parenthesized so the
// macros stay correct with compound arguments and inside larger expressions.
// bswap_u16 swaps the two bytes inside each 16-bit half; bswap_u32 reverses all four bytes.
#define bswap_u16(bits) ((((bits) & 0xFFu) << 8u) | (((bits) & 0xFF00u) >> 8u) | (((bits) & 0xFF0000u) << 8u) | (((bits) & 0xFF000000u) >> 8u))
#define bswap_u32(bits) ((((bits) & 0xFFu) << 24u) | (((bits) & 0xFF00u) << 8u) | (((bits) & 0xFF0000u) >> 8u) | (((bits) & 0xFF000000u) >> 24u))
layout(location=0) out vec4 outColor;
@ -73,18 +73,10 @@ uint readUint32(const in uint address)
uvec2 readUint24_8(const in uint address)
{
const uint raw_value = data[address];
const uint stencil = bitfieldExtract(raw_value, 0, 8);
if (swap_bytes != 0)
{
const uint depth = min(bswap_u32(raw_value), 0xffffff);
return uvec2(depth, stencil);
}
const uint raw_value = readUint32(address);
return uvec2(
bitfieldExtract(raw_value, 8, 24),
stencil
bitfieldExtract(raw_value, 0, 8)
);
}

View File

@ -5,6 +5,8 @@ layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;
#define IMAGE_LOCATION(x) (x + %loc)
#define SSBO_LOCATION IMAGE_LOCATION(2)
// Full-word byte reversal. Argument and expansion are parenthesized so the macro is
// safe with compound arguments and inside larger expressions.
#define bswap_u32(bits) ((((bits) & 0xFFu) << 24u) | (((bits) & 0xFF00u) << 8u) | (((bits) & 0xFF0000u) >> 8u) | (((bits) & 0xFF000000u) >> 24u))
layout(%set, binding=IMAGE_LOCATION(0)) uniform sampler2D depthData;
layout(%set, binding=IMAGE_LOCATION(1)) uniform usampler2D stencilData;
@ -62,13 +64,15 @@ void main()
float depth = texelFetch(depthData, coord, 0).x;
uint stencil = texelFetch(stencilData, coord, 0).x;
uint depth_bytes = uint(depth * 0xffffff);
uint value = (depth_bytes << 8) | stencil;
if (swap_bytes != 0)
{
depth_bytes = (bitfieldExtract(depth_bytes, 0, 8) << 16u) | (bitfieldExtract(depth_bytes, 16, 8) << 0u) | depth_bytes & 0xFF00u;
// PS3-style byteswap (full word). PC byteswap is slightly different.
value = bswap_u32(value);
}
data[input_coord_to_output_id(coord)] = (depth_bytes << 8) | stencil;
data[input_coord_to_output_id(coord)] = value;
}
}
)"