rsx/gl: Implement variable path interpreter for optimal performance

This commit is contained in:
kd-11 2020-04-19 20:23:12 +03:00 committed by Ivan
parent 930bc9179d
commit bc5c4c9205
5 changed files with 285 additions and 109 deletions

View File

@ -223,6 +223,8 @@ vec4 read_cond()
return shuffle(cc[GET_BITS(1, 31, 1)], GET_BITS(1, 21, 8)); return shuffle(cc[GET_BITS(1, 31, 1)], GET_BITS(1, 21, 8));
} }
#ifdef WITH_TEXTURES
vec4 _texture(in vec4 coord, float bias) vec4 _texture(in vec4 coord, float bias)
{ {
const uint tex_num = GET_BITS(0, 17, 4); const uint tex_num = GET_BITS(0, 17, 4);
@ -275,6 +277,8 @@ vec4 _textureLod(in vec4 coord, float lod)
return vec4(0.); return vec4(0.);
} }
#endif
void write_dst(in vec4 value) void write_dst(in vec4 value)
{ {
bvec4 inst_mask = bvec4( bvec4 inst_mask = bvec4(
@ -423,8 +427,11 @@ void main()
value = sin(s0.xxxx); break; value = sin(s0.xxxx); break;
case RSX_FP_OPCODE_NRM: case RSX_FP_OPCODE_NRM:
value.xyz = normalize(s0.xyz); break; value.xyz = normalize(s0.xyz); break;
#ifdef WITH_TEXTURES
case RSX_FP_OPCODE_TEX: case RSX_FP_OPCODE_TEX:
value = _texture(s0, 0.f); break; value = _texture(s0, 0.f); break;
#endif
default: default:
handled = false; handled = false;
} }
@ -470,6 +477,8 @@ void main()
value = s0 / s1.xxxx; value = s0 / s1.xxxx;
case RSX_FP_OPCODE_DIVSQ: case RSX_FP_OPCODE_DIVSQ:
value = s0 * inversesqrt(s1.xxxx); break; value = s0 * inversesqrt(s1.xxxx); break;
#ifdef WITH_TEXTURES
//case RSX_FP_OPCODE_TXP: //case RSX_FP_OPCODE_TXP:
//case RSX_FP_OPCODE_TXD: //case RSX_FP_OPCODE_TXD:
case RSX_FP_OPCODE_TXL: case RSX_FP_OPCODE_TXL:
@ -478,6 +487,7 @@ void main()
value = _texture(s0, s1.x); break; value = _texture(s0, s1.x); break;
//case RSX_FP_OPCODE_TEXBEM: //case RSX_FP_OPCODE_TEXBEM:
//case RSX_FP_OPCODE_TXPBEM: //case RSX_FP_OPCODE_TXPBEM:
#endif
default: default:
handled = false; handled = false;
} }
@ -529,29 +539,41 @@ void main()
write_dst(value); write_dst(value);
} }
if (!shader_attribute(CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS)) #ifdef WITH_HALF_OUTPUT_REGISTER
{
ocol0 = regs16[0]; ocol0 = regs16[0];
ocol1 = regs16[4]; ocol1 = regs16[4];
ocol1 = regs16[6]; ocol1 = regs16[6];
ocol1 = regs16[8]; ocol1 = regs16[8];
} #else
else
{
ocol0 = regs32[0]; ocol0 = regs32[0];
ocol1 = regs32[2]; ocol1 = regs32[2];
ocol1 = regs32[3]; ocol1 = regs32[3];
ocol1 = regs32[4]; ocol1 = regs32[4];
} #endif
if (shader_attribute(CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)) #ifdef WITH_DEPTH_EXPORT
{ gl_FragDepth = regs32[1].z;
gl_FragDepth = regs32[1].z; #endif
}
else // Typically an application will pick one strategy and stick with it
{ #ifdef ALPHA_TEST_GEQUAL
gl_FragDepth = gl_FragCoord.z; if (ocol0.a < alpha_ref) discard; // gequal
} #endif
#ifdef ALPHA_TEST_GREATER
// A fragment passes only when its alpha is strictly greater than the reference,
// so everything at or below the reference must be rejected.
if (ocol0.a <= alpha_ref) discard; // greater
#endif
#ifdef ALPHA_TEST_LESS
if (ocol0.a >= alpha_ref) discard; // less
#endif
#ifdef ALPHA_TEST_LEQUAL
if (ocol0.a > alpha_ref) discard; // lequal
#endif
#ifdef ALPHA_TEST_EQUAL
if (ocol0.a != alpha_ref) discard; // equal
#endif
#ifdef ALPHA_TEST_NEQUAL
if (ocol0.a == alpha_ref) discard; // nequal
#endif
} }
)" )"

View File

@ -5,6 +5,19 @@ namespace program_common
{ {
namespace interpreter namespace interpreter
{ {
// Single-bit feature flags describing one interpreter variant.
// Flags are OR-ed together into an integer option mask by the caller.
enum compiler_option
{
	// Optional shader features
	COMPILER_OPT_ENABLE_TEXTURES      = (1 << 0),
	COMPILER_OPT_ENABLE_DEPTH_EXPORT  = (1 << 1),
	COMPILER_OPT_ENABLE_F32_EXPORT    = (1 << 2),

	// Alpha test comparison baked into the variant
	COMPILER_OPT_ENABLE_ALPHA_TEST_GE = (1 << 3),
	COMPILER_OPT_ENABLE_ALPHA_TEST_G  = (1 << 4),
	COMPILER_OPT_ENABLE_ALPHA_TEST_LE = (1 << 5),
	COMPILER_OPT_ENABLE_ALPHA_TEST_L  = (1 << 6),
	COMPILER_OPT_ENABLE_ALPHA_TEST_EQ = (1 << 7),
	COMPILER_OPT_ENABLE_ALPHA_TEST_NE = (1 << 8),
};
static std::string get_vertex_interpreter() static std::string get_vertex_interpreter()
{ {
const char* s = const char* s =

View File

@ -621,13 +621,21 @@ bool GLGSRender::load_program()
current_vertex_program.skip_vertex_input_check = true; //not needed for us since decoding is done server side current_vertex_program.skip_vertex_input_check = true; //not needed for us since decoding is done server side
current_fragment_program.unnormalized_coords = 0; //unused current_fragment_program.unnormalized_coords = 0; //unused
} }
else if (m_program && else if (m_program)
(m_program != m_shader_interpreter.get() || interpreter_mode == shader_interpreter_mode::forced))
{ {
return true; if (!m_shader_interpreter.is_interpreter(m_program)) [[likely]]
{
return true;
}
if (interpreter_mode == shader_interpreter_mode::forced)
{
m_program = m_shader_interpreter.get(current_fp_metadata);
return true;
}
} }
auto old_program = m_program; const bool was_interpreter = m_shader_interpreter.is_interpreter(m_program);
if (interpreter_mode != shader_interpreter_mode::forced) [[likely]] if (interpreter_mode != shader_interpreter_mode::forced) [[likely]]
{ {
void* pipeline_properties = nullptr; void* pipeline_properties = nullptr;
@ -660,12 +668,16 @@ bool GLGSRender::load_program()
m_program->sync(); m_program->sync();
} }
} }
else
{
m_program = nullptr;
}
if (!m_program && interpreter_mode != shader_interpreter_mode::disabled) if (!m_program && interpreter_mode != shader_interpreter_mode::disabled)
{ {
// Fall back to interpreter // Fall back to interpreter
m_program = m_shader_interpreter.get(); m_program = m_shader_interpreter.get(current_fp_metadata);
if (old_program != m_program) if (was_interpreter != m_shader_interpreter.is_interpreter(m_program))
{ {
// Program has changed, reupload // Program has changed, reupload
m_interpreter_state = rsx::invalidate_pipeline_bits; m_interpreter_state = rsx::invalidate_pipeline_bits;
@ -689,7 +701,7 @@ void GLGSRender::load_program_env()
const bool update_vertex_env = !!(m_graphics_state & rsx::pipeline_state::vertex_state_dirty); const bool update_vertex_env = !!(m_graphics_state & rsx::pipeline_state::vertex_state_dirty);
const bool update_fragment_env = !!(m_graphics_state & rsx::pipeline_state::fragment_state_dirty); const bool update_fragment_env = !!(m_graphics_state & rsx::pipeline_state::fragment_state_dirty);
const bool update_fragment_texture_env = !!(m_graphics_state & rsx::pipeline_state::fragment_texture_state_dirty); const bool update_fragment_texture_env = !!(m_graphics_state & rsx::pipeline_state::fragment_texture_state_dirty);
const bool update_instruction_buffers = (!!m_interpreter_state && m_program == m_shader_interpreter.get()); const bool update_instruction_buffers = (!!m_interpreter_state && m_shader_interpreter.is_interpreter(m_program));
m_program->use(); m_program->use();

View File

@ -44,46 +44,70 @@ namespace gl
void shader_interpreter::create() void shader_interpreter::create()
{ {
texture_pools[0].create(shader::type::vertex);
texture_pools[1].create(shader::type::fragment);
build_vs(); build_vs();
build_fs(); build_program(::program_common::interpreter::COMPILER_OPT_ENABLE_TEXTURES);
build_program(::program_common::interpreter::COMPILER_OPT_ENABLE_TEXTURES | ::program_common::interpreter::COMPILER_OPT_ENABLE_F32_EXPORT);
program_handle.create().
attach(vs).
attach(fs).
link();
program_handle.uniforms[0] = GL_STREAM_BUFFER_START + 0;
program_handle.uniforms[1] = GL_STREAM_BUFFER_START + 1;
// Initialize texture bindings
int assigned = 0;
auto& allocator = texture_pools[1];
const char* type_names[] = { "sampler1D_array", "sampler2D_array", "samplerCube_array", "sampler3D_array" };
for (int i = 0; i < 4; ++i)
{
for (int j = 0; j < allocator.pools[i].pool_size; ++j)
{
allocator.pools[i].allocate(assigned++);
}
program_handle.uniforms[type_names[i]] = allocator.pools[i].allocated;
}
} }
void shader_interpreter::destroy() void shader_interpreter::destroy()
{ {
program_handle.remove(); for (auto& prog : m_program_cache)
vs.remove(); {
fs.remove(); prog.second->fs.remove();
prog.second->prog.remove();
}
m_vs.remove();
} }
glsl::program* shader_interpreter::get() glsl::program* shader_interpreter::get(const interpreter::program_metadata& metadata)
{ {
return &program_handle; // Build options
u64 opt = 0;
if (rsx::method_registers.alpha_test_enabled()) [[unlikely]]
{
switch (rsx::method_registers.alpha_func())
{
case rsx::comparison_function::always:
break;
case rsx::comparison_function::never:
return nullptr;
case rsx::comparison_function::greater_or_equal:
opt |= program_common::interpreter::COMPILER_OPT_ENABLE_ALPHA_TEST_GE;
break;
case rsx::comparison_function::greater:
opt |= program_common::interpreter::COMPILER_OPT_ENABLE_ALPHA_TEST_G;
break;
case rsx::comparison_function::less_or_equal:
opt |= program_common::interpreter::COMPILER_OPT_ENABLE_ALPHA_TEST_LE;
break;
case rsx::comparison_function::less:
opt |= program_common::interpreter::COMPILER_OPT_ENABLE_ALPHA_TEST_L;
break;
case rsx::comparison_function::equal:
opt |= program_common::interpreter::COMPILER_OPT_ENABLE_ALPHA_TEST_EQ;
break;
case rsx::comparison_function::not_equal:
opt |= program_common::interpreter::COMPILER_OPT_ENABLE_ALPHA_TEST_NE;
break;
}
}
if (rsx::method_registers.shader_control() & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT) opt |= program_common::interpreter::COMPILER_OPT_ENABLE_DEPTH_EXPORT;
if (rsx::method_registers.shader_control() & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS) opt |= program_common::interpreter::COMPILER_OPT_ENABLE_F32_EXPORT;
if (metadata.referenced_textures_mask) opt |= program_common::interpreter::COMPILER_OPT_ENABLE_TEXTURES;
if (auto it = m_program_cache.find(opt); it != m_program_cache.end()) [[likely]]
{
m_current_interpreter = it->second.get();
}
else
{
m_current_interpreter = build_program(opt);
}
return &m_current_interpreter->prog;
} }
void shader_interpreter::build_vs() void shader_interpreter::build_vs()
@ -124,43 +148,47 @@ namespace gl
builder << program_common::interpreter::get_vertex_interpreter(); builder << program_common::interpreter::get_vertex_interpreter();
const std::string s = builder.str(); const std::string s = builder.str();
vs.create(glsl::shader::type::vertex); m_vs.create(glsl::shader::type::vertex);
vs.source(s); m_vs.source(s);
vs.compile(); m_vs.compile();
} }
void shader_interpreter::build_fs() void shader_interpreter::build_fs(u64 compiler_options, interpreter::cached_program& prog_data)
{ {
// Allocate TIUs // Allocate TIUs
auto& allocator = texture_pools[1]; auto& allocator = prog_data.allocator;
if (allocator.max_image_units >= 32) if (compiler_options & program_common::interpreter::COMPILER_OPT_ENABLE_TEXTURES)
{ {
// 16 + 4 + 4 + 4 allocator.create(glsl::shader::type::fragment);
allocator.allocate(4); // 1D if (allocator.max_image_units >= 32)
allocator.allocate(16); // 2D {
allocator.allocate(4); // CUBE // 16 + 4 + 4 + 4
allocator.allocate(4); // 3D allocator.allocate(4); // 1D
} allocator.allocate(16); // 2D
else if (allocator.max_image_units >= 24) allocator.allocate(4); // CUBE
{ allocator.allocate(4); // 3D
// 16 + 4 + 2 + 2 }
allocator.allocate(2); // 1D else if (allocator.max_image_units >= 24)
allocator.allocate(16); // 2D {
allocator.allocate(2); // CUBE // 16 + 4 + 2 + 2
allocator.allocate(4); // 3D allocator.allocate(2); // 1D
} allocator.allocate(16); // 2D
else if (allocator.max_image_units >= 16) allocator.allocate(2); // CUBE
{ allocator.allocate(4); // 3D
// 10 + 2 + 2 + 2 }
allocator.allocate(2); // 1D else if (allocator.max_image_units >= 16)
allocator.allocate(10); // 2D {
allocator.allocate(2); // CUBE // 10 + 2 + 2 + 2
allocator.allocate(2); // 3D allocator.allocate(2); // 1D
} allocator.allocate(10); // 2D
else allocator.allocate(2); // CUBE
{ allocator.allocate(2); // 3D
// Unusable }
rsx_log.fatal("Failed to allocate enough TIUs for shader interpreter."); else
{
// Unusable
rsx_log.fatal("Failed to allocate enough TIUs for shader interpreter.");
}
} }
::glsl::shader_properties properties{}; ::glsl::shader_properties properties{};
@ -182,18 +210,67 @@ namespace gl
::glsl::insert_subheader_block(builder); ::glsl::insert_subheader_block(builder);
comp.insertConstants(builder); comp.insertConstants(builder);
const char* type_names[] = { "sampler1D", "sampler2D", "samplerCube", "sampler3D" }; if (compiler_options & program_common::interpreter::COMPILER_OPT_ENABLE_ALPHA_TEST_GE)
for (int i = 0; i < 4; ++i)
{ {
builder << "uniform " << type_names[i] << " " << type_names[i] << "_array[" << allocator.pools[i].pool_size << "];\n"; builder << "#define ALPHA_TEST_GEQUAL\n";
} }
builder << "\n" if (compiler_options & program_common::interpreter::COMPILER_OPT_ENABLE_ALPHA_TEST_G)
"#define IS_TEXTURE_RESIDENT(index) (texture_handles[index] < 0xFF)\n" {
"#define SAMPLER1D(index) sampler1D_array[texture_handles[index]]\n" builder << "#define ALPHA_TEST_GREATER\n";
"#define SAMPLER2D(index) sampler2D_array[texture_handles[index]]\n" }
"#define SAMPLER3D(index) sampler3D_array[texture_handles[index]]\n"
"#define SAMPLERCUBE(index) samplerCube_array[texture_handles[index]]\n\n"; if (compiler_options & program_common::interpreter::COMPILER_OPT_ENABLE_ALPHA_TEST_LE)
{
builder << "#define ALPHA_TEST_LEQUAL\n";
}
if (compiler_options & program_common::interpreter::COMPILER_OPT_ENABLE_ALPHA_TEST_L)
{
builder << "#define ALPHA_TEST_LESS\n";
}
if (compiler_options & program_common::interpreter::COMPILER_OPT_ENABLE_ALPHA_TEST_EQ)
{
builder << "#define ALPHA_TEST_EQUAL\n";
}
if (compiler_options & program_common::interpreter::COMPILER_OPT_ENABLE_ALPHA_TEST_NE)
{
builder << "#define ALPHA_TEST_NEQUAL\n";
}
if (!(compiler_options & program_common::interpreter::COMPILER_OPT_ENABLE_F32_EXPORT))
{
builder << "#define WITH_HALF_OUTPUT_REGISTER\n";
}
if (compiler_options & program_common::interpreter::COMPILER_OPT_ENABLE_DEPTH_EXPORT)
{
builder << "#define WITH_DEPTH_EXPORT\n";
}
if (compiler_options & program_common::interpreter::COMPILER_OPT_ENABLE_TEXTURES)
{
builder << "#define WITH_TEXTURES\n\n";
const char* type_names[] = { "sampler1D", "sampler2D", "samplerCube", "sampler3D" };
for (int i = 0; i < 4; ++i)
{
builder << "uniform " << type_names[i] << " " << type_names[i] << "_array[" << allocator.pools[i].pool_size << "];\n";
}
builder << "\n"
"#define IS_TEXTURE_RESIDENT(index) (texture_handles[index] < 0xFF)\n"
"#define SAMPLER1D(index) sampler1D_array[texture_handles[index]]\n"
"#define SAMPLER2D(index) sampler2D_array[texture_handles[index]]\n"
"#define SAMPLER3D(index) sampler3D_array[texture_handles[index]]\n"
"#define SAMPLERCUBE(index) samplerCube_array[texture_handles[index]]\n\n";
}
else if (compiler_options)
{
builder << "\n";
}
builder << builder <<
"layout(std430, binding =" << GL_INTERPRETER_FRAGMENT_BLOCK << ") readonly restrict buffer FragmentInstructionBlock\n" "layout(std430, binding =" << GL_INTERPRETER_FRAGMENT_BLOCK << ") readonly restrict buffer FragmentInstructionBlock\n"
@ -211,22 +288,62 @@ namespace gl
builder << program_common::interpreter::get_fragment_interpreter(); builder << program_common::interpreter::get_fragment_interpreter();
const std::string s = builder.str(); const std::string s = builder.str();
fs.create(glsl::shader::type::fragment); prog_data.fs.create(glsl::shader::type::fragment);
fs.source(s); prog_data.fs.source(s);
fs.compile(); prog_data.fs.compile();
}
// Builds the interpreter variant described by `compiler_options`: compiles its fragment
// shader, links it against the shared vertex shader, sets up the initial uniform bindings
// and stores the result in m_program_cache under the option mask.
// Returns a non-owning pointer to the cached entry; the cache keeps ownership.
interpreter::cached_program* shader_interpreter::build_program(u64 compiler_options)
{
	// Own the entry from the moment it exists so it is released if any step below throws
	auto data = std::make_unique<interpreter::cached_program>();
	build_fs(compiler_options, *data);

	data->prog.create().
		attach(m_vs).
		attach(data->fs).
		link();

	data->prog.uniforms[0] = GL_STREAM_BUFFER_START + 0;
	data->prog.uniforms[1] = GL_STREAM_BUFFER_START + 1;

	if (compiler_options & program_common::interpreter::COMPILER_OPT_ENABLE_TEXTURES)
	{
		// Initialize texture bindings: hand out consecutive image unit indices across the
		// four sampler pools, then upload each pool's assignment to its sampler array
		int assigned = 0;
		auto& allocator = data->allocator;
		const char* type_names[] = { "sampler1D_array", "sampler2D_array", "samplerCube_array", "sampler3D_array" };

		for (int i = 0; i < 4; ++i)
		{
			for (int j = 0; j < allocator.pools[i].pool_size; ++j)
			{
				allocator.pools[i].allocate(assigned++);
			}

			data->prog.uniforms[type_names[i]] = allocator.pools[i].allocated;
		}
	}

	// Take the raw pointer before ownership moves into the cache
	auto* result = data.get();
	m_program_cache[compiler_options] = std::move(data);
	return result;
}
// Returns true when `program` is the program object of the currently selected
// interpreter variant, false otherwise.
bool shader_interpreter::is_interpreter(const glsl::program* program)
{
	// m_current_interpreter starts out null and is only assigned inside get();
	// forming &m_current_interpreter->prog through a null pointer is undefined behaviour
	if (!m_current_interpreter)
	{
		return false;
	}

	return (program == &m_current_interpreter->prog);
}
void shader_interpreter::update_fragment_textures( void shader_interpreter::update_fragment_textures(
const std::array<std::unique_ptr<rsx::sampled_image_descriptor_base>, 16>& descriptors, const std::array<std::unique_ptr<rsx::sampled_image_descriptor_base>, 16>& descriptors,
u16 reference_mask, u32* out) u16 reference_mask, u32* out)
{ {
if (reference_mask == 0) if (reference_mask == 0 || !m_current_interpreter)
{ {
return; return;
} }
// Reset allocation // Reset allocation
auto& allocator = texture_pools[1]; auto& allocator = m_current_interpreter->allocator;
for (unsigned i = 0; i < 4; ++i) for (unsigned i = 0; i < 4; ++i)
{ {
allocator.pools[i].num_used = 0; allocator.pools[i].num_used = 0;
@ -306,9 +423,9 @@ namespace gl
} }
} }
if (allocator.pools[0].flags) program_handle.uniforms["sampler1D_array"] = allocator.pools[0].allocated; if (allocator.pools[0].flags) m_current_interpreter->prog.uniforms["sampler1D_array"] = allocator.pools[0].allocated;
if (allocator.pools[1].flags) program_handle.uniforms["sampler2D_array"] = allocator.pools[1].allocated; if (allocator.pools[1].flags) m_current_interpreter->prog.uniforms["sampler2D_array"] = allocator.pools[1].allocated;
if (allocator.pools[2].flags) program_handle.uniforms["samplerCube_array"] = allocator.pools[2].allocated; if (allocator.pools[2].flags) m_current_interpreter->prog.uniforms["samplerCube_array"] = allocator.pools[2].allocated;
if (allocator.pools[3].flags) program_handle.uniforms["sampler3D_array"] = allocator.pools[3].allocated; if (allocator.pools[3].flags) m_current_interpreter->prog.uniforms["sampler3D_array"] = allocator.pools[3].allocated;
} }
} }

View File

@ -1,10 +1,13 @@
#pragma once #pragma once
#include "GLHelpers.h" #include "GLHelpers.h"
#include "../Common/ProgramStateCache.h"
namespace gl namespace gl
{ {
namespace interpreter namespace interpreter
{ {
using program_metadata = program_hash_util::fragment_program_utils::fragment_program_metadata;
enum class texture_pool_flags enum class texture_pool_flags
{ {
dirty = 1 dirty = 1
@ -48,17 +51,25 @@ namespace gl
void create(::gl::glsl::shader::type domain); void create(::gl::glsl::shader::type domain);
void allocate(int size); void allocate(int size);
}; };
// One compiled interpreter variant. Instances are created by
// shader_interpreter::build_program and stored in its program cache, keyed by option mask.
struct cached_program
{
// Fragment shader compiled for this variant by shader_interpreter::build_fs
glsl::shader fs;
// Program linked from the shared vertex shader and `fs`
glsl::program prog;
// Texture pool allocator for this variant; build_fs only calls create() on it
// when the variant is built with texture support
texture_pool_allocator allocator;
};
} }
class shader_interpreter class shader_interpreter
{ {
glsl::shader vs; glsl::shader m_vs;
glsl::shader fs; std::unordered_map<u64, std::unique_ptr<interpreter::cached_program>> m_program_cache;
glsl::program program_handle;
interpreter::texture_pool_allocator texture_pools[2];
void build_vs(); void build_vs();
void build_fs(); void build_fs(u64 compiler_options, interpreter::cached_program& prog_data);
interpreter::cached_program* build_program(u64 compiler_options);
interpreter::cached_program* m_current_interpreter = nullptr;
public: public:
void create(); void create();
@ -66,6 +77,7 @@ namespace gl
void update_fragment_textures(const std::array<std::unique_ptr<rsx::sampled_image_descriptor_base>, 16>& descriptors, u16 reference_mask, u32* out); void update_fragment_textures(const std::array<std::unique_ptr<rsx::sampled_image_descriptor_base>, 16>& descriptors, u16 reference_mask, u32* out);
glsl::program* get(); glsl::program* get(const interpreter::program_metadata& fp_metadata);
bool is_interpreter(const glsl::program* program);
}; };
} }