Merge pull request #528 from DrChat/glsl_optimizations

GLSL Optimizations
This commit is contained in:
Ben Vanik 2016-01-30 17:46:09 -08:00
commit b696c645f5
2 changed files with 182 additions and 115 deletions

View File

@ -110,19 +110,19 @@ layout(std430, column_major) buffer;
// This must match DrawBatcher::CommonHeader. // This must match DrawBatcher::CommonHeader.
struct StateData { struct StateData {
vec4 window_scale; vec4 window_scale; // 0x0
vec4 vtx_fmt; vec4 vtx_fmt; // 0x10
vec4 alpha_test; vec4 alpha_test; // 0x20
uint ps_param_gen; uint ps_param_gen; // 0x30
uint padding[3]; uint padding[3]; // 0x34
// TODO(benvanik): variable length. // TODO(benvanik): variable length.
uvec2 texture_samplers[32]; uvec2 texture_samplers[32]; // 0x40
uint texture_swizzles[32]; uint texture_swizzles[32]; // 0x140
vec4 float_consts[512]; vec4 float_consts[512]; // 0x1C0
int bool_consts[8]; int bool_consts[8]; // 0x21C0
int loop_consts[32]; int loop_consts[32]; // 0x2240
}; };
layout(binding = 0) buffer State { layout(binding = 0) readonly buffer State {
StateData states[]; StateData states[];
}; };
@ -194,33 +194,33 @@ layout(location = 1) out VertexData vtx;
vec3 get_10_11_11_u(const uint data_in) { vec3 get_10_11_11_u(const uint data_in) {
vec3 vec; vec3 vec;
vec.x = bitfieldExtract(data_in, 0, 10); vec.x = bitfieldExtract(data_in, 22, 10);
vec.y = bitfieldExtract(data_in, 10, 11); vec.y = bitfieldExtract(data_in, 11, 11);
vec.z = bitfieldExtract(data_in, 21, 11); vec.z = bitfieldExtract(data_in, 0, 11);
return vec; return vec;
} }
vec3 get_10_11_11_s(const int data_in) { vec3 get_10_11_11_s(const int data_in) {
vec3 vec; vec3 vec;
vec.x = bitfieldExtract(data_in, 0, 10); vec.x = bitfieldExtract(data_in, 22, 10);
vec.y = bitfieldExtract(data_in, 10, 11); vec.y = bitfieldExtract(data_in, 11, 11);
vec.z = bitfieldExtract(data_in, 21, 11); vec.z = bitfieldExtract(data_in, 0, 11);
return vec; return vec;
} }
vec4 get_2_10_10_10_u(const uint data_in) { vec4 get_2_10_10_10_u(const uint data_in) {
vec4 vec; vec4 vec;
vec.w = bitfieldExtract(data_in, 0, 2 ); vec.x = bitfieldExtract(data_in, 20, 10);
vec.x = bitfieldExtract(data_in, 2, 10); vec.y = bitfieldExtract(data_in, 10, 10);
vec.y = bitfieldExtract(data_in, 12, 10); vec.z = bitfieldExtract(data_in, 0, 10);
vec.z = bitfieldExtract(data_in, 22, 10); vec.w = bitfieldExtract(data_in, 30, 2);
return vec; return vec;
} }
vec4 get_2_10_10_10_s(const int data_in) { vec4 get_2_10_10_10_s(const int data_in) {
vec4 vec; vec4 vec;
vec.w = bitfieldExtract(data_in, 0, 2 ); vec.x = bitfieldExtract(data_in, 20, 10);
vec.x = bitfieldExtract(data_in, 2, 10); vec.y = bitfieldExtract(data_in, 10, 10);
vec.y = bitfieldExtract(data_in, 12, 10); vec.z = bitfieldExtract(data_in, 0, 10);
vec.z = bitfieldExtract(data_in, 22, 10); vec.w = bitfieldExtract(data_in, 30, 2);
return vec; return vec;
} }
@ -229,14 +229,9 @@ vec4 applyTransform(const in StateData state, vec4 pos) {
// w is 1/W0, so fix it. // w is 1/W0, so fix it.
pos.w = 1.0 / pos.w; pos.w = 1.0 / pos.w;
} }
if (state.vtx_fmt.x != 0.0) {
// Already multiplied by 1/W0, so pull it out. // Already multiplied by 1/W0, so pull it out.
pos.xy /= pos.w; pos.xyz = mix(pos.xyz, pos.xyz / pos.w, notEqual(state.vtx_fmt.xyz, vec3(0.0)));
}
if (state.vtx_fmt.z != 0.0) {
// Already multiplied by 1/W0, so pull it out.
pos.z /= pos.w;
}
pos.xy *= state.window_scale.xy; pos.xy *= state.window_scale.xy;
return pos; return pos;
}; };
@ -244,9 +239,6 @@ void processVertex(const in StateData state);
void main() { void main() {
gl_Position = vec4(0.0, 0.0, 0.0, 1.0); gl_Position = vec4(0.0, 0.0, 0.0, 1.0);
gl_PointSize = 1.0; gl_PointSize = 1.0;
for (int i = 0; i < vtx.o.length(); ++i) {
vtx.o[i] = vec4(0.0, 0.0, 0.0, 0.0);
}
const StateData state = states[gl_DrawIDARB]; const StateData state = states[gl_DrawIDARB];
processVertex(state); processVertex(state);
gl_Position = applyTransform(state, gl_Position); gl_Position = applyTransform(state, gl_Position);
@ -255,6 +247,18 @@ void main() {
)"); )");
} else { } else {
EmitSource(R"( EmitSource(R"(
float getWeights1D(sampler1D tex, float texCoord) {
return fract(texCoord * textureSize(tex, 0));
}
vec2 getWeights2D(sampler2D tex, vec2 texCoord) {
return fract(texCoord * textureSize(tex, 0));
}
vec3 getWeights3D(sampler3D tex, vec3 texCoord) {
return fract(texCoord * textureSize(tex, 0));
}
layout(origin_upper_left, pixel_center_integer) in vec4 gl_FragCoord; layout(origin_upper_left, pixel_center_integer) in vec4 gl_FragCoord;
layout(location = 0) flat in uint draw_id; layout(location = 0) flat in uint draw_id;
layout(location = 1) in VertexData vtx; layout(location = 1) in VertexData vtx;
@ -323,9 +327,20 @@ void main() {
// Represents number of times remaining to loop. // Represents number of times remaining to loop.
EmitSource(" ivec4 loop_count = ivec4(0);\n"); EmitSource(" ivec4 loop_count = ivec4(0);\n");
// Previous Vector result (used as a scratch).
EmitSource(" vec4 pv;\n");
// Previous Scalar result (used for RETAIN_PREV).
EmitSource(" float ps;\n");
// Temps for source register values.
EmitSource(" vec4 src0;\n");
EmitSource(" vec4 src1;\n");
EmitSource(" vec4 src2;\n");
// Temporary registers. // Temporary registers.
if (is_vertex_shader()) { if (is_vertex_shader()) {
EmitSource(" vec4 r[64];\n"); EmitSource(" vec4 r[64];\n");
// FIXME: We're probably supposed to use a vs_param_gen here.
EmitSource(" r[0].x = gl_VertexID;\n"); EmitSource(" r[0].x = gl_VertexID;\n");
} else { } else {
// Bring interpolators from vertex shader into temporary registers. // Bring interpolators from vertex shader into temporary registers.
@ -337,22 +352,27 @@ void main() {
EmitSource( EmitSource(
" vec4 ps_param_gen = vec4(gl_FragCoord.xy, gl_PointCoord.xy);\n"); " vec4 ps_param_gen = vec4(gl_FragCoord.xy, gl_PointCoord.xy);\n");
EmitSource(" ps_param_gen.x *= (gl_FrontFacing ? 1.0 : -1.0);\n"); EmitSource(" ps_param_gen.x *= (gl_FrontFacing ? 1.0 : -1.0);\n");
// This is insane, but r[ps_param_gen] causes nvidia to fully deopt? // This is insane, but r[ps_param_gen] causes nvidia to fully deopt?
// EmitSource(" r[state.ps_param_gen] = ps_param_gen;\n"); // EmitSource(" r[state.ps_param_gen] = ps_param_gen;\n");
EmitSource(" if (state.ps_param_gen == 0) r[0] = ps_param_gen;\n");
for (int i = 1; i < kMaxInterpolators; ++i) { // FIXME: Branches are still generated for registers that are never used!
// May need a usage map?
for (int i = 0; i < kMaxInterpolators; i++) {
EmitSource( EmitSource(
" else if (state.ps_param_gen == %d) r[%d] = ps_param_gen;\n", i, " r[%d] = mix(r[%d], ps_param_gen, state.ps_param_gen == %d);\n",
i); i, i, i);
} }
EmitSource(" }\n"); EmitSource(" }\n");
} }
// Master loop and switch for flow control. // Master loop and switch for flow control.
EmitSourceDepth("int pc = 0;\n"); EmitSourceDepth("int pc = 0;\n");
EmitSourceDepth("while (pc != 0xFFFF) {\n"); EmitSourceDepth("do {\n");
Indent(); Indent();
EmitSourceDepth("switch (pc) {\n"); EmitSourceDepth("switch (pc) {\n");
EmitSourceDepth("case 0x0:\n");
} }
std::vector<uint8_t> GlslShaderTranslator::CompleteTranslation() { std::vector<uint8_t> GlslShaderTranslator::CompleteTranslation() {
@ -360,7 +380,7 @@ std::vector<uint8_t> GlslShaderTranslator::CompleteTranslation() {
EmitSourceDepth("default: pc = 0xFFFF; break;\n"); EmitSourceDepth("default: pc = 0xFFFF; break;\n");
EmitSourceDepth("}; // switch\n"); EmitSourceDepth("}; // switch\n");
Unindent(); Unindent();
EmitSourceDepth("} // while\n"); EmitSourceDepth("} while (pc != 0xFFFF); // do while\n");
// End of process*() function. // End of process*() function.
EmitSource("}\n"); EmitSource("}\n");
@ -368,7 +388,12 @@ std::vector<uint8_t> GlslShaderTranslator::CompleteTranslation() {
return source_.ToBytes(); return source_.ToBytes();
} }
void GlslShaderTranslator::ProcessLabel(uint32_t cf_index) {} void GlslShaderTranslator::ProcessLabel(uint32_t cf_index) {
// Case 0x0 is already defined at this point.
if (cf_index != 0x0) {
EmitSourceDepth("case 0x%X:\n", cf_index);
}
}
void GlslShaderTranslator::ProcessControlFlowNopInstruction() { void GlslShaderTranslator::ProcessControlFlowNopInstruction() {
EmitSource("// cnop\n"); EmitSource("// cnop\n");
@ -377,7 +402,6 @@ void GlslShaderTranslator::ProcessControlFlowNopInstruction() {
void GlslShaderTranslator::ProcessControlFlowInstructionBegin( void GlslShaderTranslator::ProcessControlFlowInstructionBegin(
uint32_t cf_index) { uint32_t cf_index) {
cf_wrote_pc_ = false; cf_wrote_pc_ = false;
EmitSourceDepth("case 0x%X: // L%d\n", cf_index, cf_index);
Indent(); Indent();
} }
@ -393,6 +417,7 @@ void GlslShaderTranslator::ProcessExecInstructionBegin(
EmitSource("// "); EmitSource("// ");
instr.Disassemble(&source_); instr.Disassemble(&source_);
cf_exec_pred_ = false;
switch (instr.type) { switch (instr.type) {
case ParsedExecInstruction::Type::kUnconditional: case ParsedExecInstruction::Type::kUnconditional:
EmitSourceDepth("{\n"); EmitSourceDepth("{\n");
@ -404,19 +429,12 @@ void GlslShaderTranslator::ProcessExecInstructionBegin(
instr.condition ? '!' : '='); instr.condition ? '!' : '=');
break; break;
case ParsedExecInstruction::Type::kPredicated: case ParsedExecInstruction::Type::kPredicated:
cf_exec_pred_ = true;
cf_exec_pred_cond_ = instr.condition;
EmitSourceDepth("if (%cp0) {\n", instr.condition ? ' ' : '!'); EmitSourceDepth("if (%cp0) {\n", instr.condition ? ' ' : '!');
break; break;
} }
Indent(); Indent();
// Previous Vector result (used as a scratch).
EmitSourceDepth("vec4 pv;\n");
// Previous Scalar result (used for RETAIN_PREV).
EmitSourceDepth("float ps;\n");
// Temps for source register values.
EmitSourceDepth("vec4 src0;\n");
EmitSourceDepth("vec4 src1;\n");
EmitSourceDepth("vec4 src2;\n");
} }
void GlslShaderTranslator::ProcessExecInstructionEnd( void GlslShaderTranslator::ProcessExecInstructionEnd(
@ -546,7 +564,7 @@ void GlslShaderTranslator::ProcessJumpInstruction(
EmitSourceDepth("pc = 0x%X; // L%d\n", instr.target_address, EmitSourceDepth("pc = 0x%X; // L%d\n", instr.target_address,
instr.target_address); instr.target_address);
cf_wrote_pc_ = true; EmitSourceDepth("break;\n");
Unindent(); Unindent();
if (needs_fallthrough) { if (needs_fallthrough) {
@ -558,7 +576,6 @@ void GlslShaderTranslator::ProcessJumpInstruction(
} else { } else {
EmitSourceDepth("}\n"); EmitSourceDepth("}\n");
} }
EmitSourceDepth("break;\n");
} }
void GlslShaderTranslator::ProcessAllocInstruction( void GlslShaderTranslator::ProcessAllocInstruction(
@ -655,6 +672,9 @@ void GlslShaderTranslator::ProcessTextureFetchInstruction(
switch (instr.opcode) { switch (instr.opcode) {
case FetchOpcode::kTextureFetch: case FetchOpcode::kTextureFetch:
EmitSourceDepth("{\n");
Indent();
switch (instr.dimension) { switch (instr.dimension) {
case TextureDimension::k1D: case TextureDimension::k1D:
EmitSourceDepth("if (state.texture_samplers[%d] != uvec2(0)) {\n", EmitSourceDepth("if (state.texture_samplers[%d] != uvec2(0)) {\n",
@ -671,9 +691,21 @@ void GlslShaderTranslator::ProcessTextureFetchInstruction(
EmitSourceDepth("if (state.texture_samplers[%d] != uvec2(0)) {\n", EmitSourceDepth("if (state.texture_samplers[%d] != uvec2(0)) {\n",
instr.operands[1].storage_index); instr.operands[1].storage_index);
EmitSourceDepth( EmitSourceDepth(
" pv = texture(sampler2D(state.texture_samplers[%d]), " " sampler2D samp = sampler2D(state.texture_samplers[%d]);\n",
"src0.xy);\n",
instr.operands[1].storage_index); instr.operands[1].storage_index);
if (instr.attributes.offset_x == 0.f &&
instr.attributes.offset_y == 0.f) {
EmitSourceDepth(" pv = texture(samp, src0.xy);\n",
instr.operands[1].storage_index);
} else {
// FIXME: This offset is still wrong, somehow.
EmitSourceDepth(
" pv = texture(samp, src0.xy + (vec2(%.2f, %.2f) / "
"textureSize(samp, 0)));\n",
instr.attributes.offset_x, instr.attributes.offset_y);
}
EmitSourceDepth("} else {\n"); EmitSourceDepth("} else {\n");
EmitSourceDepth(" pv = vec4(src0.x, src0.y, 0.0, 1.0);\n"); EmitSourceDepth(" pv = vec4(src0.x, src0.y, 0.0, 1.0);\n");
EmitSourceDepth("}\n"); EmitSourceDepth("}\n");
@ -702,15 +734,24 @@ void GlslShaderTranslator::ProcessTextureFetchInstruction(
EmitSourceDepth("}\n"); EmitSourceDepth("}\n");
break; break;
} }
EmitSourceDepth("{\n");
EmitSourceDepth(
" float swiz_source[6] = {pv.x, pv.y, pv.z, pv.w, 0.0, 1.0};\n");
EmitSourceDepth("uint swiz = state.texture_swizzles[%d];\n", EmitSourceDepth("uint swiz = state.texture_swizzles[%d];\n",
instr.operands[1].storage_index); instr.operands[1].storage_index);
EmitSourceDepth(" pv = vec4(swiz_source[swiz & 0x7],\n"); EmitSourceDepth("vec4 orig = pv;\n");
EmitSourceDepth(" swiz_source[(swiz >> 3) & 0x7],\n"); EmitSourceDepth("ivec4 sv = ivec4(bitfieldExtract(swiz, 0, 3),\n");
EmitSourceDepth(" swiz_source[(swiz >> 6) & 0x7],\n"); EmitSourceDepth(" bitfieldExtract(swiz, 3, 3),\n");
EmitSourceDepth(" swiz_source[(swiz >> 9) & 0x7]);\n"); EmitSourceDepth(" bitfieldExtract(swiz, 6, 3),\n");
EmitSourceDepth(" bitfieldExtract(swiz, 9, 3));\n");
// This is a little uglier than using an array, but much, much faster.
EmitSourceDepth("pv = mix(pv, orig.xxxx, equal(sv, ivec4(0)));\n");
EmitSourceDepth("pv = mix(pv, orig.yyyy, equal(sv, ivec4(1)));\n");
EmitSourceDepth("pv = mix(pv, orig.zzzz, equal(sv, ivec4(2)));\n");
EmitSourceDepth("pv = mix(pv, orig.wwww, equal(sv, ivec4(3)));\n");
EmitSourceDepth("pv = mix(pv, vec4(0.0), equal(sv, ivec4(4)));\n");
EmitSourceDepth("pv = mix(pv, vec4(1.0), equal(sv, ivec4(5)));\n");
Unindent();
EmitSourceDepth("}\n"); EmitSourceDepth("}\n");
break; break;
case FetchOpcode::kGetTextureBorderColorFrac: case FetchOpcode::kGetTextureBorderColorFrac:
@ -726,8 +767,29 @@ void GlslShaderTranslator::ProcessTextureFetchInstruction(
EmitSourceDepth("pv = vec4(0.0);\n"); EmitSourceDepth("pv = vec4(0.0);\n");
break; break;
case FetchOpcode::kGetTextureWeights: case FetchOpcode::kGetTextureWeights:
switch (instr.dimension) {
case TextureDimension::k1D:
EmitSourceDepth(
"pv.x = getWeights1D(sampler1D(state.texture_samplers[%d]), "
"src0.x);\n",
instr.operands[1].storage_index);
break;
case TextureDimension::k2D:
EmitSourceDepth(
"pv.xy = getWeights2D(sampler2D(state.texture_samplers[%d]), "
"src0.xy);\n",
instr.operands[1].storage_index);
break;
case TextureDimension::k3D:
EmitSourceDepth(
"pv.xyz = getWeights3D(sampler3D(state.texture_samplers[%d]), "
"src0.xyz);\n",
instr.operands[1].storage_index);
break;
default:
EmitUnimplementedTranslationError(); EmitUnimplementedTranslationError();
EmitSourceDepth("pv = vec4(0.0);\n"); EmitSourceDepth("pv = vec4(0.0);\n");
}
break; break;
case FetchOpcode::kSetTextureLod: case FetchOpcode::kSetTextureLod:
EmitUnimplementedTranslationError(); EmitUnimplementedTranslationError();
@ -865,6 +927,14 @@ void GlslShaderTranslator::EmitStoreResult(const InstructionResult& result,
if (!result.has_any_writes()) { if (!result.has_any_writes()) {
return; return;
} }
// Special gl_pointSize discard
if (result.storage_target == InstructionStorageTarget::kPointSize &&
!result.is_standard_swizzle()) {
EmitUnimplementedTranslationError();
return;
}
bool uses_storage_index = false; bool uses_storage_index = false;
switch (result.storage_target) { switch (result.storage_target) {
case InstructionStorageTarget::kRegister: case InstructionStorageTarget::kRegister:
@ -971,7 +1041,12 @@ void GlslShaderTranslator::EmitStoreResult(const InstructionResult& result,
void GlslShaderTranslator::ProcessVectorAluInstruction( void GlslShaderTranslator::ProcessVectorAluInstruction(
const ParsedAluInstruction& instr) { const ParsedAluInstruction& instr) {
if (instr.is_predicated) { // Emit if statement only if we have a different predicate condition than our
// containing block.
bool conditional = false;
if (instr.is_predicated &&
(!cf_exec_pred_ || (cf_exec_pred_cond_ != instr.predicate_condition))) {
conditional = true;
EmitSourceDepth("if (%cp0) {\n", instr.predicate_condition ? ' ' : '!'); EmitSourceDepth("if (%cp0) {\n", instr.predicate_condition ? ' ' : '!');
Indent(); Indent();
} }
@ -1003,34 +1078,22 @@ void GlslShaderTranslator::ProcessVectorAluInstruction(
// seq dest, src0, src1 // seq dest, src0, src1
case AluVectorOpcode::kSeq: case AluVectorOpcode::kSeq:
EmitSourceDepth("pv.x = src0.x == src1.x ? 1.0 : 0.0;\n"); EmitSourceDepth("pv = vec4(equal(src0, src1));\n");
EmitSourceDepth("pv.y = src0.y == src1.y ? 1.0 : 0.0;\n");
EmitSourceDepth("pv.z = src0.z == src1.z ? 1.0 : 0.0;\n");
EmitSourceDepth("pv.w = src0.w == src1.w ? 1.0 : 0.0;\n");
break; break;
// sgt dest, src0, src1 // sgt dest, src0, src1
case AluVectorOpcode::kSgt: case AluVectorOpcode::kSgt:
EmitSourceDepth("pv.x = src0.x > src1.x ? 1.0 : 0.0;\n"); EmitSourceDepth("pv = vec4(greaterThan(src0, src1));\n");
EmitSourceDepth("pv.y = src0.y > src1.y ? 1.0 : 0.0;\n");
EmitSourceDepth("pv.z = src0.z > src1.z ? 1.0 : 0.0;\n");
EmitSourceDepth("pv.w = src0.w > src1.w ? 1.0 : 0.0;\n");
break; break;
// sge dest, src0, src1 // sge dest, src0, src1
case AluVectorOpcode::kSge: case AluVectorOpcode::kSge:
EmitSourceDepth("pv.x = src0.x >= src1.x ? 1.0 : 0.0;\n"); EmitSourceDepth("pv = vec4(greaterThanEqual(src0, src1));\n");
EmitSourceDepth("pv.y = src0.y >= src1.y ? 1.0 : 0.0;\n");
EmitSourceDepth("pv.z = src0.z >= src1.z ? 1.0 : 0.0;\n");
EmitSourceDepth("pv.w = src0.w >= src1.w ? 1.0 : 0.0;\n");
break; break;
// sne dest, src0, src1 // sne dest, src0, src1
case AluVectorOpcode::kSne: case AluVectorOpcode::kSne:
EmitSourceDepth("pv.x = src0.x != src1.x ? 1.0 : 0.0;\n"); EmitSourceDepth("pv = vec4(notEqual(src0, src1));\n");
EmitSourceDepth("pv.y = src0.y != src1.y ? 1.0 : 0.0;\n");
EmitSourceDepth("pv.z = src0.z != src1.z ? 1.0 : 0.0;\n");
EmitSourceDepth("pv.w = src0.w != src1.w ? 1.0 : 0.0;\n");
break; break;
// frc dest, src0 // frc dest, src0
@ -1055,26 +1118,21 @@ void GlslShaderTranslator::ProcessVectorAluInstruction(
// cndeq dest, src0, src1, src2 // cndeq dest, src0, src1, src2
case AluVectorOpcode::kCndEq: case AluVectorOpcode::kCndEq:
EmitSourceDepth("pv.x = src0.x == 0.0 ? src1.x : src2.x;\n"); // src0 == 0 ? src1 : src2;
EmitSourceDepth("pv.y = src0.y == 0.0 ? src1.y : src2.y;\n"); EmitSourceDepth("pv = mix(src2, src1, equal(src0, vec4(0)));\n");
EmitSourceDepth("pv.z = src0.z == 0.0 ? src1.z : src2.z;\n");
EmitSourceDepth("pv.w = src0.w == 0.0 ? src1.w : src2.w;\n");
break; break;
// cndge dest, src0, src1, src2 // cndge dest, src0, src1, src2
case AluVectorOpcode::kCndGe: case AluVectorOpcode::kCndGe:
EmitSourceDepth("pv.x = src0.x >= 0.0 ? src1.x : src2.x;\n"); // src0 >= 0 ? src1 : src2;
EmitSourceDepth("pv.y = src0.y >= 0.0 ? src1.y : src2.y;\n"); EmitSourceDepth(
EmitSourceDepth("pv.z = src0.z >= 0.0 ? src1.z : src2.z;\n"); "pv = mix(src2, src1, greaterThanEqual(src0, vec4(0)));\n");
EmitSourceDepth("pv.w = src0.w >= 0.0 ? src1.w : src2.w;\n");
break; break;
// cndgt dest, src0, src1, src2 // cndgt dest, src0, src1, src2
case AluVectorOpcode::kCndGt: case AluVectorOpcode::kCndGt:
EmitSourceDepth("pv.x = src0.x > 0.0 ? src1.x : src2.x;\n"); // src0 > 0 ? src1 : src2;
EmitSourceDepth("pv.y = src0.y > 0.0 ? src1.y : src2.y;\n"); EmitSourceDepth("pv = mix(src2, src1, greaterThan(src0, vec4(0)));\n");
EmitSourceDepth("pv.z = src0.z > 0.0 ? src1.z : src2.z;\n");
EmitSourceDepth("pv.w = src0.w > 0.0 ? src1.w : src2.w;\n");
break; break;
// dp4 dest, src0, src1 // dp4 dest, src0, src1
@ -1106,6 +1164,7 @@ void GlslShaderTranslator::ProcessVectorAluInstruction(
// setp_eq_push dest, src0, src1 // setp_eq_push dest, src0, src1
case AluVectorOpcode::kSetpEqPush: case AluVectorOpcode::kSetpEqPush:
cf_exec_pred_ = false;
EmitSourceDepth("p0 = src0.w == 0.0 && src1.w == 0.0 ? true : false;\n"); EmitSourceDepth("p0 = src0.w == 0.0 && src1.w == 0.0 ? true : false;\n");
EmitSourceDepth( EmitSourceDepth(
"pv = vec4(src0.x == 0.0 && src1.x == 0.0 ? 0.0 : src0.x + 1.0);\n"); "pv = vec4(src0.x == 0.0 && src1.x == 0.0 ? 0.0 : src0.x + 1.0);\n");
@ -1113,6 +1172,7 @@ void GlslShaderTranslator::ProcessVectorAluInstruction(
// setp_ne_push dest, src0, src1 // setp_ne_push dest, src0, src1
case AluVectorOpcode::kSetpNePush: case AluVectorOpcode::kSetpNePush:
cf_exec_pred_ = false;
EmitSourceDepth("p0 = src0.w == 0.0 && src1.w != 0.0 ? true : false;\n"); EmitSourceDepth("p0 = src0.w == 0.0 && src1.w != 0.0 ? true : false;\n");
EmitSourceDepth( EmitSourceDepth(
"pv = vec4(src0.x == 0.0 && src1.x != 0.0 ? 0.0 : src0.x + 1.0);\n"); "pv = vec4(src0.x == 0.0 && src1.x != 0.0 ? 0.0 : src0.x + 1.0);\n");
@ -1120,6 +1180,7 @@ void GlslShaderTranslator::ProcessVectorAluInstruction(
// setp_gt_push dest, src0, src1 // setp_gt_push dest, src0, src1
case AluVectorOpcode::kSetpGtPush: case AluVectorOpcode::kSetpGtPush:
cf_exec_pred_ = false;
EmitSourceDepth("p0 = src0.w == 0.0 && src1.w > 0.0 ? true : false;\n"); EmitSourceDepth("p0 = src0.w == 0.0 && src1.w > 0.0 ? true : false;\n");
EmitSourceDepth( EmitSourceDepth(
"pv = vec4(src0.x == 0.0 && src1.x > 0.0 ? 0.0 : src0.x + 1.0);\n"); "pv = vec4(src0.x == 0.0 && src1.x > 0.0 ? 0.0 : src0.x + 1.0);\n");
@ -1127,6 +1188,7 @@ void GlslShaderTranslator::ProcessVectorAluInstruction(
// setp_ge_push dest, src0, src1 // setp_ge_push dest, src0, src1
case AluVectorOpcode::kSetpGePush: case AluVectorOpcode::kSetpGePush:
cf_exec_pred_ = false;
EmitSourceDepth("p0 = src0.w == 0.0 && src1.w >= 0.0 ? true : false;\n"); EmitSourceDepth("p0 = src0.w == 0.0 && src1.w >= 0.0 ? true : false;\n");
EmitSourceDepth( EmitSourceDepth(
"pv = vec4(src0.x == 0.0 && src1.x >= 0.0 ? 0.0 : src0.x + 1.0);\n"); "pv = vec4(src0.x == 0.0 && src1.x >= 0.0 ? 0.0 : src0.x + 1.0);\n");
@ -1134,9 +1196,7 @@ void GlslShaderTranslator::ProcessVectorAluInstruction(
// kill_eq dest, src0, src1 // kill_eq dest, src0, src1
case AluVectorOpcode::kKillEq: case AluVectorOpcode::kKillEq:
EmitSourceDepth( EmitSourceDepth("if (any(equal(src0, src1))) {\n");
"if (src0.x == src1.x || src0.y == src1.y || src0.z == src1.z || "
"src0.w == src1.w) {\n");
EmitSourceDepth(" pv = vec4(1.0);\n"); EmitSourceDepth(" pv = vec4(1.0);\n");
EmitSourceDepth(" discard;\n"); EmitSourceDepth(" discard;\n");
EmitSourceDepth("} else {\n"); EmitSourceDepth("} else {\n");
@ -1146,9 +1206,7 @@ void GlslShaderTranslator::ProcessVectorAluInstruction(
// kill_gt dest, src0, src1 // kill_gt dest, src0, src1
case AluVectorOpcode::kKillGt: case AluVectorOpcode::kKillGt:
EmitSourceDepth( EmitSourceDepth("if (any(greaterThan(src0, src1))) {\n");
"if (src0.x > src1.x || src0.y > src1.y || src0.z > src1.z || "
"src0.w > src1.w) {\n");
EmitSourceDepth(" pv = vec4(1.0);\n"); EmitSourceDepth(" pv = vec4(1.0);\n");
EmitSourceDepth(" discard;\n"); EmitSourceDepth(" discard;\n");
EmitSourceDepth("} else {\n"); EmitSourceDepth("} else {\n");
@ -1158,9 +1216,7 @@ void GlslShaderTranslator::ProcessVectorAluInstruction(
// kill_ge dest, src0, src1 // kill_ge dest, src0, src1
case AluVectorOpcode::kKillGe: case AluVectorOpcode::kKillGe:
EmitSourceDepth( EmitSourceDepth("if (any(greaterThanEqual(src0, src1))) {\n");
"if (src0.x >= src1.x || src0.y >= src1.y || src0.z >= src1.z || "
"src0.w >= src1.w) {\n");
EmitSourceDepth(" pv = vec4(1.0);\n"); EmitSourceDepth(" pv = vec4(1.0);\n");
EmitSourceDepth(" discard;\n"); EmitSourceDepth(" discard;\n");
EmitSourceDepth("} else {\n"); EmitSourceDepth("} else {\n");
@ -1170,9 +1226,7 @@ void GlslShaderTranslator::ProcessVectorAluInstruction(
// kill_ne dest, src0, src1 // kill_ne dest, src0, src1
case AluVectorOpcode::kKillNe: case AluVectorOpcode::kKillNe:
EmitSourceDepth( EmitSourceDepth("if (any(notEqual(src0, src1))) {\n");
"if (src0.x != src1.x || src0.y != src1.y || src0.z != src1.z || "
"src0.w != src1.w) {\n");
EmitSourceDepth(" pv = vec4(1.0);\n"); EmitSourceDepth(" pv = vec4(1.0);\n");
EmitSourceDepth(" discard;\n"); EmitSourceDepth(" discard;\n");
EmitSourceDepth("} else {\n"); EmitSourceDepth("} else {\n");
@ -1197,7 +1251,7 @@ void GlslShaderTranslator::ProcessVectorAluInstruction(
EmitStoreVectorResult(instr.result); EmitStoreVectorResult(instr.result);
if (instr.is_predicated) { if (conditional) {
Unindent(); Unindent();
EmitSourceDepth("}\n"); EmitSourceDepth("}\n");
} }
@ -1205,7 +1259,10 @@ void GlslShaderTranslator::ProcessVectorAluInstruction(
void GlslShaderTranslator::ProcessScalarAluInstruction( void GlslShaderTranslator::ProcessScalarAluInstruction(
const ParsedAluInstruction& instr) { const ParsedAluInstruction& instr) {
if (instr.is_predicated) { bool conditional = false;
if (instr.is_predicated &&
(!cf_exec_pred_ || (cf_exec_pred_cond_ != instr.predicate_condition))) {
conditional = true;
EmitSourceDepth("if (%cp0) {\n", instr.predicate_condition ? ' ' : '!'); EmitSourceDepth("if (%cp0) {\n", instr.predicate_condition ? ' ' : '!');
Indent(); Indent();
} }
@ -1254,22 +1311,22 @@ void GlslShaderTranslator::ProcessScalarAluInstruction(
// seqs dest, src0.a // seqs dest, src0.a
case AluScalarOpcode::kSeqs: case AluScalarOpcode::kSeqs:
EmitSourceDepth("ps = src0.x == 0.0 ? 1.0 : 0.0;\n"); EmitSourceDepth("ps = float(src0.x == 0.0);\n");
break; break;
// sgts dest, src0.a // sgts dest, src0.a
case AluScalarOpcode::kSgts: case AluScalarOpcode::kSgts:
EmitSourceDepth("ps = src0.x > 0.0 ? 1.0 : 0.0;\n"); EmitSourceDepth("ps = float(src0.x > 0.0);\n");
break; break;
// sges dest, src0.a // sges dest, src0.a
case AluScalarOpcode::kSges: case AluScalarOpcode::kSges:
EmitSourceDepth("ps = src0.x >= 0.0 ? 1.0 : 0.0;\n"); EmitSourceDepth("ps = float(src0.x >= 0.0);\n");
break; break;
// snes dest, src0.a // snes dest, src0.a
case AluScalarOpcode::kSnes: case AluScalarOpcode::kSnes:
EmitSourceDepth("ps = src0.x != 0.0 ? 1.0 : 0.0;\n"); EmitSourceDepth("ps = float(src0.x != 0.0);\n");
break; break;
// frcs dest, src0.a // frcs dest, src0.a
@ -1365,6 +1422,7 @@ void GlslShaderTranslator::ProcessScalarAluInstruction(
// setp_eq dest, src0.a // setp_eq dest, src0.a
case AluScalarOpcode::kSetpEq: case AluScalarOpcode::kSetpEq:
cf_exec_pred_ = false;
EmitSourceDepth("if (src0.x == 0.0) {\n"); EmitSourceDepth("if (src0.x == 0.0) {\n");
EmitSourceDepth(" ps = 0.0;\n"); EmitSourceDepth(" ps = 0.0;\n");
EmitSourceDepth(" p0 = true;\n"); EmitSourceDepth(" p0 = true;\n");
@ -1376,6 +1434,7 @@ void GlslShaderTranslator::ProcessScalarAluInstruction(
// setp_ne dest, src0.a // setp_ne dest, src0.a
case AluScalarOpcode::kSetpNe: case AluScalarOpcode::kSetpNe:
cf_exec_pred_ = false;
EmitSourceDepth("if (src0.x != 0.0) {\n"); EmitSourceDepth("if (src0.x != 0.0) {\n");
EmitSourceDepth(" ps = 0.0;\n"); EmitSourceDepth(" ps = 0.0;\n");
EmitSourceDepth(" p0 = true;\n"); EmitSourceDepth(" p0 = true;\n");
@ -1387,6 +1446,7 @@ void GlslShaderTranslator::ProcessScalarAluInstruction(
// setp_gt dest, src0.a // setp_gt dest, src0.a
case AluScalarOpcode::kSetpGt: case AluScalarOpcode::kSetpGt:
cf_exec_pred_ = false;
EmitSourceDepth("if (src0.x > 0.0) {\n"); EmitSourceDepth("if (src0.x > 0.0) {\n");
EmitSourceDepth(" ps = 0.0;\n"); EmitSourceDepth(" ps = 0.0;\n");
EmitSourceDepth(" p0 = true;\n"); EmitSourceDepth(" p0 = true;\n");
@ -1398,6 +1458,7 @@ void GlslShaderTranslator::ProcessScalarAluInstruction(
// setp_ge dest, src0.a // setp_ge dest, src0.a
case AluScalarOpcode::kSetpGe: case AluScalarOpcode::kSetpGe:
cf_exec_pred_ = false;
EmitSourceDepth("if (src0.x >= 0.0) {\n"); EmitSourceDepth("if (src0.x >= 0.0) {\n");
EmitSourceDepth(" ps = 0.0;\n"); EmitSourceDepth(" ps = 0.0;\n");
EmitSourceDepth(" p0 = true;\n"); EmitSourceDepth(" p0 = true;\n");
@ -1409,6 +1470,7 @@ void GlslShaderTranslator::ProcessScalarAluInstruction(
// setp_inv dest, src0.a // setp_inv dest, src0.a
case AluScalarOpcode::kSetpInv: case AluScalarOpcode::kSetpInv:
cf_exec_pred_ = false;
EmitSourceDepth("if (src0.x == 1.0) {\n"); EmitSourceDepth("if (src0.x == 1.0) {\n");
EmitSourceDepth(" ps = 0.0;\n"); EmitSourceDepth(" ps = 0.0;\n");
EmitSourceDepth(" p0 = true;\n"); EmitSourceDepth(" p0 = true;\n");
@ -1420,6 +1482,7 @@ void GlslShaderTranslator::ProcessScalarAluInstruction(
// setp_pop dest, src0.a // setp_pop dest, src0.a
case AluScalarOpcode::kSetpPop: case AluScalarOpcode::kSetpPop:
cf_exec_pred_ = false;
EmitSourceDepth("if (src0.x - 1.0 <= 0.0) {\n"); EmitSourceDepth("if (src0.x - 1.0 <= 0.0) {\n");
EmitSourceDepth(" ps = 0.0;\n"); EmitSourceDepth(" ps = 0.0;\n");
EmitSourceDepth(" p0 = true;\n"); EmitSourceDepth(" p0 = true;\n");
@ -1431,12 +1494,14 @@ void GlslShaderTranslator::ProcessScalarAluInstruction(
// setp_clr dest // setp_clr dest
case AluScalarOpcode::kSetpClr: case AluScalarOpcode::kSetpClr:
cf_exec_pred_ = false;
EmitSourceDepth("ps = FLT_MAX;\n"); EmitSourceDepth("ps = FLT_MAX;\n");
EmitSourceDepth("p0 = false;\n"); EmitSourceDepth("p0 = false;\n");
break; break;
// setp_rstr dest, src0.a // setp_rstr dest, src0.a
case AluScalarOpcode::kSetpRstr: case AluScalarOpcode::kSetpRstr:
cf_exec_pred_ = false;
EmitSourceDepth("ps = src0.x;\n"); EmitSourceDepth("ps = src0.x;\n");
EmitSourceDepth("p0 = src0.x == 0.0 ? true : false;\n"); EmitSourceDepth("p0 = src0.x == 0.0 ? true : false;\n");
break; break;
@ -1530,7 +1595,7 @@ void GlslShaderTranslator::ProcessScalarAluInstruction(
EmitStoreScalarResult(instr.result); EmitStoreScalarResult(instr.result);
if (instr.is_predicated) { if (conditional) {
Unindent(); Unindent();
EmitSourceDepth("}\n"); EmitSourceDepth("}\n");
} }

View File

@ -73,6 +73,8 @@ class GlslShaderTranslator : public ShaderTranslator {
int depth_ = 0; int depth_ = 0;
char depth_prefix_[16] = {0}; char depth_prefix_[16] = {0};
bool cf_wrote_pc_ = false; bool cf_wrote_pc_ = false;
bool cf_exec_pred_ = false;
bool cf_exec_pred_cond_ = false;
void ProcessVectorAluInstruction(const ParsedAluInstruction& instr); void ProcessVectorAluInstruction(const ParsedAluInstruction& instr);
void ProcessScalarAluInstruction(const ParsedAluInstruction& instr); void ProcessScalarAluInstruction(const ParsedAluInstruction& instr);