Experimenting with vertex pipeline.

2014-12-26 23:14:15 -08:00 · 2014-12-26 23:14:15 -08:00 · 14ee211ea9
parent d2a3cba4f3
commit 14ee211ea9
9 changed files with 541 additions and 133 deletions
--- a/src/xenia/gpu/gl4/circular_buffer.h
+++ b/src/xenia/gpu/gl4/circular_buffer.h
@ -31,6 +31,8 @@ class CircularBuffer {

  bool Initialize();

+  GLuint handle() const { return buffer_; }
+
  Allocation Acquire(size_t length);
  void Commit(Allocation allocation);

--- a/src/xenia/gpu/gl4/command_processor.cc
+++ b/src/xenia/gpu/gl4/command_processor.cc
@ -151,13 +151,17 @@ bool CommandProcessor::SetupGL() {
                       GL_MAP_WRITE_BIT | GL_DYNAMIC_STORAGE_BIT);

  // Circular buffer holding scratch vertex/index data.
-  glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
-  glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
  if (!scratch_buffer_.Initialize()) {
    PLOGE("Unable to initialize scratch buffer");
    return false;
  }

+  GLuint vertex_array;
+  glGenVertexArrays(1, &vertex_array);
+  glBindVertexArray(vertex_array);
+  glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
+  glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
+
  return true;
 }

@ -251,8 +255,7 @@ void CommandProcessor::PrepareForWait() {
  // TODO(benvanik): fences and fancy stuff. We should figure out a way to
  // make interrupt callbacks from the GPU so that we don't have to do a full
  // synchronize here.
-  // glFlush();
-  glFinish();
+  glFlush();

  if (FLAGS_thread_safe_gl) {
    context_->ClearCurrent();
@ -1162,10 +1165,11 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) {
    return false;
  }

-  // if (!PopulateShaders(draw_command)) {
-  //  XELOGE("Unable to prepare draw shaders");
-  //  return false;
-  //}
+  if (!UpdateShaders(draw_command)) {
+    PLOGE("Unable to prepare draw shaders");
+    return false;
+  }
+
  // if (!PopulateSamplers(draw_command)) {
  //  XELOGE("Unable to prepare draw samplers");
  //  return false;
@ -1176,25 +1180,77 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) {
    return false;
  }
  if (!PopulateVertexBuffers(draw_command)) {
-    XELOGE("Unable to setup vertex buffers");
+    PLOGE("Unable to setup vertex buffers");
    return false;
  }

+  GLenum prim_type = 0;
+  switch (cmd.prim_type) {
+    case PrimitiveType::kPointList:
+      prim_type = GL_POINTS;
+      /*if (vs->DemandGeometryShader(
+        D3D11VertexShaderResource::POINT_SPRITE_SHADER, &geometry_shader)) {
+        return 1;
+      }*/
+      break;
+    case PrimitiveType::kLineList:
+      prim_type = GL_LINES;
+      break;
+    case PrimitiveType::kLineStrip:
+      prim_type = GL_LINE_STRIP;
+      break;
+    case PrimitiveType::kLineLoop:
+      prim_type = GL_LINE_LOOP;
+      break;
+    case PrimitiveType::kTriangleList:
+      prim_type = GL_TRIANGLES;
+      break;
+    case PrimitiveType::kTriangleStrip:
+      prim_type = GL_TRIANGLE_STRIP;
+      break;
+    case PrimitiveType::kTriangleFan:
+      prim_type = GL_TRIANGLE_FAN;
+      break;
+    case PrimitiveType::kRectangleList:
+      prim_type = GL_TRIANGLE_STRIP;
+      /*if (vs->DemandGeometryShader(
+        D3D11VertexShaderResource::RECT_LIST_SHADER, &geometry_shader)) {
+        return 1;
+      }*/
+      break;
+    case PrimitiveType::kQuadList:
+      prim_type = GL_LINES_ADJACENCY;
+      /*if
+      (vs->DemandGeometryShader(D3D11VertexShaderResource::QUAD_LIST_SHADER,
+                                   &geometry_shader)) {
+        return 1;
+      }*/
+      break;
+    default:
+    case PrimitiveType::kUnknown0x07:
+      prim_type = GL_POINTS;
+      XELOGE("D3D11: unsupported primitive type %d", cmd.prim_type);
+      break;
+  }
+
+  // HACK HACK HACK
+  glDisable(GL_DEPTH_TEST);
+
  if (cmd.index_buffer.address) {
    // Indexed draw.
    // PopulateIndexBuffer has our element array setup.
-    //size_t element_size = cmd.index_buffer.format == IndexFormat::kInt32
-    //                          ? sizeof(uint32_t)
-    //                          : sizeof(uint16_t);
-    //glDrawElementsBaseVertex(
-    //    prim_type, cmd.index_count,
-    //    cmd.index_buffer.format == IndexFormat::kInt32 ? GL_UNSIGNED_INT
-    //                                                   : GL_UNSIGNED_SHORT,
-    //    reinterpret_cast<void*>(cmd.start_index * element_size),
-    //    cmd.base_vertex);
+    size_t element_size = cmd.index_buffer.format == IndexFormat::kInt32
+                              ? sizeof(uint32_t)
+                              : sizeof(uint16_t);
+    glDrawElementsBaseVertex(
+        prim_type, cmd.index_count,
+        cmd.index_buffer.format == IndexFormat::kInt32 ? GL_UNSIGNED_INT
+                                                       : GL_UNSIGNED_SHORT,
+        reinterpret_cast<void*>(cmd.start_index * element_size),
+        cmd.base_vertex);
  } else {
    // Auto draw.
-    //glDrawArrays(prim_type, cmd.start_index, cmd.index_count);
+    glDrawArrays(prim_type, cmd.start_index, cmd.index_count);
  }

  return true;
@ -1215,10 +1271,10 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
    };
  };
  struct UniformDataBlock {
-    float4 window_offset;    // tx,ty,?,?
-    float4 window_scissor;   // x0,y0,x1,y1
-    float4 viewport_offset;  // tx,ty,tz,?
-    float4 viewport_scale;   // sx,sy,sz,?
+    float4 window_offset;      // tx,ty,rt_w,rt_h
+    float4 window_scissor;     // x0,y0,x1,y1
+    float4 viewport_offset;    // tx,ty,tz,?
+    float4 viewport_scale;     // sx,sy,sz,?
    // TODO(benvanik): vertex format xyzw?

    float4 alpha_test;  // alpha test enable, func, ref, ?
@ -1236,11 +1292,10 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
  static_assert(sizeof(UniformDataBlock) <= 16 * 1024,
                "Need <=16k uniform data");

-  auto buffer_ptr = reinterpret_cast<UniformDataBlock*>(
-      glMapNamedBufferRange(uniform_data_buffer_, 0, 16 * 1024,
-                            GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT));
+  auto allocation = scratch_buffer_.Acquire(16 * 1024);
+  auto buffer_ptr = reinterpret_cast<UniformDataBlock*>(allocation.host_ptr);
  if (!buffer_ptr) {
-    PLOGE("Unable to map uniform data buffer");
+    PLOGE("Unable to allocate uniform data buffer");
    return false;
  }

@ -1257,18 +1312,9 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
  buffer_ptr->window_scissor.z = float(window_scissor_br & 0x7FFF);
  buffer_ptr->window_scissor.w = float((window_scissor_br >> 16) & 0x7FFF);

-  // Viewport scaling. Only enabled if the flags are all set.
-  buffer_ptr->viewport_scale.x =
-      regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32;  // 640
-  buffer_ptr->viewport_offset.x =
-      regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32;  // 640
-  buffer_ptr->viewport_scale.y =
-      regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32;  // -360
-  buffer_ptr->viewport_offset.y =
-      regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;  // 360
-  buffer_ptr->viewport_scale.z = regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32;  // 1
-  buffer_ptr->viewport_offset.z =
-      regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32;  // 0
+  // HACK: no clue where to get these values.
+  buffer_ptr->window_offset.z = 1280;
+  buffer_ptr->window_offset.w = 720;

  // Whether each of the viewport settings is enabled.
  // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
@ -1282,6 +1328,23 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
  assert_true(vport_xscale_enable == vport_yscale_enable ==
              vport_zscale_enable == vport_xoffset_enable ==
              vport_yoffset_enable == vport_zoffset_enable);
+
+  // Viewport scaling. Only enabled if the flags are all set.
+  buffer_ptr->viewport_scale.x =
+      vport_xscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 : 1;  // 640
+  buffer_ptr->viewport_offset.x = vport_xoffset_enable
+                                      ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32
+                                      : 0;  // 640
+  buffer_ptr->viewport_scale.y = vport_yscale_enable
+                                     ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32
+                                     : 1;  // -360
+  buffer_ptr->viewport_offset.y = vport_yoffset_enable
+                                      ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
+                                      : 0;  // 360
+  buffer_ptr->viewport_scale.z =
+      vport_zscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 : 1;  // 1
+  buffer_ptr->viewport_offset.z =
+      vport_zoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 : 0;  // 0
  // VTX_XY_FMT = true: the incoming X, Y have already been multiplied by 1/W0.
  //            = false: multiply the X, Y coordinates by 1/W0.
  bool vtx_xy_fmt = (vte_control >> 8) & 0x1;
@ -1504,7 +1567,9 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
                stencil_op_map[(depth_control & 0x0001C000) >> 14]);
  }

-  glUnmapNamedBuffer(uniform_data_buffer_);
+  // Stash - program setup will bind this to uniforms.
+  draw_command->state_data_gpu_ptr = allocation.gpu_ptr;
+  scratch_buffer_.Commit(std::move(allocation));

  return true;
 }
@ -1590,11 +1655,80 @@ bool CommandProcessor::UpdateRenderTargets(DrawCommand* draw_command) {

  // TEST TEST TEST TEST TEST TEST TEST TEST TEST TEST
  // Pretend we are drawing.
-  glEnable(GL_SCISSOR_TEST);
-  glScissor(100, 100, 100, 100);
-  float red[] = {rand() / (float)RAND_MAX, 0, 0, 1.0f};
-  glClearNamedFramebufferfv(active_framebuffer_->framebuffer, GL_COLOR, 0, red);
-  glDisable(GL_SCISSOR_TEST);
+  // glEnable(GL_SCISSOR_TEST);
+  // glScissor(100, 100, 100, 100);
+  // float red[] = {rand() / (float)RAND_MAX, 0, 0, 1.0f};
+  // glClearNamedFramebufferfv(active_framebuffer_->framebuffer, GL_COLOR, 0,
+  // red);
+  // glDisable(GL_SCISSOR_TEST);
+
+  return true;
+}
+
+// Prepares the active vertex/pixel shaders (compiling them on first use),
+// assembles them into a program pipeline, points each stage's "state" uniform
+// at the state data stashed in draw_command->state_data_gpu_ptr, and binds the
+// pipeline.
+//
+// Returns false if either shader is missing, fails to prepare, or was
+// previously found to be invalid; the caller is expected to skip the draw.
+bool CommandProcessor::UpdateShaders(DrawCommand* draw_command) {
+  SCOPE_profile_cpu_f("gpu");
+  auto& regs = *register_file_;
+  auto& cmd = *draw_command;
+
+  // Both stages are dereferenced below; fail the draw instead of crashing
+  // when no shader has been made active yet.
+  if (!active_vertex_shader_ || !active_pixel_shader_) {
+    XELOGE("Missing active vertex or pixel shader");
+    return false;
+  }
+
+  xe_gpu_program_cntl_t program_cntl;
+  program_cntl.dword_0 = regs[XE_GPU_REG_SQ_PROGRAM_CNTL].u32;
+
+  // Shaders are prepared lazily; a shader that already failed preparation
+  // stays invalid and is rejected here without retrying.
+  if (!active_vertex_shader_->has_prepared()) {
+    if (!active_vertex_shader_->PrepareVertexShader(program_cntl)) {
+      XELOGE("Unable to prepare vertex shader");
+      return false;
+    }
+  } else if (!active_vertex_shader_->is_valid()) {
+    XELOGE("Vertex shader invalid");
+    return false;
+  }
+
+  if (!active_pixel_shader_->has_prepared()) {
+    if (!active_pixel_shader_->PreparePixelShader(program_cntl,
+                                                  active_vertex_shader_)) {
+      XELOGE("Unable to prepare pixel shader");
+      return false;
+    }
+  } else if (!active_pixel_shader_->is_valid()) {
+    XELOGE("Pixel shader invalid");
+    return false;
+  }
+
+  GLuint vertex_program = active_vertex_shader_->program();
+  GLuint geometry_program = 0;  // No geometry stage yet.
+  GLuint fragment_program = active_pixel_shader_->program();
+
+  // NOTE(review): a new pipeline object is created on every draw and never
+  // deleted, so pipeline names leak. Deleting it right after binding is not
+  // an option (GL reverts the binding to zero when the bound pipeline is
+  // deleted); this needs a cache keyed by the stage programs.
+  GLuint pipeline;
+  glCreateProgramPipelines(1, &pipeline);
+  glUseProgramStages(pipeline, GL_VERTEX_SHADER_BIT, vertex_program);
+  glUseProgramStages(pipeline, GL_GEOMETRY_SHADER_BIT, geometry_program);
+  glUseProgramStages(pipeline, GL_FRAGMENT_SHADER_BIT, fragment_program);
+
+  // HACK: layout(location=0) on a bindless uniform crashes nvidia driver.
+  // Instead the location is looked up by name; -1 means the stage does not
+  // use the uniform.
+  GLint vertex_state_loc = glGetUniformLocation(vertex_program, "state");
+  assert_true(vertex_state_loc == -1 || vertex_state_loc == 0);
+  GLint geometry_state_loc =
+      geometry_program ? glGetUniformLocation(geometry_program, "state") : -1;
+  assert_true(geometry_state_loc == -1 || geometry_state_loc == 0);
+  GLint fragment_state_loc = glGetUniformLocation(fragment_program, "state");
+  assert_true(fragment_state_loc == -1 || fragment_state_loc == 0);
+
+  // TODO(benvanik): do we need to do this for all stages if the locations
+  // match?
+  if (vertex_state_loc != -1) {
+    glProgramUniformHandleui64ARB(vertex_program, vertex_state_loc,
+                                  cmd.state_data_gpu_ptr);
+  }
+  if (geometry_program && geometry_state_loc != -1) {
+    glProgramUniformHandleui64ARB(geometry_program, geometry_state_loc,
+                                  cmd.state_data_gpu_ptr);
+  }
+  if (fragment_state_loc != -1) {
+    glProgramUniformHandleui64ARB(fragment_program, fragment_state_loc,
+                                  cmd.state_data_gpu_ptr);
+  }
+
+  glBindProgramPipeline(pipeline);
+  // glDeleteProgramPipelines(1, &pipeline);

  return true;
 }
@ -1641,15 +1775,9 @@ bool CommandProcessor::PopulateVertexBuffers(DrawCommand* draw_command) {
  SCOPE_profile_cpu_f("gpu");
  auto& regs = *register_file_;
  auto& cmd = *draw_command;
+  assert_not_null(active_vertex_shader_);

-  if (!cmd.vertex_shader) {
-    // No vertex shader, no-op.
-    return true;
-  }
-
-  const auto& buffer_inputs = cmd.vertex_shader->buffer_inputs();
-
-  // glBindVertexArray(vertex_array);
+  const auto& buffer_inputs = active_vertex_shader_->buffer_inputs();

  for (size_t n = 0; n < buffer_inputs.count; n++) {
    const auto& desc = buffer_inputs.descs[n];
@ -1685,9 +1813,100 @@ bool CommandProcessor::PopulateVertexBuffers(DrawCommand* draw_command) {
        reinterpret_cast<const uint32_t*>(membase_ + (fetch->address << 2)),
        fetch->size);

-    /*glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV,
-                           desc.input_index,
-                           allocation.gpu_ptr, allocation.length);*/
+    uint32_t el_index = 0;
+    for (uint32_t i = 0; i < desc.element_count; ++i) {
+      const auto& el = desc.elements[i];
+      GLuint comp_count;
+      GLuint comp_size;
+      GLenum comp_type;
+      switch (el.format) {
+        case VertexFormat::k_8_8_8_8:
+          comp_count = 4;
+          comp_size = 1;
+          comp_type = el.is_signed ? GL_BYTE : GL_UNSIGNED_BYTE;
+          break;
+        case VertexFormat::k_2_10_10_10:
+          comp_count = 4;
+          comp_size = 4;
+          comp_type = el.is_signed ? GL_INT_2_10_10_10_REV
+                                   : GL_UNSIGNED_INT_2_10_10_10_REV;
+          break;
+        case VertexFormat::k_10_11_11:
+          comp_count = 3;
+          comp_size = 4;
+          assert_false(el.is_signed);
+          comp_type = GL_UNSIGNED_INT_10F_11F_11F_REV;
+          break;
+        /*case VertexFormat::k_11_11_10:
+          break;*/
+        case VertexFormat::k_16_16:
+          comp_count = 2;
+          comp_size = 2;
+          comp_type = el.is_signed ? GL_SHORT : GL_UNSIGNED_SHORT;
+          break;
+        case VertexFormat::k_16_16_FLOAT:
+          comp_count = 2;
+          comp_size = 2;
+          comp_type = GL_HALF_FLOAT;
+          break;
+        case VertexFormat::k_16_16_16_16:
+          comp_count = 4;
+          comp_size = 2;
+          comp_type = el.is_signed ? GL_SHORT : GL_UNSIGNED_SHORT;
+          break;
+        case VertexFormat::k_16_16_16_16_FLOAT:
+          comp_count = 4;
+          comp_size = 2;
+          comp_type = GL_HALF_FLOAT;
+          break;
+        case VertexFormat::k_32:
+          comp_count = 1;
+          comp_size = 4;
+          comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT;
+          break;
+        case VertexFormat::k_32_32:
+          comp_count = 2;
+          comp_size = 4;
+          comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT;
+          break;
+        case VertexFormat::k_32_32_32_32:
+          comp_count = 4;
+          comp_size = 4;
+          comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT;
+          break;
+        case VertexFormat::k_32_FLOAT:
+          comp_count = 1;
+          comp_size = 4;
+          comp_type = GL_FLOAT;
+          break;
+        case VertexFormat::k_32_32_FLOAT:
+          comp_count = 2;
+          comp_size = 4;
+          comp_type = GL_FLOAT;
+          break;
+        case VertexFormat::k_32_32_32_FLOAT:
+          comp_count = 3;
+          comp_size = 4;
+          comp_type = GL_FLOAT;
+          break;
+        case VertexFormat::k_32_32_32_32_FLOAT:
+          comp_count = 4;
+          comp_size = 4;
+          comp_type = GL_FLOAT;
+          break;
+        default:
+          assert_unhandled_case(el.format);
+          break;
+      }
+      size_t offset = el.offset_words * sizeof(uint32_t);
+      glEnableVertexAttribArray(el_index);
+      glVertexAttribFormatNV(el_index, comp_count, comp_type, el.is_normalized,
+                             desc.stride_words * sizeof(uint32_t));
+      glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, el_index,
+                             allocation.gpu_ptr + offset,
+                             allocation.length - offset);
+      ++el_index;
+    }

    // Flush buffer before we draw.
    scratch_buffer_.Commit(std::move(allocation));
@ -1782,7 +2001,7 @@ bool CommandProcessor::IssueCopy(DrawCommand* draw_command) {
  GLenum read_format;
  GLenum read_type;
  switch (copy_dest_format) {
-    case ColorFormat::kColor_8_8_8_8:
+    case ColorFormat::k_8_8_8_8:
      read_format = copy_dest_swap ? GL_BGRA : GL_RGBA;
      read_type = GL_UNSIGNED_BYTE;
      break;
@ -1832,10 +2051,10 @@ bool CommandProcessor::IssueCopy(DrawCommand* draw_command) {
        // glBindBuffer(GL_READ_FRAMEBUFFER, framebuffer)
        glNamedFramebufferReadBuffer(source_framebuffer->framebuffer,
                                     GL_COLOR_ATTACHMENT0 + copy_src_select);
-        glReadPixels(x, y, w, h, read_format, read_type, ptr);
+        //glReadPixels(x, y, w, h, read_format, read_type, ptr);
      } else {
        // Source from the bound depth/stencil target.
-        glReadPixels(x, y, w, h, GL_DEPTH_STENCIL, read_type, ptr);
+        //glReadPixels(x, y, w, h, GL_DEPTH_STENCIL, read_type, ptr);
      }
      break;
    case CopyCommand::kRaw:
@ -1876,7 +2095,7 @@ bool CommandProcessor::IssueCopy(DrawCommand* draw_command) {
    glClearNamedFramebufferfi(source_framebuffer->framebuffer, GL_DEPTH_STENCIL,
                              depth.float_value, stencil);
  }
-
+  
  return true;
 }

@ -1890,8 +2109,8 @@ GLuint CommandProcessor::GetColorRenderTarget(uint32_t pitch,
  uint32_t height = 2560;

  // NOTE: we strip gamma formats down to normal ones.
-  if (format == ColorRenderTargetFormat::k8888Gamma) {
-    format = ColorRenderTargetFormat::k8888;
+  if (format == ColorRenderTargetFormat::k_8_8_8_8_GAMMA) {
+    format = ColorRenderTargetFormat::k_8_8_8_8;
  }

  for (auto& it = cached_color_render_targets_.begin();
@ -1910,8 +2129,8 @@ GLuint CommandProcessor::GetColorRenderTarget(uint32_t pitch,

  GLenum internal_format;
  switch (format) {
-    case ColorRenderTargetFormat::k8888:
-    case ColorRenderTargetFormat::k8888Gamma:
+    case ColorRenderTargetFormat::k_8_8_8_8:
+    case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
      internal_format = GL_RGBA8;
      break;
    default:
--- a/src/xenia/gpu/gl4/command_processor.h
+++ b/src/xenia/gpu/gl4/command_processor.h
@ -47,9 +47,6 @@ struct DrawCommand {
  uint32_t index_count;
  uint32_t base_vertex;

-  GL4Shader* vertex_shader;
-  GL4Shader* pixel_shader;
-
  // Index buffer, if present.
  // If index_count > 0 but buffer is nullptr then auto draw.
  struct {
@ -69,6 +66,8 @@ struct DrawCommand {
  size_t vertex_shader_sampler_count;
  SamplerInput pixel_shader_samplers[32];
  size_t pixel_shader_sampler_count;
+
+  GLuint64 state_data_gpu_ptr;
 };

 class CommandProcessor {
@ -188,6 +187,7 @@ class CommandProcessor {
  bool IssueDraw(DrawCommand* draw_command);
  bool UpdateState(DrawCommand* draw_command);
  bool UpdateRenderTargets(DrawCommand* draw_command);
+  bool UpdateShaders(DrawCommand* draw_command);
  bool PopulateIndexBuffer(DrawCommand* draw_command);
  bool PopulateVertexBuffers(DrawCommand* draw_command);
  bool IssueCopy(DrawCommand* draw_command);
--- a/src/xenia/gpu/gl4/gl4_graphics_system.cc
+++ b/src/xenia/gpu/gl4/gl4_graphics_system.cc
@ -11,6 +11,7 @@

 #include <poly/threading.h>
 #include <xenia/cpu/processor.h>
+#include <xenia/gpu/gl4/gl4_gpu-private.h>
 #include <xenia/gpu/gpu-private.h>

 namespace xe {
@ -42,11 +43,19 @@ X_STATUS GL4GraphicsSystem::Setup() {
    control_ = std::make_unique<WGLControl>(loop);
    emulator_->main_window()->AddChild(control_.get());

+    if (FLAGS_thread_safe_gl) {
+      control_->context()->MakeCurrent();
+    }
+
    // Setup the GL context the command processor will do all its drawing in.
    // It's shared with the control context so that we can resolve framebuffers
    // from it.
    processor_context = control_->context()->CreateShared();

+    if (FLAGS_thread_safe_gl) {
+      control_->context()->ClearCurrent();
+    }
+
    control_ready_fence.Signal();
  });
  control_ready_fence.Wait();
--- a/src/xenia/gpu/gl4/gl4_shader.cc
+++ b/src/xenia/gpu/gl4/gl4_shader.cc
@ -15,7 +15,163 @@ namespace xe {
 namespace gpu {
 namespace gl4 {

-bool GL4Shader::TranslateImpl() { return true; }
+extern "C" GLEWContext* glewGetContext();
+
+// Forwards the shader type, hash and ucode words to the Shader base and
+// starts with no GL program (program_ == 0); the program is created later by
+// PrepareVertexShader/PreparePixelShader via CompileProgram.
+GL4Shader::GL4Shader(ShaderType shader_type, uint64_t data_hash,
+                     const uint32_t* dword_ptr, uint32_t dword_count)
+    : Shader(shader_type, data_hash, dword_ptr, dword_count), program_(0) {}
+
+// Releases the GL program, if one was created (glDeleteProgram ignores 0).
+// NOTE(review): presumably requires the owning GL context to be current on
+// the destroying thread - confirm against callers.
+GL4Shader::~GL4Shader() { glDeleteProgram(program_); }
+
+// GLSL preamble prepended to every generated vertex and pixel shader source:
+// version/extension directives, default layouts, the StateData/VertexData
+// struct declarations, and the bindless "state" pointer uniform that
+// CommandProcessor::UpdateShaders looks up by name.
+// NOTE(review): StateData presumably has to match the layout of the
+// UniformDataBlock written by CommandProcessor::UpdateState - confirm the
+// fields after alpha_test against that struct.
+const std::string header =
+    "#version 450\n"
+    "#extension all : warn\n"
+    "#extension GL_ARB_bindless_texture : require\n"
+    "#extension GL_ARB_explicit_uniform_location : require\n"
+    "#extension GL_ARB_shading_language_420pack : require\n"
+    "#extension GL_ARB_shader_storage_buffer_object : require\n"
+    "#extension GL_NV_shader_buffer_load : require\n"
+    "precision highp float;\n"
+    "precision highp int;\n"
+    "layout(std140, column_major) uniform;\n"
+    "layout(std430, column_major) buffer;\n"
+    "struct StateData {\n"
+    "  vec4 window_offset;\n"
+    "  vec4 window_scissor;\n"
+    "  vec4 viewport_offset;\n"
+    "  vec4 viewport_scale;\n"
+    "  vec4 alpha_test;\n"
+    "  vec4 float_consts[512];\n"
+    "  uint fetch_consts[32 * 6];\n"
+    "  int bool_consts[8];\n"
+    "  int loop_consts[32];\n"
+    "};\n"
+    "struct VertexData {\n"
+    "  vec4 o[16];\n"
+    "};\n"
+    "\n"
+    "uniform StateData* state;\n";
+
+// Builds and compiles the GLSL vertex program for this shader.
+//
+// The source is currently a fixed placeholder (position transformed by
+// float_consts[0..3], attribute 1 passed through as output 0); program_cntl
+// is not consulted yet. Preparation is attempted only once: subsequent calls
+// return the validity recorded by the first attempt.
+//
+// Returns true if the program compiled and linked.
+bool GL4Shader::PrepareVertexShader(
+    const xenos::xe_gpu_program_cntl_t& program_cntl) {
+  if (has_prepared_) {
+    return is_valid_;
+  }
+  has_prepared_ = true;
+
+  std::string apply_viewport =
+      "vec4 applyViewport(vec4 pos) {\n"
+      // TODO(benvanik): piecewise viewport_enable -> offset/scale logic.
+      "  if (false) {\n"
+      "  } else {\n"
+      /*"    pos.xy = pos.xy / vec2(state->window_offset.z / 2.0, "
+      "-state->window_offset.w / 2.0) + vec2(-1.0, 1.0);\n"
+      "    pos.zw = vec2(0.0, 1.0);\n"*/
+      "    pos.xy = pos.xy / vec2(1280.0 / 2.0, "
+      "-720.0 / 2.0) + vec2(-1.0, 1.0);\n"
+      "    //pos.zw = vec2(0.0, 1.0);\n"
+      "  }\n"
+      "  pos.x = pos.x * state->viewport_scale.x + \n"
+      "      state->viewport_offset.x;\n"
+      "  pos.y = pos.y * state->viewport_scale.y + \n"
+      "      state->viewport_offset.y;\n"
+      "  pos.z = pos.z * state->viewport_scale.z + \n"
+      "      state->viewport_offset.z;\n"
+      "  pos.xy += state->window_offset.xy;\n"
+      "  return pos;\n"
+      "}\n";
+  std::string source =
+      header + apply_viewport +
+      "out gl_PerVertex {\n"
+      "  vec4 gl_Position;\n"
+      "  float gl_PointSize;\n"
+      "  float gl_ClipDistance[];\n"
+      "};\n"
+      "layout(location = 0) in vec3 iF0;\n"
+      "layout(location = 1) in vec4 iF1;\n"
+      "layout(location = 0) out VertexData vtx;\n"
+      "void main() {\n"
+      //"  vec4 oPos = vec4(iF0.xy, 0.0, 1.0);\n"
+      "  vec4 oPos = iF0.xxxx * state->float_consts[0];\n"
+      "  oPos = (iF0.yyyy * state->float_consts[1]) + oPos;\n"
+      "  oPos = (iF0.zzzz * state->float_consts[2]) + oPos;\n"
+      "  oPos = (vec4(1.0, 1.0, 1.0, 1.0) * state->float_consts[3]) + oPos;\n"
+      //"  gl_PointSize = 1.0;\n"
+      // Zero every interpolator so unused outputs are well-defined; the loop
+      // must index with i (it previously rewrote o[0] sixteen times).
+      "  for (int i = 0; i < vtx.o.length(); ++i) {\n"
+      "    vtx.o[i] = vec4(0.0, 0.0, 0.0, 0.0);\n"
+      "  }\n"
+      "  vtx.o[0] = iF1;\n"
+      "  gl_Position = applyViewport(oPos);\n"
+      //"  gl_Position = oPos;\n"
+      "}\n";
+
+  if (!CompileProgram(source)) {
+    return false;
+  }
+
+  is_valid_ = true;
+  return true;
+}
+
+// Builds and compiles the GLSL fragment program for this shader.
+//
+// The source is a fixed placeholder that writes red to every color output
+// and then overrides output 0 with interpolator 0 from the vertex stage.
+// Neither program_cntl nor vertex_shader is consulted yet. Only the first
+// call attempts compilation; later calls return the recorded validity.
+bool GL4Shader::PreparePixelShader(
+    const xenos::xe_gpu_program_cntl_t& program_cntl,
+    GL4Shader* vertex_shader) {
+  if (has_prepared_) {
+    // Already attempted once; report how that attempt went.
+    return is_valid_;
+  }
+  has_prepared_ = true;
+
+  std::string fragment_source = header;
+  fragment_source +=
+      "layout(location = 0) in VertexData vtx;\n"
+      "layout(location = 0) out vec4 oC[4];\n"
+      "void main() {\n"
+      "  for (int i = 0; i < oC.length(); ++i) {\n"
+      "    oC[i] = vec4(1.0, 0.0, 0.0, 1.0);\n"
+      "  }\n"
+      "  oC[0] = vtx.o[0];\n"
+      //"  gl_FragDepth = 0.0;\n"
+      "}\n";
+
+  const bool compiled = CompileProgram(std::move(fragment_source));
+  if (compiled) {
+    is_valid_ = true;
+  }
+  return compiled;
+}
+
+// Compiles and links `source` as a separable single-stage program (vertex or
+// fragment, chosen by shader_type_) and stores the result in program_.
+//
+// The source text is kept in translated_disassembly_; on link failure the
+// driver's info log is logged and kept in error_log_.
+//
+// Returns true if the program was created and linked successfully. Must only
+// be called once per shader (program_ has to be 0 on entry).
+bool GL4Shader::CompileProgram(std::string source) {
+  assert_zero(program_);
+
+  // Keep the source alive in the member so the pointer handed to GL stays
+  // valid for the duration of the call.
+  translated_disassembly_ = std::move(source);
+  const char* source_str = translated_disassembly_.c_str();
+
+  program_ = glCreateShaderProgramv(shader_type_ == ShaderType::kVertex
+                                        ? GL_VERTEX_SHADER
+                                        : GL_FRAGMENT_SHADER,
+                                    1, &source_str);
+  if (!program_) {
+    PLOGE("Unable to create shader program");
+    return false;
+  }
+
+  GLint link_status = 0;
+  glGetProgramiv(program_, GL_LINK_STATUS, &link_status);
+  if (!link_status) {
+    // log_length includes the null character and is 0 when the driver has
+    // no log at all, so it must not be blindly decremented.
+    GLint log_length = 0;
+    glGetProgramiv(program_, GL_INFO_LOG_LENGTH, &log_length);
+    std::string info_log;
+    if (log_length > 1) {
+      info_log.resize(log_length);
+      // written excludes the null terminator.
+      GLsizei written = 0;
+      glGetProgramInfoLog(program_, log_length, &written, &info_log[0]);
+      info_log.resize(written > 0 ? written : 0);
+    }
+    PLOGE("Unable to link program: %s", info_log.c_str());
+    error_log_ = std::move(info_log);
+    return false;
+  }
+
+  return true;
+}

 }  // namespace gl4
 }  // namespace gpu
--- a/src/xenia/gpu/gl4/gl4_shader.h
+++ b/src/xenia/gpu/gl4/gl4_shader.h
@ -11,6 +11,7 @@
 #define XENIA_GPU_GL4_GL4_SHADER_H_

 #include <xenia/common.h>
+#include <xenia/gpu/gl4/gl_context.h>
 #include <xenia/gpu/shader.h>

 namespace xe {
@ -19,10 +20,20 @@ namespace gl4 {

 class GL4Shader : public Shader {
 public:
-  using Shader::Shader;
+  GL4Shader(ShaderType shader_type, uint64_t data_hash,
+            const uint32_t* dword_ptr, uint32_t dword_count);
+  ~GL4Shader() override;
+
+  GLuint program() const { return program_; }
+
+  bool PrepareVertexShader(const xenos::xe_gpu_program_cntl_t& program_cntl);
+  bool PreparePixelShader(const xenos::xe_gpu_program_cntl_t& program_cntl,
+                          GL4Shader* vertex_shader);

 protected:
-  bool TranslateImpl() override;
+  bool CompileProgram(std::string source);
+
+  GLuint program_;
 };

 }  // namespace gl4
--- a/src/xenia/gpu/shader.cc
+++ b/src/xenia/gpu/shader.cc
@ -16,10 +16,14 @@ namespace xe {
 namespace gpu {

 using namespace xe::gpu::ucode;
+using namespace xe::gpu::xenos;

 Shader::Shader(ShaderType shader_type, uint64_t data_hash,
               const uint32_t* dword_ptr, uint32_t dword_count)
-    : shader_type_(shader_type), data_hash_(data_hash), is_valid_(false) {
+    : shader_type_(shader_type),
+      data_hash_(data_hash),
+      has_prepared_(false),
+      is_valid_(false) {
  data_.resize(dword_count);
  poly::copy_and_swap(data_.data(), dword_ptr, dword_count);
  std::memset(&alloc_counts_, 0, sizeof(alloc_counts_));
@ -35,18 +39,7 @@ Shader::Shader(ShaderType shader_type, uint64_t data_hash,
  GatherIO();
 }

-bool Shader::Translate() {
-  assert_false(is_valid_);
-
-  // TODO(benvanik): disk cache/etc - lookup hash and load if found.
-  // TODO(benvanik): dump to disk.
-
-  // Attempt implementation-specific translation.
-  // This may take awhile, and probably will fail.
-  // TODO(benvanik): parallelize? (allow two translations at once, etc).
-  is_valid_ = TranslateImpl();
-  return is_valid_;
-}
+Shader::~Shader() = default;

 void Shader::GatherIO() {
  // Process all execution blocks.
@ -203,44 +196,43 @@ void Shader::GatherVertexFetch(const instr_fetch_vtx_t* vtx) {
  }

  el->vtx_fetch = *vtx;
-  el->format = vtx->format;
+  el->format = static_cast<VertexFormat>(vtx->format);
  el->is_normalized = vtx->num_format_all == 0;
  el->is_signed = vtx->format_comp_all == 1;
  el->offset_words = vtx->offset;
  el->size_words = 0;
  switch (el->format) {
-    case FMT_8_8_8_8:
-    case FMT_2_10_10_10:
-    case FMT_10_11_11:
-    case FMT_11_11_10:
+    case VertexFormat::k_8_8_8_8:
+    case VertexFormat::k_2_10_10_10:
+    case VertexFormat::k_10_11_11:
+    case VertexFormat::k_11_11_10:
      el->size_words = 1;
      break;
-    case FMT_16_16:
-    case FMT_16_16_FLOAT:
+    case VertexFormat::k_16_16:
+    case VertexFormat::k_16_16_FLOAT:
      el->size_words = 1;
      break;
-    case FMT_16_16_16_16:
-    case FMT_16_16_16_16_FLOAT:
+    case VertexFormat::k_16_16_16_16:
+    case VertexFormat::k_16_16_16_16_FLOAT:
      el->size_words = 2;
      break;
-    case FMT_32:
-    case FMT_32_FLOAT:
+    case VertexFormat::k_32:
+    case VertexFormat::k_32_FLOAT:
      el->size_words = 1;
      break;
-    case FMT_32_32:
-    case FMT_32_32_FLOAT:
+    case VertexFormat::k_32_32:
+    case VertexFormat::k_32_32_FLOAT:
      el->size_words = 2;
      break;
-    case FMT_32_32_32_FLOAT:
+    case VertexFormat::k_32_32_32_FLOAT:
      el->size_words = 3;
      break;
-    case FMT_32_32_32_32:
-    case FMT_32_32_32_32_FLOAT:
+    case VertexFormat::k_32_32_32_32:
+    case VertexFormat::k_32_32_32_32_FLOAT:
      el->size_words = 4;
      break;
    default:
-      XELOGE("Unknown vertex format: %d", el->format);
-      assert_always();
+      assert_unhandled_case(el->format);
      break;
  }
 }
--- a/src/xenia/gpu/shader.h
+++ b/src/xenia/gpu/shader.h
@ -20,21 +20,19 @@ namespace gpu {

 class Shader {
 public:
-  Shader(ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr,
-         uint32_t dword_count);
+  virtual ~Shader();

  ShaderType type() const { return shader_type_; }
+  bool has_prepared() const { return has_prepared_; }
  bool is_valid() const { return is_valid_; }
  const std::string& ucode_disassembly() const { return ucode_disassembly_; }
  const std::string& translated_disassembly() const {
    return translated_disassembly_;
  }

-  bool Translate();
-
  struct BufferDescElement {
    ucode::instr_fetch_vtx_t vtx_fetch;
-    uint32_t format;
+    xenos::VertexFormat format;
    uint32_t offset_words;
    uint32_t size_words;
    bool is_signed;
@ -76,7 +74,8 @@ class Shader {
  const std::vector<ucode::instr_cf_alloc_t>& allocs() const { return allocs_; }

 protected:
-  virtual bool TranslateImpl() = 0;
+  Shader(ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr,
+         uint32_t dword_count);

  void GatherIO();
  void GatherAlloc(const ucode::instr_cf_alloc_t* cf);
@ -87,10 +86,12 @@ class Shader {
  ShaderType shader_type_;
  uint64_t data_hash_;
  std::vector<uint32_t> data_;
+  bool has_prepared_;
  bool is_valid_;

  std::string ucode_disassembly_;
  std::string translated_disassembly_;
+  std::string error_log_;

  AllocCounts alloc_counts_;
  std::vector<ucode::instr_cf_exec_t> execs_;
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@ -72,9 +72,9 @@ enum class MsaaSamples : uint32_t {
 };

 enum class ColorRenderTargetFormat : uint32_t {
-  k8888 = 0,       // D3DFMT_A8R8G8B8 (or ABGR?)
-  k8888Gamma = 1,  // D3DFMT_A8R8G8B8 with gamma correction
-                   // ...
+  k_8_8_8_8 = 0,        // D3DFMT_A8R8G8B8 (or ABGR?)
+  k_8_8_8_8_GAMMA = 1,  // D3DFMT_A8R8G8B8 with gamma correction
+                        // ...
 };

 enum class DepthRenderTargetFormat : uint32_t {
@ -98,29 +98,47 @@ enum class CopyCommand : uint32_t {

 // Subset of a2xx_sq_surfaceformat.
 enum class ColorFormat : uint32_t {
-  kColor_8 = 2,
-  kColor_1_5_5_5 = 3,
-  kColor_5_6_5 = 4,
-  kColor_6_5_5 = 5,
-  kColor_8_8_8_8 = 6,
-  kColor_2_10_10_10 = 7,
-  kColor_8_A = 8,
-  kColor_8_B = 9,
-  kColor_8_8 = 10,
-  kColor_8_8_8_8_A = 14,
-  kColor_4_4_4_4 = 15,
-  kColor_10_11_11 = 16,
-  kColor_11_11_10 = 17,
-  kColor_16 = 24,
-  kColor_16_16 = 25,
-  kColor_16_16_16_16 = 26,
-  kColor_16_FLOAT = 30,
-  kColor_16_16_FLOAT = 31,
-  kColor_16_16_16_16_FLOAT = 32,
-  kColor_32_FLOAT = 36,
-  kColor_32_32_FLOAT = 37,
-  kColor_32_32_32_32_FLOAT = 38,
-  kColor_2_10_10_10_FLOAT = 62,
+  k_8 = 2,
+  k_1_5_5_5 = 3,
+  k_5_6_5 = 4,
+  k_6_5_5 = 5,
+  k_8_8_8_8 = 6,
+  k_2_10_10_10 = 7,
+  k_8_A = 8,
+  k_8_B = 9,
+  k_8_8 = 10,
+  k_8_8_8_8_A = 14,
+  k_4_4_4_4 = 15,
+  k_10_11_11 = 16,
+  k_11_11_10 = 17,
+  k_16 = 24,
+  k_16_16 = 25,
+  k_16_16_16_16 = 26,
+  k_16_FLOAT = 30,
+  k_16_16_FLOAT = 31,
+  k_16_16_16_16_FLOAT = 32,
+  k_32_FLOAT = 36,
+  k_32_32_FLOAT = 37,
+  k_32_32_32_32_FLOAT = 38,
+  k_2_10_10_10_FLOAT = 62,
+};
+
+// Element formats that can appear in a vertex fetch instruction.
+// Shader::GatherVertexFetch casts the raw fetch format field to this type,
+// so the enumerator values are the hardware encodings. Where a format name
+// also exists in ColorFormat, the numeric value is the same.
+enum class VertexFormat : uint32_t {
+  k_8_8_8_8 = 6,
+  k_2_10_10_10 = 7,
+  k_10_11_11 = 16,
+  k_11_11_10 = 17,
+  k_16_16 = 25,
+  k_16_16_16_16 = 26,
+  k_16_16_FLOAT = 31,
+  k_16_16_16_16_FLOAT = 32,
+  k_32 = 33,
+  k_32_32 = 34,
+  k_32_32_32_32 = 35,
+  k_32_FLOAT = 36,
+  k_32_32_FLOAT = 37,
+  k_32_32_32_32_FLOAT = 38,
+  k_32_32_32_FLOAT = 57,
 };

 #define XE_GPU_MAKE_SWIZZLE(x, y, z, w)                        \