diff --git a/src/xenia/gpu/gl4/circular_buffer.h b/src/xenia/gpu/gl4/circular_buffer.h
index 2ef75853f..dde0e41d1 100644
--- a/src/xenia/gpu/gl4/circular_buffer.h
+++ b/src/xenia/gpu/gl4/circular_buffer.h
@@ -31,6 +31,8 @@ class CircularBuffer {
 
   bool Initialize();
 
+  GLuint handle() const { return buffer_; }
+
   Allocation Acquire(size_t length);
   void Commit(Allocation allocation);
 
diff --git a/src/xenia/gpu/gl4/command_processor.cc b/src/xenia/gpu/gl4/command_processor.cc
index a15d53116..da4839329 100644
--- a/src/xenia/gpu/gl4/command_processor.cc
+++ b/src/xenia/gpu/gl4/command_processor.cc
@@ -151,13 +151,17 @@ bool CommandProcessor::SetupGL() {
                        GL_MAP_WRITE_BIT | GL_DYNAMIC_STORAGE_BIT);
 
   // Circular buffer holding scratch vertex/index data.
-  glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
-  glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
   if (!scratch_buffer_.Initialize()) {
     PLOGE("Unable to initialize scratch buffer");
     return false;
   }
 
+  GLuint vertex_array;
+  glGenVertexArrays(1, &vertex_array);
+  glBindVertexArray(vertex_array);
+  glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
+  glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
+
   return true;
 }
 
@@ -251,8 +255,7 @@ void CommandProcessor::PrepareForWait() {
   // TODO(benvanik): fences and fancy stuff. We should figure out a way to
   // make interrupt callbacks from the GPU so that we don't have to do a full
   // synchronize here.
-  // glFlush();
-  glFinish();
+  glFlush();
 
   if (FLAGS_thread_safe_gl) {
     context_->ClearCurrent();
@@ -1162,10 +1165,11 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) {
     return false;
   }
 
-  // if (!PopulateShaders(draw_command)) {
-  //  XELOGE("Unable to prepare draw shaders");
-  //  return false;
-  //}
+  if (!UpdateShaders(draw_command)) {
+    PLOGE("Unable to prepare draw shaders");
+    return false;
+  }
+
   // if (!PopulateSamplers(draw_command)) {
   //  XELOGE("Unable to prepare draw samplers");
   //  return false;
@@ -1176,25 +1180,77 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) {
     return false;
   }
   if (!PopulateVertexBuffers(draw_command)) {
-    XELOGE("Unable to setup vertex buffers");
+    PLOGE("Unable to setup vertex buffers");
     return false;
   }
 
+  GLenum prim_type = 0;
+  switch (cmd.prim_type) {
+    case PrimitiveType::kPointList:
+      prim_type = GL_POINTS;
+      /*if (vs->DemandGeometryShader(
+        D3D11VertexShaderResource::POINT_SPRITE_SHADER, &geometry_shader)) {
+        return 1;
+      }*/
+      break;
+    case PrimitiveType::kLineList:
+      prim_type = GL_LINES;
+      break;
+    case PrimitiveType::kLineStrip:
+      prim_type = GL_LINE_STRIP;
+      break;
+    case PrimitiveType::kLineLoop:
+      prim_type = GL_LINE_LOOP;
+      break;
+    case PrimitiveType::kTriangleList:
+      prim_type = GL_TRIANGLES;
+      break;
+    case PrimitiveType::kTriangleStrip:
+      prim_type = GL_TRIANGLE_STRIP;
+      break;
+    case PrimitiveType::kTriangleFan:
+      prim_type = GL_TRIANGLE_FAN;
+      break;
+    case PrimitiveType::kRectangleList:
+      prim_type = GL_TRIANGLE_STRIP;
+      /*if (vs->DemandGeometryShader(
+        D3D11VertexShaderResource::RECT_LIST_SHADER, &geometry_shader)) {
+        return 1;
+      }*/
+      break;
+    case PrimitiveType::kQuadList:
+      prim_type = GL_LINES_ADJACENCY;
+      /*if
+      (vs->DemandGeometryShader(D3D11VertexShaderResource::QUAD_LIST_SHADER,
+                                   &geometry_shader)) {
+        return 1;
+      }*/
+      break;
+    default:
+    case PrimitiveType::kUnknown0x07:
+      prim_type = GL_POINTS;
+      XELOGE("D3D11: unsupported primitive type %d", cmd.prim_type);
+      break;
+  }
+
+  // HACK HACK HACK
+  glDisable(GL_DEPTH_TEST);
+
   if (cmd.index_buffer.address) {
     // Indexed draw.
     // PopulateIndexBuffer has our element array setup.
-    //size_t element_size = cmd.index_buffer.format == IndexFormat::kInt32
-    //                          ? sizeof(uint32_t)
-    //                          : sizeof(uint16_t);
-    //glDrawElementsBaseVertex(
-    //    prim_type, cmd.index_count,
-    //    cmd.index_buffer.format == IndexFormat::kInt32 ? GL_UNSIGNED_INT
-    //                                                   : GL_UNSIGNED_SHORT,
-    //    reinterpret_cast<void*>(cmd.start_index * element_size),
-    //    cmd.base_vertex);
+    size_t element_size = cmd.index_buffer.format == IndexFormat::kInt32
+                              ? sizeof(uint32_t)
+                              : sizeof(uint16_t);
+    glDrawElementsBaseVertex(
+        prim_type, cmd.index_count,
+        cmd.index_buffer.format == IndexFormat::kInt32 ? GL_UNSIGNED_INT
+                                                       : GL_UNSIGNED_SHORT,
+        reinterpret_cast<void*>(cmd.start_index * element_size),
+        cmd.base_vertex);
   } else {
     // Auto draw.
-    //glDrawArrays(prim_type, cmd.start_index, cmd.index_count);
+    glDrawArrays(prim_type, cmd.start_index, cmd.index_count);
   }
 
   return true;
@@ -1215,10 +1271,10 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
     };
   };
   struct UniformDataBlock {
-    float4 window_offset;    // tx,ty,?,?
-    float4 window_scissor;   // x0,y0,x1,y1
-    float4 viewport_offset;  // tx,ty,tz,?
-    float4 viewport_scale;   // sx,sy,sz,?
+    float4 window_offset;      // tx,ty,rt_w,rt_h
+    float4 window_scissor;     // x0,y0,x1,y1
+    float4 viewport_offset;    // tx,ty,tz,?
+    float4 viewport_scale;     // sx,sy,sz,?
     // TODO(benvanik): vertex format xyzw?
 
     float4 alpha_test;  // alpha test enable, func, ref, ?
@@ -1236,11 +1292,10 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
   static_assert(sizeof(UniformDataBlock) <= 16 * 1024,
                 "Need <=16k uniform data");
 
-  auto buffer_ptr = reinterpret_cast<UniformDataBlock*>(
-      glMapNamedBufferRange(uniform_data_buffer_, 0, 16 * 1024,
-                            GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT));
+  auto allocation = scratch_buffer_.Acquire(16 * 1024);
+  auto buffer_ptr = reinterpret_cast<UniformDataBlock*>(allocation.host_ptr);
   if (!buffer_ptr) {
-    PLOGE("Unable to map uniform data buffer");
+    PLOGE("Unable to allocate uniform data buffer");
     return false;
   }
 
@@ -1257,18 +1312,9 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
   buffer_ptr->window_scissor.z = float(window_scissor_br & 0x7FFF);
   buffer_ptr->window_scissor.w = float((window_scissor_br >> 16) & 0x7FFF);
 
-  // Viewport scaling. Only enabled if the flags are all set.
-  buffer_ptr->viewport_scale.x =
-      regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32;  // 640
-  buffer_ptr->viewport_offset.x =
-      regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32;  // 640
-  buffer_ptr->viewport_scale.y =
-      regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32;  // -360
-  buffer_ptr->viewport_offset.y =
-      regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;  // 360
-  buffer_ptr->viewport_scale.z = regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32;  // 1
-  buffer_ptr->viewport_offset.z =
-      regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32;  // 0
+  // HACK: no clue where to get these values.
+  buffer_ptr->window_offset.z = 1280;
+  buffer_ptr->window_offset.w = 720;
 
   // Whether each of the viewport settings is enabled.
   // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
@@ -1282,6 +1328,23 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
   assert_true(vport_xscale_enable == vport_yscale_enable ==
               vport_zscale_enable == vport_xoffset_enable ==
               vport_yoffset_enable == vport_zoffset_enable);
+
+  // Viewport scaling. Only enabled if the flags are all set.
+  buffer_ptr->viewport_scale.x =
+      vport_xscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 : 1;  // 640
+  buffer_ptr->viewport_offset.x = vport_xoffset_enable
+                                      ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32
+                                      : 0;  // 640
+  buffer_ptr->viewport_scale.y = vport_yscale_enable
+                                     ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32
+                                     : 1;  // -360
+  buffer_ptr->viewport_offset.y = vport_yoffset_enable
+                                      ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
+                                      : 0;  // 360
+  buffer_ptr->viewport_scale.z =
+      vport_zscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 : 1;  // 1
+  buffer_ptr->viewport_offset.z =
+      vport_zoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 : 0;  // 0
   // VTX_XY_FMT = true: the incoming X, Y have already been multiplied by 1/W0.
   //            = false: multiply the X, Y coordinates by 1/W0.
   bool vtx_xy_fmt = (vte_control >> 8) & 0x1;
@@ -1504,7 +1567,9 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
                 stencil_op_map[(depth_control & 0x0001C000) >> 14]);
   }
 
-  glUnmapNamedBuffer(uniform_data_buffer_);
+  // Stash - program setup will bind this to uniforms.
+  draw_command->state_data_gpu_ptr = allocation.gpu_ptr;
+  scratch_buffer_.Commit(std::move(allocation));
 
   return true;
 }
@@ -1590,11 +1655,80 @@ bool CommandProcessor::UpdateRenderTargets(DrawCommand* draw_command) {
 
   // TEST TEST TEST TEST TEST TEST TEST TEST TEST TEST
   // Pretend we are drawing.
-  glEnable(GL_SCISSOR_TEST);
-  glScissor(100, 100, 100, 100);
-  float red[] = {rand() / (float)RAND_MAX, 0, 0, 1.0f};
-  glClearNamedFramebufferfv(active_framebuffer_->framebuffer, GL_COLOR, 0, red);
-  glDisable(GL_SCISSOR_TEST);
+  // glEnable(GL_SCISSOR_TEST);
+  // glScissor(100, 100, 100, 100);
+  // float red[] = {rand() / (float)RAND_MAX, 0, 0, 1.0f};
+  // glClearNamedFramebufferfv(active_framebuffer_->framebuffer, GL_COLOR, 0,
+  // red);
+  // glDisable(GL_SCISSOR_TEST);
+
+  return true;
+}
+
+bool CommandProcessor::UpdateShaders(DrawCommand* draw_command) {
+  SCOPE_profile_cpu_f("gpu");
+  auto& regs = *register_file_;
+  auto& cmd = *draw_command;
+  // Prepare active shaders on first use; bail if a prepared one is invalid.
+  xe_gpu_program_cntl_t program_cntl;
+  program_cntl.dword_0 = regs[XE_GPU_REG_SQ_PROGRAM_CNTL].u32;
+  if (!active_vertex_shader_->has_prepared()) {
+    if (!active_vertex_shader_->PrepareVertexShader(program_cntl)) {
+      XELOGE("Unable to prepare vertex shader");
+      return false;
+    }
+  } else if (!active_vertex_shader_->is_valid()) {
+    XELOGE("Vertex shader invalid");
+    return false;
+  }
+
+  if (!active_pixel_shader_->has_prepared()) {
+    if (!active_pixel_shader_->PreparePixelShader(program_cntl,
+                                                  active_vertex_shader_)) {
+      XELOGE("Unable to prepare pixel shader");
+      return false;
+    }
+  } else if (!active_pixel_shader_->is_valid()) {
+    XELOGE("Pixel shader invalid");
+    return false;
+  }
+
+  GLuint vertex_program = active_vertex_shader_->program();
+  GLuint geometry_program = 0;
+  GLuint fragment_program = active_pixel_shader_->program();
+  // Attach the per-stage programs to a freshly created program pipeline.
+  GLuint pipeline;
+  glCreateProgramPipelines(1, &pipeline);
+  glUseProgramStages(pipeline, GL_VERTEX_SHADER_BIT, vertex_program);
+  glUseProgramStages(pipeline, GL_GEOMETRY_SHADER_BIT, geometry_program);
+  glUseProgramStages(pipeline, GL_FRAGMENT_SHADER_BIT, fragment_program);
+
+  // HACK: layout(location=0) on a bindless uniform crashes nvidia driver.
+  GLint vertex_state_loc = glGetUniformLocation(vertex_program, "state");
+  assert_true(vertex_state_loc == -1 || vertex_state_loc == 0);
+  GLint geometry_state_loc =
+      geometry_program ? glGetUniformLocation(geometry_program, "state") : -1;
+  assert_true(geometry_state_loc == -1 || geometry_state_loc == 0);
+  GLint fragment_state_loc = glGetUniformLocation(fragment_program, "state");
+  assert_true(fragment_state_loc == -1 || fragment_state_loc == 0);
+
+  // TODO(benvanik): do we need to do this for all stages if the locations
+  // match?
+  if (vertex_state_loc != -1) {
+    glProgramUniformHandleui64ARB(vertex_program, vertex_state_loc,
+                                  cmd.state_data_gpu_ptr);
+  }
+  if (geometry_program && geometry_state_loc != -1) {
+    glProgramUniformHandleui64ARB(geometry_program, geometry_state_loc,
+                                  cmd.state_data_gpu_ptr);
+  }
+  if (fragment_state_loc != -1) {
+    glProgramUniformHandleui64ARB(fragment_program, fragment_state_loc,
+                                  cmd.state_data_gpu_ptr);
+  }
+  // NOTE(review): a new pipeline is created per call and never deleted (leak).
+  glBindProgramPipeline(pipeline);
+  // glDeleteProgramPipelines(1, &pipeline);
 
   return true;
 }
@@ -1641,15 +1775,9 @@ bool CommandProcessor::PopulateVertexBuffers(DrawCommand* draw_command) {
   SCOPE_profile_cpu_f("gpu");
   auto& regs = *register_file_;
   auto& cmd = *draw_command;
+  assert_not_null(active_vertex_shader_);
 
-  if (!cmd.vertex_shader) {
-    // No vertex shader, no-op.
-    return true;
-  }
-
-  const auto& buffer_inputs = cmd.vertex_shader->buffer_inputs();
-
-  // glBindVertexArray(vertex_array);
+  const auto& buffer_inputs = active_vertex_shader_->buffer_inputs();
 
   for (size_t n = 0; n < buffer_inputs.count; n++) {
     const auto& desc = buffer_inputs.descs[n];
@@ -1685,9 +1813,100 @@ bool CommandProcessor::PopulateVertexBuffers(DrawCommand* draw_command) {
         reinterpret_cast<const uint32_t*>(membase_ + (fetch->address << 2)),
         fetch->size);
 
-    /*glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV,
-                           desc.input_index,
-                           allocation.gpu_ptr, allocation.length);*/
+    uint32_t el_index = 0;
+    for (uint32_t i = 0; i < desc.element_count; ++i) {
+      const auto& el = desc.elements[i];
+      GLuint comp_count;
+      GLuint comp_size;
+      GLenum comp_type;
+      switch (el.format) {
+        case VertexFormat::k_8_8_8_8:
+          comp_count = 4;
+          comp_size = 1;
+          comp_type = el.is_signed ? GL_BYTE : GL_UNSIGNED_BYTE;
+          break;
+        case VertexFormat::k_2_10_10_10:
+          comp_count = 4;
+          comp_size = 4;
+          comp_type = el.is_signed ? GL_INT_2_10_10_10_REV
+                                   : GL_UNSIGNED_INT_2_10_10_10_REV;
+          break;
+        case VertexFormat::k_10_11_11:
+          comp_count = 3;
+          comp_size = 4;
+          assert_false(el.is_signed);
+          comp_type = GL_UNSIGNED_INT_10F_11F_11F_REV;
+          break;
+        /*case VertexFormat::k_11_11_10:
+          break;*/
+        case VertexFormat::k_16_16:
+          comp_count = 2;
+          comp_size = 2;
+          comp_type = el.is_signed ? GL_SHORT : GL_UNSIGNED_SHORT;
+          break;
+        case VertexFormat::k_16_16_FLOAT:
+          comp_count = 2;
+          comp_size = 2;
+          comp_type = GL_HALF_FLOAT;
+          break;
+        case VertexFormat::k_16_16_16_16:
+          comp_count = 4;
+          comp_size = 2;
+          comp_type = el.is_signed ? GL_SHORT : GL_UNSIGNED_SHORT;
+          break;
+        case VertexFormat::k_16_16_16_16_FLOAT:
+          comp_count = 4;
+          comp_size = 2;
+          comp_type = GL_HALF_FLOAT;
+          break;
+        case VertexFormat::k_32:
+          comp_count = 1;
+          comp_size = 4;
+          comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT;
+          break;
+        case VertexFormat::k_32_32:
+          comp_count = 2;
+          comp_size = 4;
+          comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT;
+          break;
+        case VertexFormat::k_32_32_32_32:
+          comp_count = 4;
+          comp_size = 4;
+          comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT;
+          break;
+        case VertexFormat::k_32_FLOAT:
+          comp_count = 1;
+          comp_size = 4;
+          comp_type = GL_FLOAT;
+          break;
+        case VertexFormat::k_32_32_FLOAT:
+          comp_count = 2;
+          comp_size = 4;
+          comp_type = GL_FLOAT;
+          break;
+        case VertexFormat::k_32_32_32_FLOAT:
+          comp_count = 3;
+          comp_size = 4;
+          comp_type = GL_FLOAT;
+          break;
+        case VertexFormat::k_32_32_32_32_FLOAT:
+          comp_count = 4;
+          comp_size = 4;
+          comp_type = GL_FLOAT;
+          break;
+        default:
+          assert_unhandled_case(el.format);
+          break;
+      }
+      size_t offset = el.offset_words * sizeof(uint32_t);
+      glEnableVertexAttribArray(el_index);
+      glVertexAttribFormatNV(el_index, comp_count, comp_type, el.is_normalized,
+                             desc.stride_words * sizeof(uint32_t));
+      glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, el_index,
+                             allocation.gpu_ptr + offset,
+                             allocation.length - offset);
+      ++el_index;
+    }
 
     // Flush buffer before we draw.
     scratch_buffer_.Commit(std::move(allocation));
@@ -1782,7 +2001,7 @@ bool CommandProcessor::IssueCopy(DrawCommand* draw_command) {
   GLenum read_format;
   GLenum read_type;
   switch (copy_dest_format) {
-    case ColorFormat::kColor_8_8_8_8:
+    case ColorFormat::k_8_8_8_8:
       read_format = copy_dest_swap ? GL_BGRA : GL_RGBA;
       read_type = GL_UNSIGNED_BYTE;
       break;
@@ -1832,10 +2051,10 @@ bool CommandProcessor::IssueCopy(DrawCommand* draw_command) {
         // glBindBuffer(GL_READ_FRAMEBUFFER, framebuffer)
         glNamedFramebufferReadBuffer(source_framebuffer->framebuffer,
                                      GL_COLOR_ATTACHMENT0 + copy_src_select);
-        glReadPixels(x, y, w, h, read_format, read_type, ptr);
+        //glReadPixels(x, y, w, h, read_format, read_type, ptr);
       } else {
         // Source from the bound depth/stencil target.
-        glReadPixels(x, y, w, h, GL_DEPTH_STENCIL, read_type, ptr);
+        //glReadPixels(x, y, w, h, GL_DEPTH_STENCIL, read_type, ptr);
       }
       break;
     case CopyCommand::kRaw:
@@ -1876,7 +2095,7 @@ bool CommandProcessor::IssueCopy(DrawCommand* draw_command) {
     glClearNamedFramebufferfi(source_framebuffer->framebuffer, GL_DEPTH_STENCIL,
                               depth.float_value, stencil);
   }
-
+  
   return true;
 }
 
@@ -1890,8 +2109,8 @@ GLuint CommandProcessor::GetColorRenderTarget(uint32_t pitch,
   uint32_t height = 2560;
 
   // NOTE: we strip gamma formats down to normal ones.
-  if (format == ColorRenderTargetFormat::k8888Gamma) {
-    format = ColorRenderTargetFormat::k8888;
+  if (format == ColorRenderTargetFormat::k_8_8_8_8_GAMMA) {
+    format = ColorRenderTargetFormat::k_8_8_8_8;
   }
 
   for (auto& it = cached_color_render_targets_.begin();
@@ -1910,8 +2129,8 @@ GLuint CommandProcessor::GetColorRenderTarget(uint32_t pitch,
 
   GLenum internal_format;
   switch (format) {
-    case ColorRenderTargetFormat::k8888:
-    case ColorRenderTargetFormat::k8888Gamma:
+    case ColorRenderTargetFormat::k_8_8_8_8:
+    case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
       internal_format = GL_RGBA8;
       break;
     default:
diff --git a/src/xenia/gpu/gl4/command_processor.h b/src/xenia/gpu/gl4/command_processor.h
index 242325699..e73464d38 100644
--- a/src/xenia/gpu/gl4/command_processor.h
+++ b/src/xenia/gpu/gl4/command_processor.h
@@ -47,9 +47,6 @@ struct DrawCommand {
   uint32_t index_count;
   uint32_t base_vertex;
 
-  GL4Shader* vertex_shader;
-  GL4Shader* pixel_shader;
-
   // Index buffer, if present.
   // If index_count > 0 but buffer is nullptr then auto draw.
   struct {
@@ -69,6 +66,8 @@ struct DrawCommand {
   size_t vertex_shader_sampler_count;
   SamplerInput pixel_shader_samplers[32];
   size_t pixel_shader_sampler_count;
+
+  GLuint64 state_data_gpu_ptr;
 };
 
 class CommandProcessor {
@@ -188,6 +187,7 @@ class CommandProcessor {
   bool IssueDraw(DrawCommand* draw_command);
   bool UpdateState(DrawCommand* draw_command);
   bool UpdateRenderTargets(DrawCommand* draw_command);
+  bool UpdateShaders(DrawCommand* draw_command);
   bool PopulateIndexBuffer(DrawCommand* draw_command);
   bool PopulateVertexBuffers(DrawCommand* draw_command);
   bool IssueCopy(DrawCommand* draw_command);
diff --git a/src/xenia/gpu/gl4/gl4_graphics_system.cc b/src/xenia/gpu/gl4/gl4_graphics_system.cc
index c61d4a6fa..a977f2a6f 100644
--- a/src/xenia/gpu/gl4/gl4_graphics_system.cc
+++ b/src/xenia/gpu/gl4/gl4_graphics_system.cc
@@ -11,6 +11,7 @@
 
 #include <poly/threading.h>
 #include <xenia/cpu/processor.h>
+#include <xenia/gpu/gl4/gl4_gpu-private.h>
 #include <xenia/gpu/gpu-private.h>
 
 namespace xe {
@@ -42,11 +43,19 @@ X_STATUS GL4GraphicsSystem::Setup() {
     control_ = std::make_unique<WGLControl>(loop);
     emulator_->main_window()->AddChild(control_.get());
 
+    if (FLAGS_thread_safe_gl) {
+      control_->context()->MakeCurrent();
+    }
+
     // Setup the GL context the command processor will do all its drawing in.
     // It's shared with the control context so that we can resolve framebuffers
     // from it.
     processor_context = control_->context()->CreateShared();
 
+    if (FLAGS_thread_safe_gl) {
+      control_->context()->ClearCurrent();
+    }
+
     control_ready_fence.Signal();
   });
   control_ready_fence.Wait();
diff --git a/src/xenia/gpu/gl4/gl4_shader.cc b/src/xenia/gpu/gl4/gl4_shader.cc
index d4861f1e2..251e9f197 100644
--- a/src/xenia/gpu/gl4/gl4_shader.cc
+++ b/src/xenia/gpu/gl4/gl4_shader.cc
@@ -15,7 +15,163 @@ namespace xe {
 namespace gpu {
 namespace gl4 {
 
-bool GL4Shader::TranslateImpl() { return true; }
+extern "C" GLEWContext* glewGetContext();
+
+GL4Shader::GL4Shader(ShaderType shader_type, uint64_t data_hash,
+                     const uint32_t* dword_ptr, uint32_t dword_count)
+    : Shader(shader_type, data_hash, dword_ptr, dword_count), program_(0) {}
+
+GL4Shader::~GL4Shader() { glDeleteProgram(program_); }
+
+const std::string header =
+    "#version 450\n"
+    "#extension all : warn\n"
+    "#extension GL_ARB_bindless_texture : require\n"
+    "#extension GL_ARB_explicit_uniform_location : require\n"
+    "#extension GL_ARB_shading_language_420pack : require\n"
+    "#extension GL_ARB_shader_storage_buffer_object : require\n"
+    "#extension GL_NV_shader_buffer_load : require\n"
+    "precision highp float;\n"
+    "precision highp int;\n"
+    "layout(std140, column_major) uniform;\n"
+    "layout(std430, column_major) buffer;\n"
+    "struct StateData {\n"
+    "  vec4 window_offset;\n"
+    "  vec4 window_scissor;\n"
+    "  vec4 viewport_offset;\n"
+    "  vec4 viewport_scale;\n"
+    "  vec4 alpha_test;\n"
+    "  vec4 float_consts[512];\n"
+    "  uint fetch_consts[32 * 6];\n"
+    "  int bool_consts[8];\n"
+    "  int loop_consts[32];\n"
+    "};\n"
+    "struct VertexData {\n"
+    "  vec4 o[16];\n"
+    "};\n"
+    "\n"
+    "uniform StateData* state;\n";
+
+bool GL4Shader::PrepareVertexShader(
+    const xenos::xe_gpu_program_cntl_t& program_cntl) {
+  if (has_prepared_) {
+    return is_valid_;
+  }
+  has_prepared_ = true;
+  // Hardcoded placeholder GLSL for now; program_cntl is not consulted.
+  std::string apply_viewport =
+      "vec4 applyViewport(vec4 pos) {\n"
+      // TODO(benvanik): piecewise viewport_enable -> offset/scale logic.
+      "  if (false) {\n"
+      "  } else {\n"
+      /*"    pos.xy = pos.xy / vec2(state->window_offset.z / 2.0, "
+      "-state->window_offset.w / 2.0) + vec2(-1.0, 1.0);\n"
+      "    pos.zw = vec2(0.0, 1.0);\n"*/
+      "    pos.xy = pos.xy / vec2(1280.0 / 2.0, "
+      "-720.0 / 2.0) + vec2(-1.0, 1.0);\n"
+      "    //pos.zw = vec2(0.0, 1.0);\n"
+      "  }\n"
+      "  pos.x = pos.x * state->viewport_scale.x + \n"
+      "      state->viewport_offset.x;\n"
+      "  pos.y = pos.y * state->viewport_scale.y + \n"
+      "      state->viewport_offset.y;\n"
+      "  pos.z = pos.z * state->viewport_scale.z + \n"
+      "      state->viewport_offset.z;\n"
+      "  pos.xy += state->window_offset.xy;\n"
+      "  return pos;\n"
+      "}\n";
+  std::string source =
+      header + apply_viewport +
+      "out gl_PerVertex {\n"
+      "  vec4 gl_Position;\n"
+      "  float gl_PointSize;\n"
+      "  float gl_ClipDistance[];\n"
+      "};\n"
+      "layout(location = 0) in vec3 iF0;\n"
+      "layout(location = 1) in vec4 iF1;\n"
+      "layout(location = 0) out VertexData vtx;\n"
+      "void main() {\n"
+      //"  vec4 oPos = vec4(iF0.xy, 0.0, 1.0);\n"
+      "  vec4 oPos = iF0.xxxx * state->float_consts[0];\n"
+      "  oPos = (iF0.yyyy * state->float_consts[1]) + oPos;\n"
+      "  oPos = (iF0.zzzz * state->float_consts[2]) + oPos;\n"
+      "  oPos = (vec4(1.0, 1.0, 1.0, 1.0) * state->float_consts[3]) + oPos;\n"
+      //"  gl_PointSize = 1.0;\n"
+      "  for (int i = 0; i < vtx.o.length(); ++i) {\n"
+      "    vtx.o[i] = vec4(0.0, 0.0, 0.0, 0.0);\n"  // zero every slot, not just 0
+      "  }\n"
+      "  vtx.o[0] = iF1;\n"
+      "  gl_Position = applyViewport(oPos);\n"
+      //"  gl_Position = oPos;\n"
+      "}\n";
+
+  if (!CompileProgram(source)) {
+    return false;
+  }
+
+  is_valid_ = true;
+  return true;
+}
+
+bool GL4Shader::PreparePixelShader(
+    const xenos::xe_gpu_program_cntl_t& program_cntl,
+    GL4Shader* vertex_shader) {
+  if (has_prepared_) {
+    return is_valid_;
+  }
+  has_prepared_ = true;
+  // Fixed placeholder GLSL; program_cntl and vertex_shader are not used here.
+  std::string source = header +
+                       "layout(location = 0) in VertexData vtx;\n"
+                       "layout(location = 0) out vec4 oC[4];\n"
+                       "void main() {\n"
+                       "  for (int i = 0; i < oC.length(); ++i) {\n"
+                       "    oC[i] = vec4(1.0, 0.0, 0.0, 1.0);\n"
+                       "  }\n"
+                       "  oC[0] = vtx.o[0];\n"
+                       //"  gl_FragDepth = 0.0;\n"
+                       "}\n";
+
+  if (!CompileProgram(source)) {
+    return false;
+  }
+
+  is_valid_ = true;
+  return true;
+}
+
+bool GL4Shader::CompileProgram(std::string source) {
+  // Builds program_ from |source|; a link failure saves its log in error_log_.
+  assert_zero(program_);
+  translated_disassembly_ = std::move(source);
+  const char* source_str = translated_disassembly_.c_str();
+
+  program_ = glCreateShaderProgramv(shader_type_ == ShaderType::kVertex
+                                        ? GL_VERTEX_SHADER
+                                        : GL_FRAGMENT_SHADER,
+                                    1, &source_str);
+  if (!program_) {
+    PLOGE("Unable to create shader program");
+    return false;
+  }
+
+  GLint link_status = 0;
+  glGetProgramiv(program_, GL_LINK_STATUS, &link_status);
+  if (!link_status) {
+    // log_length counts the trailing null and is 0 when there is no log, so
+    // clamp before subtracting to avoid sizing the string to a wrapped value.
+    GLint log_length = 0;
+    glGetProgramiv(program_, GL_INFO_LOG_LENGTH, &log_length);
+    std::string info_log(log_length > 0 ? log_length - 1 : 0, '\0');
+    glGetProgramInfoLog(program_, log_length, &log_length,
+                        const_cast<char*>(info_log.data()));
+    PLOGE("Unable to link program: %s", info_log.c_str());
+    error_log_ = std::move(info_log);
+    return false;
+  }
+
+  return true;
+}
 
 }  // namespace gl4
 }  // namespace gpu
diff --git a/src/xenia/gpu/gl4/gl4_shader.h b/src/xenia/gpu/gl4/gl4_shader.h
index 079307e7a..94489d766 100644
--- a/src/xenia/gpu/gl4/gl4_shader.h
+++ b/src/xenia/gpu/gl4/gl4_shader.h
@@ -11,6 +11,7 @@
 #define XENIA_GPU_GL4_GL4_SHADER_H_
 
 #include <xenia/common.h>
+#include <xenia/gpu/gl4/gl_context.h>
 #include <xenia/gpu/shader.h>
 
 namespace xe {
@@ -19,10 +20,20 @@ namespace gl4 {
 
 class GL4Shader : public Shader {
  public:
-  using Shader::Shader;
+  GL4Shader(ShaderType shader_type, uint64_t data_hash,
+            const uint32_t* dword_ptr, uint32_t dword_count);
+  ~GL4Shader() override;
+
+  GLuint program() const { return program_; }
+
+  bool PrepareVertexShader(const xenos::xe_gpu_program_cntl_t& program_cntl);
+  bool PreparePixelShader(const xenos::xe_gpu_program_cntl_t& program_cntl,
+                          GL4Shader* vertex_shader);
 
  protected:
-  bool TranslateImpl() override;
+  bool CompileProgram(std::string source);
+
+  GLuint program_;
 };
 
 }  // namespace gl4
diff --git a/src/xenia/gpu/shader.cc b/src/xenia/gpu/shader.cc
index 334331716..d2cb0bd5d 100644
--- a/src/xenia/gpu/shader.cc
+++ b/src/xenia/gpu/shader.cc
@@ -16,10 +16,14 @@ namespace xe {
 namespace gpu {
 
 using namespace xe::gpu::ucode;
+using namespace xe::gpu::xenos;
 
 Shader::Shader(ShaderType shader_type, uint64_t data_hash,
                const uint32_t* dword_ptr, uint32_t dword_count)
-    : shader_type_(shader_type), data_hash_(data_hash), is_valid_(false) {
+    : shader_type_(shader_type),
+      data_hash_(data_hash),
+      has_prepared_(false),
+      is_valid_(false) {
   data_.resize(dword_count);
   poly::copy_and_swap(data_.data(), dword_ptr, dword_count);
   std::memset(&alloc_counts_, 0, sizeof(alloc_counts_));
@@ -35,18 +39,7 @@ Shader::Shader(ShaderType shader_type, uint64_t data_hash,
   GatherIO();
 }
 
-bool Shader::Translate() {
-  assert_false(is_valid_);
-
-  // TODO(benvanik): disk cache/etc - lookup hash and load if found.
-  // TODO(benvanik): dump to disk.
-
-  // Attempt implementation-specific translation.
-  // This may take awhile, and probably will fail.
-  // TODO(benvanik): parallelize? (allow two translations at once, etc).
-  is_valid_ = TranslateImpl();
-  return is_valid_;
-}
+Shader::~Shader() = default;
 
 void Shader::GatherIO() {
   // Process all execution blocks.
@@ -203,44 +196,43 @@ void Shader::GatherVertexFetch(const instr_fetch_vtx_t* vtx) {
   }
 
   el->vtx_fetch = *vtx;
-  el->format = vtx->format;
+  el->format = static_cast<VertexFormat>(vtx->format);
   el->is_normalized = vtx->num_format_all == 0;
   el->is_signed = vtx->format_comp_all == 1;
   el->offset_words = vtx->offset;
   el->size_words = 0;
   switch (el->format) {
-    case FMT_8_8_8_8:
-    case FMT_2_10_10_10:
-    case FMT_10_11_11:
-    case FMT_11_11_10:
+    case VertexFormat::k_8_8_8_8:
+    case VertexFormat::k_2_10_10_10:
+    case VertexFormat::k_10_11_11:
+    case VertexFormat::k_11_11_10:
       el->size_words = 1;
       break;
-    case FMT_16_16:
-    case FMT_16_16_FLOAT:
+    case VertexFormat::k_16_16:
+    case VertexFormat::k_16_16_FLOAT:
       el->size_words = 1;
       break;
-    case FMT_16_16_16_16:
-    case FMT_16_16_16_16_FLOAT:
+    case VertexFormat::k_16_16_16_16:
+    case VertexFormat::k_16_16_16_16_FLOAT:
       el->size_words = 2;
       break;
-    case FMT_32:
-    case FMT_32_FLOAT:
+    case VertexFormat::k_32:
+    case VertexFormat::k_32_FLOAT:
       el->size_words = 1;
       break;
-    case FMT_32_32:
-    case FMT_32_32_FLOAT:
+    case VertexFormat::k_32_32:
+    case VertexFormat::k_32_32_FLOAT:
       el->size_words = 2;
       break;
-    case FMT_32_32_32_FLOAT:
+    case VertexFormat::k_32_32_32_FLOAT:
       el->size_words = 3;
       break;
-    case FMT_32_32_32_32:
-    case FMT_32_32_32_32_FLOAT:
+    case VertexFormat::k_32_32_32_32:
+    case VertexFormat::k_32_32_32_32_FLOAT:
       el->size_words = 4;
       break;
     default:
-      XELOGE("Unknown vertex format: %d", el->format);
-      assert_always();
+      assert_unhandled_case(el->format);
       break;
   }
 }
diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h
index 0b755e7c9..05438657c 100644
--- a/src/xenia/gpu/shader.h
+++ b/src/xenia/gpu/shader.h
@@ -20,21 +20,19 @@ namespace gpu {
 
 class Shader {
  public:
-  Shader(ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr,
-         uint32_t dword_count);
+  virtual ~Shader();
 
   ShaderType type() const { return shader_type_; }
+  bool has_prepared() const { return has_prepared_; }
   bool is_valid() const { return is_valid_; }
   const std::string& ucode_disassembly() const { return ucode_disassembly_; }
   const std::string& translated_disassembly() const {
     return translated_disassembly_;
   }
 
-  bool Translate();
-
   struct BufferDescElement {
     ucode::instr_fetch_vtx_t vtx_fetch;
-    uint32_t format;
+    xenos::VertexFormat format;
     uint32_t offset_words;
     uint32_t size_words;
     bool is_signed;
@@ -76,7 +74,8 @@ class Shader {
   const std::vector<ucode::instr_cf_alloc_t>& allocs() const { return allocs_; }
 
  protected:
-  virtual bool TranslateImpl() = 0;
+  Shader(ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr,
+         uint32_t dword_count);
 
   void GatherIO();
   void GatherAlloc(const ucode::instr_cf_alloc_t* cf);
@@ -87,10 +86,12 @@ class Shader {
   ShaderType shader_type_;
   uint64_t data_hash_;
   std::vector<uint32_t> data_;
+  bool has_prepared_;
   bool is_valid_;
 
   std::string ucode_disassembly_;
   std::string translated_disassembly_;
+  std::string error_log_;
 
   AllocCounts alloc_counts_;
   std::vector<ucode::instr_cf_exec_t> execs_;
diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h
index 97c48254c..668f94aae 100644
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@@ -72,9 +72,9 @@ enum class MsaaSamples : uint32_t {
 };
 
 enum class ColorRenderTargetFormat : uint32_t {
-  k8888 = 0,       // D3DFMT_A8R8G8B8 (or ABGR?)
-  k8888Gamma = 1,  // D3DFMT_A8R8G8B8 with gamma correction
-                   // ...
+  k_8_8_8_8 = 0,        // D3DFMT_A8R8G8B8 (or ABGR?)
+  k_8_8_8_8_GAMMA = 1,  // D3DFMT_A8R8G8B8 with gamma correction
+                        // ...
 };
 
 enum class DepthRenderTargetFormat : uint32_t {
@@ -98,29 +98,47 @@ enum class CopyCommand : uint32_t {
 
 // Subset of a2xx_sq_surfaceformat.
 enum class ColorFormat : uint32_t {
-  kColor_8 = 2,
-  kColor_1_5_5_5 = 3,
-  kColor_5_6_5 = 4,
-  kColor_6_5_5 = 5,
-  kColor_8_8_8_8 = 6,
-  kColor_2_10_10_10 = 7,
-  kColor_8_A = 8,
-  kColor_8_B = 9,
-  kColor_8_8 = 10,
-  kColor_8_8_8_8_A = 14,
-  kColor_4_4_4_4 = 15,
-  kColor_10_11_11 = 16,
-  kColor_11_11_10 = 17,
-  kColor_16 = 24,
-  kColor_16_16 = 25,
-  kColor_16_16_16_16 = 26,
-  kColor_16_FLOAT = 30,
-  kColor_16_16_FLOAT = 31,
-  kColor_16_16_16_16_FLOAT = 32,
-  kColor_32_FLOAT = 36,
-  kColor_32_32_FLOAT = 37,
-  kColor_32_32_32_32_FLOAT = 38,
-  kColor_2_10_10_10_FLOAT = 62,
+  k_8 = 2,
+  k_1_5_5_5 = 3,
+  k_5_6_5 = 4,
+  k_6_5_5 = 5,
+  k_8_8_8_8 = 6,
+  k_2_10_10_10 = 7,
+  k_8_A = 8,
+  k_8_B = 9,
+  k_8_8 = 10,
+  k_8_8_8_8_A = 14,
+  k_4_4_4_4 = 15,
+  k_10_11_11 = 16,
+  k_11_11_10 = 17,
+  k_16 = 24,
+  k_16_16 = 25,
+  k_16_16_16_16 = 26,
+  k_16_FLOAT = 30,
+  k_16_16_FLOAT = 31,
+  k_16_16_16_16_FLOAT = 32,
+  k_32_FLOAT = 36,
+  k_32_32_FLOAT = 37,
+  k_32_32_32_32_FLOAT = 38,
+  k_2_10_10_10_FLOAT = 62,
+};
+
+enum class VertexFormat : uint32_t {
+  k_8_8_8_8 = 6,
+  k_2_10_10_10 = 7,
+  k_10_11_11 = 16,
+  k_11_11_10 = 17,
+  k_16_16 = 25,
+  k_16_16_16_16 = 26,
+  k_16_16_FLOAT = 31,
+  k_16_16_16_16_FLOAT = 32,
+  k_32 = 33,
+  k_32_32 = 34,
+  k_32_32_32_32 = 35,
+  k_32_FLOAT = 36,
+  k_32_32_FLOAT = 37,
+  k_32_32_32_32_FLOAT = 38,
+  k_32_32_32_FLOAT = 57,
 };
 
 #define XE_GPU_MAKE_SWIZZLE(x, y, z, w)                        \