From 4fcf9c6229a1837d816c372b627ba8fe5f92def5 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 4 Jan 2015 11:20:42 -0800 Subject: [PATCH] MultiDrawIndirect draw batching - now down to <20us per draw. --- src/xenia/gpu/gl4/circular_buffer.h | 2 + src/xenia/gpu/gl4/command_processor.cc | 756 ++++++++------------- src/xenia/gpu/gl4/command_processor.h | 120 +--- src/xenia/gpu/gl4/draw_batcher.cc | 384 +++++++++++ src/xenia/gpu/gl4/draw_batcher.h | 230 +++++++ src/xenia/gpu/gl4/gl4_shader.cc | 206 ++++-- src/xenia/gpu/gl4/gl4_shader.h | 6 + src/xenia/gpu/gl4/gl4_shader_translator.cc | 4 +- src/xenia/gpu/gl4/gl_context.cc | 11 +- src/xenia/gpu/gl4/sources.gypi | 2 + 10 files changed, 1120 insertions(+), 601 deletions(-) create mode 100644 src/xenia/gpu/gl4/draw_batcher.cc create mode 100644 src/xenia/gpu/gl4/draw_batcher.h diff --git a/src/xenia/gpu/gl4/circular_buffer.h b/src/xenia/gpu/gl4/circular_buffer.h index d7288d5a8..aaa7ea7e8 100644 --- a/src/xenia/gpu/gl4/circular_buffer.h +++ b/src/xenia/gpu/gl4/circular_buffer.h @@ -35,6 +35,8 @@ class CircularBuffer { void Shutdown(); GLuint handle() const { return buffer_; } + GLuint64 gpu_handle() const { return gpu_base_; } + size_t capacity() const { return capacity_; } bool CanAcquire(size_t length); Allocation Acquire(size_t length); diff --git a/src/xenia/gpu/gl4/command_processor.cc b/src/xenia/gpu/gl4/command_processor.cc index 750079ae5..c686a94af 100644 --- a/src/xenia/gpu/gl4/command_processor.cc +++ b/src/xenia/gpu/gl4/command_processor.cc @@ -25,6 +25,8 @@ #define XETRACECP(fmt, ...) \ if (FLAGS_trace_ring_buffer) XELOGGPU(fmt, ##__VA_ARGS__) +#define FINE_GRAINED_DRAW_SCOPES 1 + namespace xe { namespace gpu { namespace gl4 { @@ -39,6 +41,7 @@ const GLuint kAnyTarget = UINT_MAX; // with the GPU, so this should be large enough to prevent that in a normal // frame. 
const size_t kScratchBufferCapacity = 256 * 1024 * 1024; +const size_t kScratchBufferAlignment = 256; CommandProcessor::CachedPipeline::CachedPipeline() : vertex_program(0), fragment_program(0), handles({0}) {} @@ -69,12 +72,12 @@ CommandProcessor::CommandProcessor(GL4GraphicsSystem* graphics_system) active_vertex_shader_(nullptr), active_pixel_shader_(nullptr), active_framebuffer_(nullptr), - vertex_array_(0), point_list_geometry_program_(0), rect_list_geometry_program_(0), quad_list_geometry_program_(0), - scratch_buffer_(kScratchBufferCapacity) { - std::memset(&draw_command_, 0, sizeof(draw_command_)); + draw_index_count_(0), + draw_batcher_(graphics_system_->register_file()), + scratch_buffer_(kScratchBufferCapacity, kScratchBufferAlignment) { LARGE_INTEGER perf_counter; QueryPerformanceCounter(&perf_counter); time_base_ = perf_counter.QuadPart; @@ -163,6 +166,9 @@ void CommandProcessor::WorkerMain() { } bool CommandProcessor::SetupGL() { + if (FLAGS_vendor_gl_extensions && GLEW_NV_vertex_buffer_unified_memory) { + has_bindless_vbos_ = true; + } // Circular buffer holding scratch vertex/index data. if (!scratch_buffer_.Initialize()) { @@ -170,27 +176,18 @@ bool CommandProcessor::SetupGL() { return false; } + // Command buffer. + if (!draw_batcher_.Initialize(&scratch_buffer_)) { + PLOGE("Unable to initialize command buffer"); + return false; + } + // Texture cache that keeps track of any textures/samplers used. if (!texture_cache_.Initialize(membase_, &scratch_buffer_)) { PLOGE("Unable to initialize texture cache"); return false; } - // TODO(benvanik): cache. 
- glGenVertexArrays(1, &vertex_array_); - glBindVertexArray(vertex_array_); - - if (FLAGS_vendor_gl_extensions && GLEW_NV_vertex_buffer_unified_memory) { - has_bindless_vbos_ = true; - glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); - glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV); - } - GLint max_vertex_attribs = 0; - glGetIntegerv(GL_MAX_VERTEX_ATTRIBS, &max_vertex_attribs); - for (GLint i = 0; i < max_vertex_attribs; ++i) { - glEnableVertexAttribArray(i); - } - const std::string geometry_header = "#version 450\n" "#extension all : warn\n" @@ -346,8 +343,8 @@ void CommandProcessor::ShutdownGL() { glDeleteProgram(point_list_geometry_program_); glDeleteProgram(rect_list_geometry_program_); glDeleteProgram(quad_list_geometry_program_); - glDeleteVertexArrays(1, &vertex_array_); texture_cache_.Shutdown(); + draw_batcher_.Shutdown(); scratch_buffer_.Shutdown(); } @@ -437,7 +434,8 @@ void CommandProcessor::PrepareForWait() { // TODO(benvanik): fences and fancy stuff. We should figure out a way to // make interrupt callbacks from the GPU so that we don't have to do a full // synchronize here. - glFlush(); + //glFlush(); + glFinish(); if (FLAGS_thread_safe_gl) { context_->ClearCurrent(); @@ -1106,46 +1104,45 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingbufferReader* reader, uint32_t index_count = dword1 >> 16; auto prim_type = static_cast(dword1 & 0x3F); - uint32_t index_base = 0; - uint32_t index_size = 0; - Endian index_endianness = Endian::kUnspecified; - bool index_32bit = false; uint32_t src_sel = (dword1 >> 6) & 0x3; if (src_sel == 0x0) { // Indexed draw. 
- index_base = reader->Read(); - index_size = reader->Read(); - index_endianness = static_cast(index_size >> 30); + index_buffer_info_.guest_base = reader->Read(); + uint32_t index_size = reader->Read(); + index_buffer_info_.endianness = static_cast(index_size >> 30); index_size &= 0x00FFFFFF; - index_32bit = (dword1 >> 11) & 0x1; + bool index_32bit = (dword1 >> 11) & 0x1; + index_buffer_info_.format = + index_32bit ? IndexFormat::kInt32 : IndexFormat::kInt16; index_size *= index_32bit ? 4 : 2; + index_buffer_info_.length = index_size; + index_buffer_info_.count = index_count; } else if (src_sel == 0x2) { // Auto draw. + index_buffer_info_.guest_base = 0; + index_buffer_info_.length = 0; } else { // Unknown source select. assert_always(); } + draw_index_count_ = index_count; - PrepareDraw(&draw_command_); - draw_command_.prim_type = prim_type; - draw_command_.start_index = 0; - draw_command_.index_count = index_count; - draw_command_.base_vertex = 0; + bool draw_valid = false; if (src_sel == 0x0) { // Indexed draw. - draw_command_.index_buffer.address = membase_ + index_base; - draw_command_.index_buffer.size = index_size; - draw_command_.index_buffer.endianness = index_endianness; - draw_command_.index_buffer.format = - index_32bit ? IndexFormat::kInt32 : IndexFormat::kInt16; + draw_valid = draw_batcher_.BeginDrawElements(prim_type, index_count, + index_buffer_info_.format); } else if (src_sel == 0x2) { // Auto draw. - draw_command_.index_buffer.address = nullptr; + draw_valid = draw_batcher_.BeginDrawArrays(prim_type, index_count); } else { // Unknown source select. assert_always(); } - return IssueDraw(&draw_command_); + if (!draw_valid) { + return false; + } + return IssueDraw(); } bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingbufferReader* reader, @@ -1164,14 +1161,15 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingbufferReader* reader, uint32_t indices_size = index_count * (index_32bit ? 
4 : 2); reader->CheckRead(indices_size / sizeof(uint32_t)); uint32_t index_ptr = reader->ptr(); + index_buffer_info_.guest_base = 0; + index_buffer_info_.length = 0; reader->Advance(count - 1); - PrepareDraw(&draw_command_); - draw_command_.prim_type = prim_type; - draw_command_.start_index = 0; - draw_command_.index_count = index_count; - draw_command_.base_vertex = 0; - draw_command_.index_buffer.address = nullptr; - return IssueDraw(&draw_command_); + draw_index_count_ = index_count; + bool draw_valid = draw_batcher_.BeginDrawArrays(prim_type, index_count); + if (!draw_valid) { + return false; + } + return IssueDraw(); } bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingbufferReader* reader, @@ -1319,58 +1317,30 @@ bool CommandProcessor::LoadShader(ShaderType shader_type, return true; } -void CommandProcessor::PrepareDraw(DrawCommand* draw_command) { - auto& regs = *register_file_; - auto& cmd = *draw_command; - - // Reset the things we don't modify so that we have clean state. - cmd.prim_type = PrimitiveType::kPointList; - cmd.index_count = 0; - cmd.index_buffer.address = nullptr; - - // Starting index when drawing indexed. - cmd.start_index = regs[XE_GPU_REG_VGT_INDX_OFFSET].u32; - - // Min/max index ranges. This is often [0,FFFF|FFFFFF], but if it's not we - // can use it to do a glDrawRangeElements. - cmd.min_index = regs[XE_GPU_REG_VGT_MIN_VTX_INDX].u32; - cmd.max_index = regs[XE_GPU_REG_VGT_MAX_VTX_INDX].u32; - - // ? - cmd.base_vertex = 0; - - cmd.state_data = nullptr; -} - -bool CommandProcessor::IssueDraw(DrawCommand* draw_command) { +bool CommandProcessor::IssueDraw() { +#if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + auto& regs = *register_file_; - auto& cmd = *draw_command; auto enable_mode = static_cast(regs[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7); if (enable_mode == ModeControl::kIgnore) { // Ignored. 
+ draw_batcher_.DiscardDraw(); return true; } else if (enable_mode == ModeControl::kCopy) { // Special copy handling. - return IssueCopy(draw_command); - } - - // Allocate a state data block. - // Everything the shaders access lives here. - auto allocation = scratch_buffer_.Acquire(sizeof(UniformDataBlock)); - scratch_buffer_stats_.total_state_data_size += sizeof(UniformDataBlock); - cmd.state_data = reinterpret_cast(allocation.host_ptr); - if (!cmd.state_data) { - PLOGE("Unable to allocate uniform data buffer"); - return false; + draw_batcher_.DiscardDraw(); + return IssueCopy(); } #define CHECK_ISSUE_UPDATE_STATUS(status, mismatch, error_message) \ { \ if (status == UpdateStatus::kError) { \ PLOGE(error_message); \ + draw_batcher_.DiscardDraw(); \ return false; \ } else if (status == UpdateStatus::kMismatch) { \ mismatch = true; \ @@ -1379,93 +1349,31 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) { UpdateStatus status; bool mismatch = false; - status = UpdateShaders(draw_command); + status = UpdateShaders(draw_batcher_.prim_type()); CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to prepare draw shaders"); - status = UpdateRenderTargets(draw_command); + status = UpdateRenderTargets(); CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to setup render targets"); if (!active_framebuffer_) { // No framebuffer, so nothing we do will actually have an effect. // Treat it as a no-op. + // TODO(benvanik): if we have a vs export, still allow it to go. 
XETRACECP("No-op draw (no framebuffer set)"); + draw_batcher_.DiscardDraw(); return true; } - status = UpdateState(draw_command); + status = UpdateState(); CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to setup render state"); - status = UpdateConstants(draw_command); - CHECK_ISSUE_UPDATE_STATUS(status, mismatch, - "Unable to update shader constants"); - status = PopulateSamplers(draw_command); + status = PopulateSamplers(); CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to prepare draw samplers"); - status = PopulateIndexBuffer(draw_command); + status = PopulateIndexBuffer(); CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to setup index buffer"); - status = PopulateVertexBuffers(draw_command); + status = PopulateVertexBuffers(); CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to setup vertex buffers"); - GLenum prim_type = 0; - switch (cmd.prim_type) { - case PrimitiveType::kPointList: - prim_type = GL_POINTS; - break; - case PrimitiveType::kLineList: - prim_type = GL_LINES; - break; - case PrimitiveType::kLineStrip: - prim_type = GL_LINE_STRIP; - break; - case PrimitiveType::kLineLoop: - prim_type = GL_LINE_LOOP; - break; - case PrimitiveType::kTriangleList: - prim_type = GL_TRIANGLES; - break; - case PrimitiveType::kTriangleStrip: - prim_type = GL_TRIANGLE_STRIP; - break; - case PrimitiveType::kTriangleFan: - prim_type = GL_TRIANGLE_FAN; - break; - case PrimitiveType::kRectangleList: - prim_type = GL_TRIANGLE_STRIP; - break; - case PrimitiveType::kQuadList: - prim_type = GL_LINES_ADJACENCY; - break; - default: - case PrimitiveType::kUnknown0x07: - prim_type = GL_POINTS; - XELOGE("unsupported primitive type %d", cmd.prim_type); - assert_unhandled_case(cmd.prim_type); - return false; - } - - // Commit the state buffer - nothing can change after this. 
- glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, scratch_buffer_.handle(), - allocation.offset, allocation.length); - scratch_buffer_.Commit(std::move(allocation)); - scratch_buffer_.Flush(); - - if (cmd.index_buffer.address) { - // Indexed draw. - // PopulateIndexBuffer has our element array setup. - size_t element_size = cmd.index_buffer.format == IndexFormat::kInt32 - ? sizeof(uint32_t) - : sizeof(uint16_t); - glDrawElementsBaseVertex( - prim_type, cmd.index_count, - cmd.index_buffer.format == IndexFormat::kInt32 ? GL_UNSIGNED_INT - : GL_UNSIGNED_SHORT, - reinterpret_cast(cmd.index_buffer.buffer_offset + - cmd.start_index * element_size), - cmd.base_vertex); - } else { - // Auto draw. - glDrawArrays(prim_type, cmd.start_index, cmd.index_count); - } - - return true; + return draw_batcher_.CommitDraw(); } bool CommandProcessor::SetShadowRegister(uint32_t& dest, @@ -1487,8 +1395,129 @@ bool CommandProcessor::SetShadowRegister(float& dest, uint32_t register_name) { return true; } -CommandProcessor::UpdateStatus CommandProcessor::UpdateRenderTargets( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::UpdateShaders( + PrimitiveType prim_type) { + auto& regs = update_shaders_regs_; + + bool dirty = false; + dirty |= SetShadowRegister(regs.sq_program_cntl, XE_GPU_REG_SQ_PROGRAM_CNTL); + dirty |= regs.vertex_shader != active_vertex_shader_; + dirty |= regs.pixel_shader != active_pixel_shader_; + dirty |= regs.prim_type != prim_type; + if (!dirty) { + return UpdateStatus::kCompatible; + } + regs.vertex_shader = active_vertex_shader_; + regs.pixel_shader = active_pixel_shader_; + regs.prim_type = prim_type; + + SCOPE_profile_cpu_f("gpu"); + + draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange); + + xe_gpu_program_cntl_t program_cntl; + program_cntl.dword_0 = regs.sq_program_cntl; + if (!active_vertex_shader_->has_prepared()) { + if (!active_vertex_shader_->PrepareVertexShader(program_cntl)) { + XELOGE("Unable to prepare vertex shader"); 
+ return UpdateStatus::kError; + } + } else if (!active_vertex_shader_->is_valid()) { + XELOGE("Vertex shader invalid"); + return UpdateStatus::kError; + } + + if (!active_pixel_shader_->has_prepared()) { + if (!active_pixel_shader_->PreparePixelShader(program_cntl)) { + XELOGE("Unable to prepare pixel shader"); + return UpdateStatus::kError; + } + } else if (!active_pixel_shader_->is_valid()) { + XELOGE("Pixel shader invalid"); + return UpdateStatus::kError; + } + + GLuint vertex_program = active_vertex_shader_->program(); + GLuint fragment_program = active_pixel_shader_->program(); + + uint64_t key = (uint64_t(vertex_program) << 32) | fragment_program; + CachedPipeline* cached_pipeline = nullptr; + auto it = cached_pipelines_.find(key); + if (it == cached_pipelines_.end()) { + // Existing pipeline for these programs not found - create it. + auto new_pipeline = std::make_unique(); + new_pipeline->vertex_program = vertex_program; + new_pipeline->fragment_program = fragment_program; + new_pipeline->handles.default_pipeline = 0; + cached_pipeline = new_pipeline.get(); + all_pipelines_.emplace_back(std::move(new_pipeline)); + cached_pipelines_.insert({key, cached_pipeline}); + } else { + // Found a pipeline container - it may or may not have what we want. + cached_pipeline = it->second; + } + if (!cached_pipeline->handles.default_pipeline) { + // Perhaps it's a bit wasteful to do all of these, but oh well. 
+ GLuint pipelines[4]; + glCreateProgramPipelines(GLsizei(poly::countof(pipelines)), pipelines); + + glUseProgramStages(pipelines[0], GL_VERTEX_SHADER_BIT, vertex_program); + glUseProgramStages(pipelines[0], GL_FRAGMENT_SHADER_BIT, fragment_program); + cached_pipeline->handles.default_pipeline = pipelines[0]; + + glUseProgramStages(pipelines[1], GL_VERTEX_SHADER_BIT, vertex_program); + glUseProgramStages(pipelines[1], GL_GEOMETRY_SHADER_BIT, + point_list_geometry_program_); + glUseProgramStages(pipelines[1], GL_FRAGMENT_SHADER_BIT, fragment_program); + cached_pipeline->handles.point_list_pipeline = pipelines[1]; + + glUseProgramStages(pipelines[2], GL_VERTEX_SHADER_BIT, vertex_program); + glUseProgramStages(pipelines[2], GL_GEOMETRY_SHADER_BIT, + rect_list_geometry_program_); + glUseProgramStages(pipelines[2], GL_FRAGMENT_SHADER_BIT, fragment_program); + cached_pipeline->handles.rect_list_pipeline = pipelines[2]; + + glUseProgramStages(pipelines[3], GL_VERTEX_SHADER_BIT, vertex_program); + glUseProgramStages(pipelines[3], GL_GEOMETRY_SHADER_BIT, + quad_list_geometry_program_); + glUseProgramStages(pipelines[3], GL_FRAGMENT_SHADER_BIT, fragment_program); + cached_pipeline->handles.quad_list_pipeline = pipelines[3]; + + // This can be set once, as the buffer never changes. 
+ if (has_bindless_vbos_) { + glBindVertexArray(active_vertex_shader_->vao()); + glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, + scratch_buffer_.gpu_handle(), + scratch_buffer_.capacity()); + } else { + glVertexArrayElementBuffer(active_vertex_shader_->vao(), + scratch_buffer_.handle()); + } + } + + GLuint pipeline = cached_pipeline->handles.default_pipeline; + switch (regs.prim_type) { + case PrimitiveType::kPointList: + pipeline = cached_pipeline->handles.point_list_pipeline; + break; + case PrimitiveType::kRectangleList: + pipeline = cached_pipeline->handles.rect_list_pipeline; + break; + case PrimitiveType::kQuadList: + pipeline = cached_pipeline->handles.quad_list_pipeline; + break; + } + + draw_batcher_.ReconfigurePipeline(active_vertex_shader_, active_pixel_shader_, + pipeline); + + glBindProgramPipeline(pipeline); + glBindVertexArray(active_vertex_shader_->vao()); + + return UpdateStatus::kMismatch; +} + +CommandProcessor::UpdateStatus CommandProcessor::UpdateRenderTargets() { auto& regs = update_render_targets_regs_; bool dirty = false; @@ -1509,6 +1538,8 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateRenderTargets( SCOPE_profile_cpu_f("gpu"); + draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange); + auto enable_mode = static_cast(regs.rb_modecontrol & 0x7); // RB_SURFACE_INFO @@ -1586,10 +1617,8 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateRenderTargets( return UpdateStatus::kMismatch; } -CommandProcessor::UpdateStatus CommandProcessor::UpdateState( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::UpdateState() { auto& regs = *register_file_; - auto state_data = draw_command->state_data; bool mismatch = false; @@ -1597,10 +1626,9 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateState( // Deprecated in GL, implemented in shader. 
// if(ALPHATESTENABLE && frag_out.a [<=/ALPHAFUNC] ALPHAREF) discard; uint32_t color_control = regs[XE_GPU_REG_RB_COLORCONTROL].u32; - state_data->alpha_test.x = - (color_control & 0x4) ? 1.0f : 0.0f; // ALPAHTESTENABLE - state_data->alpha_test.y = float(color_control & 0x3); // ALPHAFUNC - state_data->alpha_test.z = regs[XE_GPU_REG_RB_ALPHA_REF].f32; + draw_batcher_.set_alpha_test((color_control & 0x4) != 0, // ALPHATESTENABLE + color_control & 0x3, // ALPHAFUNC + regs[XE_GPU_REG_RB_ALPHA_REF].f32); #define CHECK_UPDATE_STATUS(status, mismatch, error_message) \ { \ @@ -1613,22 +1641,20 @@ } UpdateStatus status; - status = UpdateViewportState(draw_command); + status = UpdateViewportState(); CHECK_UPDATE_STATUS(status, mismatch, "Unable to update viewport state"); - status = UpdateRasterizerState(draw_command); + status = UpdateRasterizerState(); CHECK_UPDATE_STATUS(status, mismatch, "Unable to update rasterizer state"); - status = UpdateBlendState(draw_command); + status = UpdateBlendState(); CHECK_UPDATE_STATUS(status, mismatch, "Unable to update blend state"); - status = UpdateDepthStencilState(draw_command); + status = UpdateDepthStencilState(); CHECK_UPDATE_STATUS(status, mismatch, "Unable to update depth/stencil state"); return mismatch ? UpdateStatus::kMismatch : UpdateStatus::kCompatible; } -CommandProcessor::UpdateStatus CommandProcessor::UpdateViewportState( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::UpdateViewportState() { auto& regs = *register_file_; - auto state_data = draw_command->state_data; // NOTE: we don't track state here as this is all cheap to update (ish). 
@@ -1644,18 +1670,16 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateViewportState( // https://github.com/freedreno/mesa/blob/master/src/mesa/drivers/dri/r200/r200_state.c if ((mode_control >> 17) & 1) { uint32_t window_offset = regs[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32; - state_data->window_offset.x = float(window_offset & 0x7FFF); - state_data->window_offset.y = float((window_offset >> 16) & 0x7FFF); + draw_batcher_.set_window_offset(window_offset & 0x7FFF, + (window_offset >> 16) & 0x7FFF); } else { - state_data->window_offset.x = 0; - state_data->window_offset.y = 0; + draw_batcher_.set_window_offset(0, 0); } uint32_t window_scissor_tl = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; uint32_t window_scissor_br = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; - state_data->window_scissor.x = float(window_scissor_tl & 0x7FFF); - state_data->window_scissor.y = float((window_scissor_tl >> 16) & 0x7FFF); - state_data->window_scissor.z = float(window_scissor_br & 0x7FFF); - state_data->window_scissor.w = float((window_scissor_br >> 16) & 0x7FFF); + draw_batcher_.set_window_scissor( + window_scissor_tl & 0x7FFF, (window_scissor_tl >> 16) & 0x7FFF, + window_scissor_br & 0x7FFF, (window_scissor_br >> 16) & 0x7FFF); // HACK: no clue where to get these values. // RB_SURFACE_INFO @@ -1676,8 +1700,7 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateViewportState( window_height_scalar = 2; break; } - state_data->window_offset.z = window_width_scalar; - state_data->window_offset.w = window_height_scalar; + draw_batcher_.set_window_scalar(window_width_scalar, window_height_scalar); // Whether each of the viewport settings is enabled. // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf @@ -1693,33 +1716,25 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateViewportState( vport_yoffset_enable == vport_zoffset_enable); // Viewport scaling. Only enabled if the flags are all set. - state_data->viewport_scale.x = - vport_xscale_enable ? 
regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 : 1; // 640 - state_data->viewport_offset.x = vport_xoffset_enable - ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 - : 0; // 640 - state_data->viewport_scale.y = vport_yscale_enable - ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 - : 1; // -360 - state_data->viewport_offset.y = vport_yoffset_enable - ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 - : 0; // 360 - state_data->viewport_scale.z = - vport_zscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 : 1; // 1 - state_data->viewport_offset.z = - vport_zoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 : 0; // 0 + draw_batcher_.set_viewport_offset( + vport_xoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 : 0, + vport_yoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 : 0, + vport_zoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 : 0); + draw_batcher_.set_viewport_scale( + vport_xscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 : 1, + vport_yscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 : 1, + vport_zscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 : 1); // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf // VTX_XY_FMT = true: the incoming X, Y have already been multiplied by 1/W0. // = false: multiply the X, Y coordinates by 1/W0. - state_data->vtx_fmt.x = state_data->vtx_fmt.y = - (vte_control >> 8) & 0x1 ? 1.0f : 0.0f; // VTX_Z_FMT = true: the incoming Z has already been multiplied by 1/W0. // = false: multiply the Z coordinate by 1/W0. - state_data->vtx_fmt.z = (vte_control >> 9) & 0x1 ? 1.0f : 0.0f; // VTX_W0_FMT = true: the incoming W0 is not 1/W0. Perform the reciprocal to // get 1/W0. - state_data->vtx_fmt.w = (vte_control >> 10) & 0x1 ? 1.0f : 0.0f; + draw_batcher_.set_vtx_fmt((vte_control >> 8) & 0x1 ? 1.0f : 0.0f, + (vte_control >> 9) & 0x1 ? 1.0f : 0.0f, + (vte_control >> 10) & 0x1 ? 1.0f : 0.0f); // Clipping. 
// https://github.com/freedreno/amd-gpu/blob/master/include/reg/yamato/14/yamato_genenum.h#L1587 @@ -1732,8 +1747,7 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateViewportState( return UpdateStatus::kCompatible; } -CommandProcessor::UpdateStatus CommandProcessor::UpdateRasterizerState( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::UpdateRasterizerState() { auto& regs = update_rasterizer_state_regs_; bool dirty = false; @@ -1749,6 +1763,8 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateRasterizerState( SCOPE_profile_cpu_f("gpu"); + draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange); + // Scissoring. if (regs.pa_sc_screen_scissor_tl != 0 && regs.pa_sc_screen_scissor_br != 0x20002000) { @@ -1766,10 +1782,6 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateRasterizerState( glDisable(GL_SCISSOR_TEST); } - // Rect lists aren't culled. There may be other things they skip too. - assert_true((regs.pa_su_sc_mode_cntl & 0x3) == 0 || - draw_command->prim_type != PrimitiveType::kRectangleList); - switch (regs.pa_su_sc_mode_cntl & 0x3) { case 0: glDisable(GL_CULL_FACE); @@ -1784,6 +1796,12 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateRasterizerState( break; } + if (regs.pa_su_sc_mode_cntl & (1 << 20)) { + glProvokingVertex(GL_LAST_VERTEX_CONVENTION); + } else { + glProvokingVertex(GL_FIRST_VERTEX_CONVENTION); + } + if (regs.pa_su_sc_mode_cntl & 0x4) { glFrontFace(GL_CW); } else { @@ -1797,8 +1815,7 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateRasterizerState( return UpdateStatus::kMismatch; } -CommandProcessor::UpdateStatus CommandProcessor::UpdateBlendState( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::UpdateBlendState() { auto& regs = update_blend_state_regs_; bool dirty = false; @@ -1820,6 +1837,8 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateBlendState( SCOPE_profile_cpu_f("gpu"); + 
draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange); + static const GLenum blend_map[] = { /* 0 */ GL_ZERO, /* 1 */ GL_ONE, @@ -1882,8 +1901,7 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateBlendState( return UpdateStatus::kMismatch; } -CommandProcessor::UpdateStatus CommandProcessor::UpdateDepthStencilState( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::UpdateDepthStencilState() { auto& regs = update_depth_stencil_state_regs_; bool dirty = false; @@ -1896,6 +1914,8 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateDepthStencilState( SCOPE_profile_cpu_f("gpu"); + draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange); + static const GLenum compare_func_map[] = { /* 0 */ GL_NEVER, /* 1 */ GL_LESS, @@ -1977,192 +1997,72 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateDepthStencilState( return UpdateStatus::kMismatch; } -CommandProcessor::UpdateStatus CommandProcessor::UpdateConstants( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::PopulateIndexBuffer() { auto& regs = *register_file_; - auto state_data = draw_command->state_data; - - // TODO(benvanik): partial updates, etc. We could use shader constant access - // knowledge that we get at compile time to only upload those constants - // required. If we did this as a variable length then we could really cut - // down on state block sizes. - - // Copy over all constants. 
- std::memcpy(&state_data->float_consts, - ®s[XE_GPU_REG_SHADER_CONSTANT_000_X].f32, - sizeof(state_data->float_consts)); - std::memcpy( - &state_data->bool_consts, - ®s[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].f32, - sizeof(state_data->bool_consts) + sizeof(state_data->loop_consts)); - - return UpdateStatus::kCompatible; -} - -CommandProcessor::UpdateStatus CommandProcessor::UpdateShaders( - DrawCommand* draw_command) { - auto& regs = update_shaders_regs_; - auto& cmd = *draw_command; - - bool dirty = false; - dirty |= SetShadowRegister(regs.sq_program_cntl, XE_GPU_REG_SQ_PROGRAM_CNTL); - dirty |= regs.vertex_shader != active_vertex_shader_; - dirty |= regs.pixel_shader != active_pixel_shader_; - dirty |= regs.prim_type != cmd.prim_type; - if (!dirty) { + auto& info = index_buffer_info_; + if (!info.guest_base) { + // No index buffer or auto draw. return UpdateStatus::kCompatible; } - regs.vertex_shader = active_vertex_shader_; - regs.pixel_shader = active_pixel_shader_; - regs.prim_type = cmd.prim_type; +#if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES - xe_gpu_program_cntl_t program_cntl; - program_cntl.dword_0 = regs.sq_program_cntl; - if (!active_vertex_shader_->has_prepared()) { - if (!active_vertex_shader_->PrepareVertexShader(program_cntl)) { - XELOGE("Unable to prepare vertex shader"); - return UpdateStatus::kError; - } - } else if (!active_vertex_shader_->is_valid()) { - XELOGE("Vertex shader invalid"); - return UpdateStatus::kError; - } - - if (!active_pixel_shader_->has_prepared()) { - if (!active_pixel_shader_->PreparePixelShader(program_cntl)) { - XELOGE("Unable to prepare pixel shader"); - return UpdateStatus::kError; - } - } else if (!active_pixel_shader_->is_valid()) { - XELOGE("Pixel shader invalid"); - return UpdateStatus::kError; - } - - GLuint vertex_program = active_vertex_shader_->program(); - GLuint fragment_program = active_pixel_shader_->program(); - - uint64_t key = (uint64_t(vertex_program) << 
32) | fragment_program; - CachedPipeline* cached_pipeline = nullptr; - auto it = cached_pipelines_.find(key); - if (it == cached_pipelines_.end()) { - // Existing pipeline for these programs not found - create it. - auto new_pipeline = std::make_unique(); - new_pipeline->vertex_program = vertex_program; - new_pipeline->fragment_program = fragment_program; - new_pipeline->handles.default_pipeline = 0; - cached_pipeline = new_pipeline.get(); - all_pipelines_.emplace_back(std::move(new_pipeline)); - cached_pipelines_.insert({key, cached_pipeline}); - } else { - // Found a pipeline container - it may or may not have what we want. - cached_pipeline = it->second; - } - if (!cached_pipeline->handles.default_pipeline) { - // Perhaps it's a bit wasteful to do all of these, but oh well. - GLuint pipelines[4]; - glCreateProgramPipelines(GLsizei(poly::countof(pipelines)), pipelines); - - glUseProgramStages(pipelines[0], GL_VERTEX_SHADER_BIT, vertex_program); - glUseProgramStages(pipelines[0], GL_FRAGMENT_SHADER_BIT, fragment_program); - cached_pipeline->handles.default_pipeline = pipelines[0]; - - glUseProgramStages(pipelines[1], GL_VERTEX_SHADER_BIT, vertex_program); - glUseProgramStages(pipelines[1], GL_GEOMETRY_SHADER_BIT, - point_list_geometry_program_); - glUseProgramStages(pipelines[1], GL_FRAGMENT_SHADER_BIT, fragment_program); - cached_pipeline->handles.point_list_pipeline = pipelines[1]; - - glUseProgramStages(pipelines[2], GL_VERTEX_SHADER_BIT, vertex_program); - glUseProgramStages(pipelines[2], GL_GEOMETRY_SHADER_BIT, - rect_list_geometry_program_); - glUseProgramStages(pipelines[2], GL_FRAGMENT_SHADER_BIT, fragment_program); - cached_pipeline->handles.rect_list_pipeline = pipelines[2]; - - glUseProgramStages(pipelines[3], GL_VERTEX_SHADER_BIT, vertex_program); - glUseProgramStages(pipelines[3], GL_GEOMETRY_SHADER_BIT, - quad_list_geometry_program_); - glUseProgramStages(pipelines[3], GL_FRAGMENT_SHADER_BIT, fragment_program); - 
cached_pipeline->handles.quad_list_pipeline = pipelines[3]; - } - - GLuint pipeline = cached_pipeline->handles.default_pipeline; - switch (regs.prim_type) { - case PrimitiveType::kPointList: - pipeline = cached_pipeline->handles.point_list_pipeline; - break; - case PrimitiveType::kRectangleList: - pipeline = cached_pipeline->handles.rect_list_pipeline; - break; - case PrimitiveType::kQuadList: - pipeline = cached_pipeline->handles.quad_list_pipeline; - break; - } - glBindProgramPipeline(pipeline); - - return UpdateStatus::kMismatch; -} - -CommandProcessor::UpdateStatus CommandProcessor::PopulateIndexBuffer( - DrawCommand* draw_command) { - auto& cmd = *draw_command; - - auto& info = cmd.index_buffer; - if (!cmd.index_count || !info.address) { - // No index buffer or auto draw. - return UpdateStatus::kMismatch; // ? - } - - SCOPE_profile_cpu_f("gpu"); + // Min/max index ranges. This is often [0,FFFF|FFFFFF], but if it's not we + can use it to do a glDrawRangeElements. + uint32_t min_index = regs[XE_GPU_REG_VGT_MIN_VTX_INDX].u32; + uint32_t max_index = regs[XE_GPU_REG_VGT_MAX_VTX_INDX].u32; + assert_true(min_index == 0); + assert_true(max_index == 0xFFFF || max_index == 0xFFFFFF); assert_true(info.endianness == Endian::k8in16 || info.endianness == Endian::k8in32); size_t total_size = - cmd.index_count * (info.format == IndexFormat::kInt32 ? sizeof(uint32_t) - : sizeof(uint16_t)); + info.count * (info.format == IndexFormat::kInt32 ? 
sizeof(uint32_t) + : sizeof(uint16_t)); auto allocation = scratch_buffer_.Acquire(total_size); - scratch_buffer_stats_.total_indices_size += total_size; if (info.format == IndexFormat::kInt32) { - poly::copy_and_swap_32_aligned( - reinterpret_cast(allocation.host_ptr), - reinterpret_cast(cmd.index_buffer.address), - cmd.index_count); + auto dest = reinterpret_cast(allocation.host_ptr); + auto src = reinterpret_cast(membase_ + info.guest_base); + uint32_t max_index_found; + poly::copy_and_swap_32_aligned(dest, src, info.count, &max_index_found); + index_buffer_info_.max_index_found = max_index_found; } else { - poly::copy_and_swap_16_aligned( - reinterpret_cast(allocation.host_ptr), - reinterpret_cast(cmd.index_buffer.address), - cmd.index_count); + auto dest = reinterpret_cast(allocation.host_ptr); + auto src = reinterpret_cast(membase_ + info.guest_base); + uint16_t max_index_found; + poly::copy_and_swap_16_aligned(dest, src, info.count, &max_index_found); + index_buffer_info_.max_index_found = max_index_found; } - if (has_bindless_vbos_) { - glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, allocation.gpu_ptr, - allocation.length); - } else { - // Offset is used in glDrawElements. 
- cmd.index_buffer.buffer_offset = allocation.offset; - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, scratch_buffer_.handle()); - } + draw_batcher_.set_index_buffer(allocation); + scratch_buffer_.Commit(std::move(allocation)); - return UpdateStatus::kMismatch; + return UpdateStatus::kCompatible; } -CommandProcessor::UpdateStatus CommandProcessor::PopulateVertexBuffers( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::PopulateVertexBuffers() { +#if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + auto& regs = *register_file_; - auto& cmd = *draw_command; assert_not_null(active_vertex_shader_); - const auto& buffer_inputs = active_vertex_shader_->buffer_inputs(); + if (!has_bindless_vbos_) { + // TODO(benvanik): find a way to get around glVertexArrayVertexBuffer below. + draw_batcher_.Flush(DrawBatcher::FlushMode::kMakeCoherent); + } uint32_t el_index = 0; - for (uint32_t n = 0; n < buffer_inputs.count; n++) { - const auto& desc = buffer_inputs.descs[n]; - + const auto& buffer_inputs = active_vertex_shader_->buffer_inputs(); + for (uint32_t buffer_index = 0; buffer_index < buffer_inputs.count; + ++buffer_index) { + const auto& desc = buffer_inputs.descs[buffer_index]; int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (desc.fetch_slot / 3) * 6; auto group = reinterpret_cast(®s.values[r]); xe_gpu_vertex_fetch_t* fetch = nullptr; @@ -2177,14 +2077,16 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateVertexBuffers( fetch = &group->vertex_fetch_2; break; } - assert_not_null(fetch); - assert_true(fetch->type == 0x3); // must be of type vertex - // TODO(benvanik): some games have type 2, which is texture - maybe - // fetch_slot wrong? - assert_not_zero(fetch->size); - auto allocation = scratch_buffer_.Acquire(fetch->size * sizeof(uint32_t)); - scratch_buffer_stats_.total_vertices_size += fetch->size * sizeof(uint32_t); + // Constrain the vertex upload to just what we are interested in. 
+ const size_t kRangeKludge = 5; // could pick index count based on prim. + uint32_t max_index = index_buffer_info_.guest_base + ? index_buffer_info_.max_index_found + : draw_index_count_; + size_t valid_range = (max_index + kRangeKludge) * desc.stride_words * 4; + valid_range = std::min(valid_range, size_t(fetch->size * 4)); + + auto allocation = scratch_buffer_.Acquire(valid_range); // Copy and byte swap the entire buffer. // We could be smart about this to save GPU bandwidth by building a CRC @@ -2193,93 +2095,35 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateVertexBuffers( poly::copy_and_swap_32_aligned( reinterpret_cast(allocation.host_ptr), reinterpret_cast(membase_ + (fetch->address << 2)), - fetch->size); + valid_range / 4); if (!has_bindless_vbos_) { - glBindVertexBuffer(n, scratch_buffer_.handle(), allocation.offset, - desc.stride_words * 4); + // TODO(benvanik): if we could find a way to avoid this, we could use + // multidraw without flushing. + glVertexArrayVertexBuffer(active_vertex_shader_->vao(), buffer_index, + scratch_buffer_.handle(), allocation.offset, + desc.stride_words * 4); } - for (uint32_t i = 0; i < desc.element_count; ++i) { - const auto& el = desc.elements[i]; - auto comp_count = GetVertexFormatComponentCount(el.format); - GLenum comp_type; - switch (el.format) { - case VertexFormat::k_8_8_8_8: - comp_type = el.is_signed ? GL_BYTE : GL_UNSIGNED_BYTE; - break; - case VertexFormat::k_2_10_10_10: - comp_type = el.is_signed ? GL_INT_2_10_10_10_REV - : GL_UNSIGNED_INT_2_10_10_10_REV; - break; - case VertexFormat::k_10_11_11: - assert_false(el.is_signed); - comp_type = GL_UNSIGNED_INT_10F_11F_11F_REV; - break; - /*case VertexFormat::k_11_11_10: - break;*/ - case VertexFormat::k_16_16: - comp_type = el.is_signed ? GL_SHORT : GL_UNSIGNED_SHORT; - break; - case VertexFormat::k_16_16_FLOAT: - comp_type = GL_HALF_FLOAT; - break; - case VertexFormat::k_16_16_16_16: - comp_type = el.is_signed ? 
GL_SHORT : GL_UNSIGNED_SHORT; - break; - case VertexFormat::k_16_16_16_16_FLOAT: - comp_type = GL_HALF_FLOAT; - break; - case VertexFormat::k_32: - comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; - break; - case VertexFormat::k_32_32: - comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; - break; - case VertexFormat::k_32_32_32_32: - comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; - break; - case VertexFormat::k_32_FLOAT: - comp_type = GL_FLOAT; - break; - case VertexFormat::k_32_32_FLOAT: - comp_type = GL_FLOAT; - break; - case VertexFormat::k_32_32_32_FLOAT: - comp_type = GL_FLOAT; - break; - case VertexFormat::k_32_32_32_32_FLOAT: - comp_type = GL_FLOAT; - break; - default: - assert_unhandled_case(el.format); - break; + if (has_bindless_vbos_) { + for (uint32_t i = 0; i < desc.element_count; ++i, ++el_index) { + const auto& el = desc.elements[i]; + draw_batcher_.set_vertex_buffer(el_index, 0, desc.stride_words * 4, + allocation); } - if (has_bindless_vbos_) { - glVertexAttribFormatNV(el_index, comp_count, comp_type, - el.is_normalized, - desc.stride_words * sizeof(uint32_t)); - glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, el_index, - allocation.gpu_ptr + (el.offset_words * 4), - allocation.length - (el.offset_words * 4)); - } else { - glVertexAttribBinding(el_index, n); - glVertexAttribFormat(el_index, comp_count, comp_type, el.is_normalized, - el.offset_words * 4); - } - ++el_index; } - // Flush buffer before we draw. 
scratch_buffer_.Commit(std::move(allocation)); } - return UpdateStatus::kMismatch; + return UpdateStatus::kCompatible; } -CommandProcessor::UpdateStatus CommandProcessor::PopulateSamplers( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::PopulateSamplers() { +#if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + auto& regs = *register_file_; bool mismatch = false; @@ -2296,7 +2140,7 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateSamplers( continue; } has_setup_sampler[desc.fetch_slot] = true; - auto status = PopulateSampler(draw_command, desc); + auto status = PopulateSampler(desc); if (status == UpdateStatus::kError) { return status; } else if (status == UpdateStatus::kMismatch) { @@ -2312,7 +2156,7 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateSamplers( continue; } has_setup_sampler[desc.fetch_slot] = true; - auto status = PopulateSampler(draw_command, desc); + auto status = PopulateSampler(desc); if (status == UpdateStatus::kError) { return UpdateStatus::kError; } else if (status == UpdateStatus::kMismatch) { @@ -2324,7 +2168,7 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateSamplers( } CommandProcessor::UpdateStatus CommandProcessor::PopulateSampler( - DrawCommand* draw_command, const Shader::SamplerDesc& desc) { + const Shader::SamplerDesc& desc) { auto& regs = *register_file_; int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + desc.fetch_slot * 6; auto group = reinterpret_cast(®s.values[r]); @@ -2332,7 +2176,7 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateSampler( // Reset slot. // If we fail, we still draw but with an invalid texture. 
- draw_command->state_data->texture_samplers[desc.fetch_slot] = 0; + draw_batcher_.set_texture_sampler(desc.fetch_slot, 0); if (FLAGS_disable_textures) { return UpdateStatus::kCompatible; @@ -2363,13 +2207,13 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateSampler( } // Shaders will use bindless to fetch right from it. - draw_command->state_data->texture_samplers[desc.fetch_slot] = - entry_view->texture_sampler_handle; + draw_batcher_.set_texture_sampler(desc.fetch_slot, + entry_view->texture_sampler_handle); return UpdateStatus::kCompatible; } -bool CommandProcessor::IssueCopy(DrawCommand* draw_command) { +bool CommandProcessor::IssueCopy() { SCOPE_profile_cpu_f("gpu"); auto& regs = *register_file_; diff --git a/src/xenia/gpu/gl4/command_processor.h b/src/xenia/gpu/gl4/command_processor.h index d1cdcd141..481bb98ff 100644 --- a/src/xenia/gpu/gl4/command_processor.h +++ b/src/xenia/gpu/gl4/command_processor.h @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -41,73 +42,6 @@ struct SwapParameters { GLenum attachment; }; -// This must match the layout in gl4_shader.cc. -struct UniformDataBlock { - union float4 { - float v[4]; - struct { - float x, y, z, w; - }; - }; - - float4 window_offset; // tx,ty,sx,sy - float4 window_scissor; // x0,y0,x1,y1 - float4 vtx_fmt; - float4 viewport_offset; // tx,ty,tz,? - float4 viewport_scale; // sx,sy,sz,? - // TODO(benvanik): vertex format xyzw? - - float4 alpha_test; // alpha test enable, func, ref, ? - - // TODO(benvanik): pack tightly - uint64_t texture_samplers[32]; - - // Register data from 0x4000 to 0x4927. - // UpdateConstants relies on the packing of these. - struct { - // SHADER_CONSTANT_000_X... - float4 float_consts[512]; - // SHADER_CONSTANT_FETCH_00_0 is omitted - // SHADER_CONSTANT_BOOL_000_031... - int32_t bool_consts[8]; - // SHADER_CONSTANT_LOOP_00... 
- int32_t loop_consts[32]; - }; -}; -static_assert(sizeof(UniformDataBlock) <= 16 * 1024, "Need <=16k uniform data"); - -// TODO(benvanik): move more of the enums in here? -struct DrawCommand { - PrimitiveType prim_type; - uint32_t start_index; - uint32_t min_index; - uint32_t max_index; - uint32_t index_count; - uint32_t base_vertex; - - // Index buffer, if present. - // If index_count > 0 but buffer is nullptr then auto draw. - struct { - const uint8_t* address; - size_t size; - xenos::Endian endianness; - xenos::IndexFormat format; - size_t buffer_offset; - } index_buffer; - - // Texture samplers. - struct SamplerInput { - uint32_t input_index; - // TextureResource* texture; - // SamplerStateResource* sampler_state; - }; - SamplerInput vertex_shader_samplers[32]; - SamplerInput pixel_shader_samplers[32]; - - // NOTE: do not read from this - the mapped memory is likely write combined. - UniformDataBlock* state_data; -}; - class CommandProcessor { public: CommandProcessor(GL4GraphicsSystem* graphics_system); @@ -241,22 +175,19 @@ class CommandProcessor { bool LoadShader(ShaderType shader_type, const uint32_t* address, uint32_t dword_count); - void PrepareDraw(DrawCommand* draw_command); - bool IssueDraw(DrawCommand* draw_command); - UpdateStatus UpdateRenderTargets(DrawCommand* draw_command); - UpdateStatus UpdateState(DrawCommand* draw_command); - UpdateStatus UpdateViewportState(DrawCommand* draw_command); - UpdateStatus UpdateRasterizerState(DrawCommand* draw_command); - UpdateStatus UpdateBlendState(DrawCommand* draw_command); - UpdateStatus UpdateDepthStencilState(DrawCommand* draw_command); - UpdateStatus UpdateConstants(DrawCommand* draw_command); - UpdateStatus UpdateShaders(DrawCommand* draw_command); - UpdateStatus PopulateIndexBuffer(DrawCommand* draw_command); - UpdateStatus PopulateVertexBuffers(DrawCommand* draw_command); - UpdateStatus PopulateSamplers(DrawCommand* draw_command); - UpdateStatus PopulateSampler(DrawCommand* draw_command, - const 
Shader::SamplerDesc& desc); - bool IssueCopy(DrawCommand* draw_command); + bool IssueDraw(); + UpdateStatus UpdateShaders(PrimitiveType prim_type); + UpdateStatus UpdateRenderTargets(); + UpdateStatus UpdateState(); + UpdateStatus UpdateViewportState(); + UpdateStatus UpdateRasterizerState(); + UpdateStatus UpdateBlendState(); + UpdateStatus UpdateDepthStencilState(); + UpdateStatus PopulateIndexBuffer(); + UpdateStatus PopulateVertexBuffers(); + UpdateStatus PopulateSamplers(); + UpdateStatus PopulateSampler(const Shader::SamplerDesc& desc); + bool IssueCopy(); CachedFramebuffer* GetFramebuffer(GLuint color_targets[4], GLuint depth_target); @@ -306,21 +237,23 @@ class CommandProcessor { std::vector cached_depth_render_targets_; std::vector> all_pipelines_; std::unordered_map cached_pipelines_; - GLuint vertex_array_; GLuint point_list_geometry_program_; GLuint rect_list_geometry_program_; GLuint quad_list_geometry_program_; + struct { + xenos::IndexFormat format; + xenos::Endian endianness; + uint32_t count; + uint32_t guest_base; + size_t length; + uint32_t max_index_found; + } index_buffer_info_; + uint32_t draw_index_count_; TextureCache texture_cache_; + DrawBatcher draw_batcher_; CircularBuffer scratch_buffer_; - struct ScratchBufferStats { - size_t total_state_data_size = 0; - size_t total_indices_size = 0; - size_t total_vertices_size = 0; - } scratch_buffer_stats_; - - DrawCommand draw_command_; private: bool SetShadowRegister(uint32_t& dest, uint32_t register_name); @@ -341,7 +274,6 @@ class CommandProcessor { void Reset() { std::memset(this, 0, sizeof(*this)); } } update_render_targets_regs_; struct UpdateViewportStateRegisters { - // UpdateViewportStateRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } } update_viewport_state_regs_; @@ -367,7 +299,6 @@ class CommandProcessor { UpdateDepthStencilStateRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } } update_depth_stencil_state_regs_; - // 
TODO(benvanik): constant bitmask? struct UpdateShadersRegisters { PrimitiveType prim_type; uint32_t sq_program_cntl; @@ -380,9 +311,6 @@ class CommandProcessor { vertex_shader = pixel_shader = nullptr; } } update_shaders_regs_; - // ib - // vb - // samplers }; } // namespace gl4 diff --git a/src/xenia/gpu/gl4/draw_batcher.cc b/src/xenia/gpu/gl4/draw_batcher.cc new file mode 100644 index 000000000..c5d0bdb39 --- /dev/null +++ b/src/xenia/gpu/gl4/draw_batcher.cc @@ -0,0 +1,384 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include +#include +#include + +namespace xe { +namespace gpu { +namespace gl4 { + +using namespace xe::gpu::xenos; + +extern "C" GLEWContext* glewGetContext(); + +const size_t kCommandBufferCapacity = 16 * (1024 * 1024); +const size_t kCommandBufferAlignment = 4; +const size_t kStateBufferCapacity = 64 * (1024 * 1024); +const size_t kStateBufferAlignment = 256; + +DrawBatcher::DrawBatcher(RegisterFile* register_file) + : register_file_(register_file), + command_buffer_(kCommandBufferCapacity, kCommandBufferAlignment), + state_buffer_(kStateBufferCapacity, kStateBufferAlignment), + array_data_buffer_(nullptr), + has_bindless_mdi_(false), + draw_open_(false) { + std::memset(&batch_state_, 0, sizeof(batch_state_)); + batch_state_.needs_reconfigure = true; + batch_state_.command_range_start = batch_state_.state_range_start = + UINTPTR_MAX; + std::memset(&active_draw_, 0, sizeof(active_draw_)); +} + +bool DrawBatcher::Initialize(CircularBuffer* array_data_buffer) { + array_data_buffer_ = array_data_buffer; + if (!command_buffer_.Initialize()) 
{ + return false; + } + if (!state_buffer_.Initialize()) { + return false; + } + glBindBuffer(GL_DRAW_INDIRECT_BUFFER, command_buffer_.handle()); + if (FLAGS_vendor_gl_extensions && GLEW_NV_bindless_multi_draw_indirect) { + has_bindless_mdi_ = true; + } + return true; +} + +void DrawBatcher::Shutdown() { + command_buffer_.Shutdown(); + state_buffer_.Shutdown(); +} + +bool DrawBatcher::ReconfigurePipeline(GL4Shader* vertex_shader, + GL4Shader* pixel_shader, + GLuint pipeline) { + if (batch_state_.pipeline == pipeline) { + // No-op. + return true; + } + if (!Flush(FlushMode::kReconfigure)) { + return false; + } + + batch_state_.vertex_shader = vertex_shader; + batch_state_.pixel_shader = pixel_shader; + batch_state_.pipeline = pipeline; + + return true; +} + +bool DrawBatcher::BeginDrawArrays(PrimitiveType prim_type, + uint32_t index_count) { + assert_false(draw_open_); + if (batch_state_.prim_type != prim_type || batch_state_.indexed) { + if (!Flush(FlushMode::kReconfigure)) { + return false; + } + } + batch_state_.prim_type = prim_type; + batch_state_.indexed = false; + + if (!BeginDraw()) { + return false; + } + + auto cmd = active_draw_.draw_arrays_cmd; + cmd->base_instance = 0; + cmd->instance_count = 1; + cmd->count = index_count; + cmd->first_index = 0; + + return true; +} + +bool DrawBatcher::BeginDrawElements(PrimitiveType prim_type, + uint32_t index_count, + IndexFormat index_format) { + assert_false(draw_open_); + GLenum index_type = + index_format == IndexFormat::kInt32 ? 
GL_UNSIGNED_INT : GL_UNSIGNED_SHORT; + if (batch_state_.prim_type != prim_type || !batch_state_.indexed || + batch_state_.index_type != index_type) { + if (!Flush(FlushMode::kReconfigure)) { + return false; + } + } + batch_state_.prim_type = prim_type; + batch_state_.indexed = true; + batch_state_.index_type = index_type; + + if (!BeginDraw()) { + return false; + } + + uint32_t start_index = register_file_->values[XE_GPU_REG_VGT_INDX_OFFSET].u32; + assert_zero(start_index); + + auto cmd = active_draw_.draw_elements_cmd; + cmd->base_instance = 0; + cmd->instance_count = 1; + cmd->count = index_count; + cmd->first_index = start_index; + cmd->base_vertex = 0; + + if (has_bindless_mdi_) { + auto bindless_cmd = active_draw_.draw_elements_bindless_cmd; + bindless_cmd->reserved_zero = 0; + } + return true; +} + +bool DrawBatcher::BeginDraw() { + draw_open_ = true; + + if (batch_state_.needs_reconfigure) { + batch_state_.needs_reconfigure = false; + // Have been reconfigured since last draw - need to compute state size. + // Layout: + // [draw command] + // [common header] + // [consts] + + // Padded to max. + GLsizei command_size = 0; + if (has_bindless_mdi_) { + if (batch_state_.indexed) { + command_size = sizeof(DrawElementsIndirectBindlessCommandNV); + } else { + command_size = sizeof(DrawArraysIndirectBindlessCommandNV); + } + } else { + if (batch_state_.indexed) { + command_size = sizeof(DrawElementsIndirectCommand); + } else { + command_size = sizeof(DrawArraysIndirectCommand); + } + } + batch_state_.command_stride = + poly::round_up(command_size, GLsizei(kCommandBufferAlignment)); + + GLsizei header_size = sizeof(CommonHeader); + + // TODO(benvanik); consts sizing. 
+ // GLsizei float_consts_size = sizeof(float4) * 512; + // GLsizei bool_consts_size = sizeof(uint32_t) * 8; + // GLsizei loop_consts_size = sizeof(uint32_t) * 32; + // GLsizei consts_size = + // float_consts_size + bool_consts_size + loop_consts_size; + // batch_state_.float_consts_offset = batch_state_.header_offset + + // header_size; + // batch_state_.bool_consts_offset = + // batch_state_.float_consts_offset + float_consts_size; + // batch_state_.loop_consts_offset = + // batch_state_.bool_consts_offset + bool_consts_size; + GLsizei consts_size = 0; + + batch_state_.state_stride = header_size + consts_size; + } + + // Allocate a command data block. + // We should treat it as write-only. + if (!command_buffer_.CanAcquire(batch_state_.command_stride)) { + Flush(FlushMode::kMakeCoherent); + } + active_draw_.command_allocation = + command_buffer_.Acquire(batch_state_.command_stride); + assert_not_null(active_draw_.command_allocation.host_ptr); + + // Allocate a state data block. + // We should treat it as write-only. + if (!state_buffer_.CanAcquire(batch_state_.state_stride)) { + Flush(FlushMode::kMakeCoherent); + } + active_draw_.state_allocation = + state_buffer_.Acquire(batch_state_.state_stride); + assert_not_null(active_draw_.state_allocation.host_ptr); + + active_draw_.command_address = + reinterpret_cast(active_draw_.command_allocation.host_ptr); + auto state_host_ptr = + reinterpret_cast(active_draw_.state_allocation.host_ptr); + active_draw_.header = reinterpret_cast(state_host_ptr); + // active_draw_.float_consts = + // reinterpret_cast(state_host_ptr + + // batch_state_.float_consts_offset); + // active_draw_.bool_consts = + // reinterpret_cast(state_host_ptr + + // batch_state_.bool_consts_offset); + // active_draw_.loop_consts = + // reinterpret_cast(state_host_ptr + + // batch_state_.loop_consts_offset); + return true; +} + +void DrawBatcher::DiscardDraw() { + if (!draw_open_) { + // No-op. 
+ return; + } + draw_open_ = false; + + command_buffer_.Discard(std::move(active_draw_.command_allocation)); + state_buffer_.Discard(std::move(active_draw_.state_allocation)); +} + +bool DrawBatcher::CommitDraw() { + assert_true(draw_open_); + draw_open_ = false; + + // Copy over required constants. + CopyConstants(); + + if (batch_state_.state_range_start == UINTPTR_MAX) { + batch_state_.command_range_start = active_draw_.command_allocation.offset; + batch_state_.state_range_start = active_draw_.state_allocation.offset; + } + batch_state_.command_range_length += + active_draw_.command_allocation.aligned_length; + batch_state_.state_range_length += + active_draw_.state_allocation.aligned_length; + + command_buffer_.Commit(std::move(active_draw_.command_allocation)); + state_buffer_.Commit(std::move(active_draw_.state_allocation)); + + ++batch_state_.draw_count; + return true; +} + +bool DrawBatcher::Flush(FlushMode mode) { + if (batch_state_.draw_count) { + SCOPE_profile_cpu_f("gpu"); + + assert_not_zero(batch_state_.command_stride); + assert_not_zero(batch_state_.state_stride); + + // Flush pending buffer changes. + command_buffer_.Flush(); + state_buffer_.Flush(); + array_data_buffer_->Flush(); + + // State data is indexed by draw ID. 
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, state_buffer_.handle(), + batch_state_.state_range_start, + batch_state_.state_range_length); + + GLenum prim_type = 0; + switch (batch_state_.prim_type) { + case PrimitiveType::kPointList: + prim_type = GL_POINTS; + break; + case PrimitiveType::kLineList: + prim_type = GL_LINES; + break; + case PrimitiveType::kLineStrip: + prim_type = GL_LINE_STRIP; + break; + case PrimitiveType::kLineLoop: + prim_type = GL_LINE_LOOP; + break; + case PrimitiveType::kTriangleList: + prim_type = GL_TRIANGLES; + break; + case PrimitiveType::kTriangleStrip: + prim_type = GL_TRIANGLE_STRIP; + break; + case PrimitiveType::kTriangleFan: + prim_type = GL_TRIANGLE_FAN; + break; + case PrimitiveType::kRectangleList: + prim_type = GL_TRIANGLE_STRIP; + // Rect lists aren't culled. There may be other things they skip too. + // assert_true((register_file_->values[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32 + // & + // 0x3) == 0); + break; + case PrimitiveType::kQuadList: + prim_type = GL_LINES_ADJACENCY; + break; + default: + case PrimitiveType::kUnknown0x07: + prim_type = GL_POINTS; + XELOGE("unsupported primitive type %d", batch_state_.prim_type); + assert_unhandled_case(batch_state_.prim_type); + DiscardDraw(); + return false; + } + + // Fast path for single draws. 
+ void* indirect_offset = + reinterpret_cast(batch_state_.command_range_start); + + if (has_bindless_mdi_) { + int vertex_buffer_count = + batch_state_.vertex_shader->buffer_inputs().total_elements_count; + assert_true(vertex_buffer_count < 8); + if (batch_state_.indexed) { + glMultiDrawElementsIndirectBindlessNV( + prim_type, batch_state_.index_type, indirect_offset, + batch_state_.draw_count, batch_state_.command_stride, + vertex_buffer_count); + } else { + glMultiDrawArraysIndirectBindlessNV( + prim_type, indirect_offset, batch_state_.draw_count, + batch_state_.command_stride, vertex_buffer_count); + } + } else { + if (batch_state_.indexed) { + glMultiDrawElementsIndirect(prim_type, batch_state_.index_type, + indirect_offset, batch_state_.draw_count, + batch_state_.command_stride); + } else { + glMultiDrawArraysIndirect(prim_type, indirect_offset, + batch_state_.draw_count, + batch_state_.command_stride); + } + } + + batch_state_.command_range_start = UINTPTR_MAX; + batch_state_.command_range_length = 0; + batch_state_.state_range_start = UINTPTR_MAX; + batch_state_.state_range_length = 0; + batch_state_.draw_count = 0; + } + + if (mode == FlushMode::kReconfigure) { + // Reset - we'll update it as soon as we have all the information. + batch_state_.needs_reconfigure = true; + } + + return true; +} + +void DrawBatcher::CopyConstants() { + // TODO(benvanik): partial updates, etc. We could use shader constant access + // knowledge that we get at compile time to only upload those constants + // required. If we did this as a variable length then we could really cut + // down on state block sizes. 
+ + std::memcpy(active_draw_.header->float_consts, + ®ister_file_->values[XE_GPU_REG_SHADER_CONSTANT_000_X].f32, + sizeof(active_draw_.header->float_consts)); + std::memcpy( + active_draw_.header->bool_consts, + ®ister_file_->values[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].f32, + sizeof(active_draw_.header->bool_consts)); + std::memcpy(active_draw_.header->loop_consts, + ®ister_file_->values[XE_GPU_REG_SHADER_CONSTANT_LOOP_00].f32, + sizeof(active_draw_.header->loop_consts)); +} + +} // namespace gl4 +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/gl4/draw_batcher.h b/src/xenia/gpu/gl4/draw_batcher.h new file mode 100644 index 000000000..f930ba2a0 --- /dev/null +++ b/src/xenia/gpu/gl4/draw_batcher.h @@ -0,0 +1,230 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_GL4_GL4_STATE_DATA_BUILDER_H_ +#define XENIA_GPU_GL4_GL4_STATE_DATA_BUILDER_H_ + +#include +#include +#include +#include +#include +#include + +namespace xe { +namespace gpu { +namespace gl4 { + +union float4 { + float v[4]; + struct { + float x, y, z, w; + }; +}; + +#pragma pack(push, 4) +struct DrawArraysIndirectCommand { + GLuint count; + GLuint instance_count; + GLuint first_index; + GLuint base_instance; +}; +struct DrawElementsIndirectCommand { + GLuint count; + GLuint instance_count; + GLuint first_index; + GLint base_vertex; + GLuint base_instance; +}; +struct BindlessPtrNV { + GLuint index; + GLuint reserved_zero; + GLuint64 address; + GLuint64 length; +}; +struct DrawArraysIndirectBindlessCommandNV { + DrawArraysIndirectCommand cmd; + // NOTE: the spec is wrong here. 
For fucks sake. + // GLuint reserved_zero; + BindlessPtrNV vertex_buffers[8]; +}; +struct DrawElementsIndirectBindlessCommandNV { + DrawElementsIndirectCommand cmd; + GLuint reserved_zero; + BindlessPtrNV index_buffer; + BindlessPtrNV vertex_buffers[8]; +}; +#pragma pack(pop) + +class DrawBatcher { + public: + enum class FlushMode { + kMakeCoherent, + kStateChange, + kReconfigure, + }; + + DrawBatcher(RegisterFile* register_file); + + bool Initialize(CircularBuffer* array_data_buffer); + void Shutdown(); + + PrimitiveType prim_type() const { return batch_state_.prim_type; } + + void set_window_offset(uint32_t x, uint32_t y) { + active_draw_.header->window_offset.x = float(x); + active_draw_.header->window_offset.y = float(y); + } + void set_window_scissor(uint32_t left, uint32_t top, uint32_t right, + uint32_t bottom) { + active_draw_.header->window_scissor.x = float(left); + active_draw_.header->window_scissor.y = float(top); + active_draw_.header->window_scissor.z = float(right); + active_draw_.header->window_scissor.w = float(bottom); + } + void set_window_scalar(float width_scalar, float height_scalar) { + active_draw_.header->window_offset.z = width_scalar; + active_draw_.header->window_offset.w = height_scalar; + } + void set_viewport_offset(float offset_x, float offset_y, float offset_z) { + active_draw_.header->viewport_offset.x = offset_x; + active_draw_.header->viewport_offset.y = offset_y; + active_draw_.header->viewport_offset.z = offset_z; + } + void set_viewport_scale(float scale_x, float scale_y, float scale_z) { + active_draw_.header->viewport_scale.x = scale_x; + active_draw_.header->viewport_scale.y = scale_y; + active_draw_.header->viewport_scale.z = scale_z; + } + void set_vtx_fmt(float xy, float z, float w) { + active_draw_.header->vtx_fmt.x = xy; + active_draw_.header->vtx_fmt.y = xy; + active_draw_.header->vtx_fmt.z = z; + active_draw_.header->vtx_fmt.w = w; + } + void set_alpha_test(bool enabled, uint32_t func, float ref) { + 
active_draw_.header->alpha_test.x = enabled ? 1.0f : 0.0f; + active_draw_.header->alpha_test.y = float(func); + active_draw_.header->alpha_test.z = ref; + } + void set_texture_sampler(int index, GLuint64 handle) { + active_draw_.header->texture_samplers[index] = handle; + } + void set_index_buffer(const CircularBuffer::Allocation& allocation) { + if (has_bindless_mdi_) { + auto& ptr = active_draw_.draw_elements_bindless_cmd->index_buffer; + ptr.reserved_zero = 0; + ptr.index = 0; + ptr.address = allocation.gpu_ptr; + ptr.length = allocation.length; + } else { + // Offset is used in glDrawElements. + auto& cmd = active_draw_.draw_elements_cmd; + size_t index_size = batch_state_.index_type == GL_UNSIGNED_SHORT ? 2 : 4; + cmd->first_index = GLuint(allocation.offset / index_size); + } + } + void set_vertex_buffer(int index, GLsizei offset, GLsizei stride, + const CircularBuffer::Allocation& allocation) { + if (has_bindless_mdi_) { + BindlessPtrNV* ptr; + if (batch_state_.indexed) { + ptr = &active_draw_.draw_elements_bindless_cmd->vertex_buffers[index]; + } else { + ptr = &active_draw_.draw_arrays_bindless_cmd->vertex_buffers[index]; + } + ptr->reserved_zero = 0; + ptr->index = index; + ptr->address = allocation.gpu_ptr + offset; + ptr->length = allocation.length - offset; + } + } + + bool ReconfigurePipeline(GL4Shader* vertex_shader, GL4Shader* pixel_shader, + GLuint pipeline); + + bool BeginDrawArrays(PrimitiveType prim_type, uint32_t index_count); + bool BeginDrawElements(PrimitiveType prim_type, uint32_t index_count, + xenos::IndexFormat index_format); + void DiscardDraw(); + bool CommitDraw(); + bool Flush(FlushMode mode); + + private: + bool BeginDraw(); + void CopyConstants(); + + RegisterFile* register_file_; + CircularBuffer command_buffer_; + CircularBuffer state_buffer_; + CircularBuffer* array_data_buffer_; + + bool has_bindless_mdi_; + + struct BatchState { + bool needs_reconfigure; + PrimitiveType prim_type; + bool indexed; + GLenum index_type; + + 
GL4Shader* vertex_shader; + GL4Shader* pixel_shader; + GLuint pipeline; + + GLsizei command_stride; + GLsizei state_stride; + GLsizei float_consts_offset; + GLsizei bool_consts_offset; + GLsizei loop_consts_offset; + + uintptr_t command_range_start; + uintptr_t command_range_length; + uintptr_t state_range_start; + uintptr_t state_range_length; + GLsizei draw_count; + } batch_state_; + + // This must match GL4Shader's header. + struct CommonHeader { + float4 window_offset; // tx,ty,sx,sy + float4 window_scissor; // x0,y0,x1,y1 + float4 viewport_offset; // tx,ty,tz,? + float4 viewport_scale; // sx,sy,sz,? + float4 vtx_fmt; // + float4 alpha_test; // alpha test enable, func, ref, ? + + // TODO(benvanik): pack tightly + GLuint64 texture_samplers[32]; + + float4 float_consts[512]; + uint32_t bool_consts[8]; + uint32_t loop_consts[32]; + }; + struct { + CircularBuffer::Allocation command_allocation; + CircularBuffer::Allocation state_allocation; + + union { + DrawArraysIndirectCommand* draw_arrays_cmd; + DrawElementsIndirectCommand* draw_elements_cmd; + DrawArraysIndirectBindlessCommandNV* draw_arrays_bindless_cmd; + DrawElementsIndirectBindlessCommandNV* draw_elements_bindless_cmd; + uintptr_t command_address; + }; + + CommonHeader* header; + } active_draw_; + bool draw_open_; +}; + +} // namespace gl4 +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_GL4_GL4_STATE_DATA_BUILDER_H_ diff --git a/src/xenia/gpu/gl4/gl4_shader.cc b/src/xenia/gpu/gl4/gl4_shader.cc index 3409824a2..2255be226 100644 --- a/src/xenia/gpu/gl4/gl4_shader.cc +++ b/src/xenia/gpu/gl4/gl4_shader.cc @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -18,6 +19,8 @@ namespace xe { namespace gpu { namespace gl4 { +using namespace xe::gpu::xenos; + extern "C" GLEWContext* glewGetContext(); // Stateful, but minimally. 
@@ -25,41 +28,147 @@ thread_local GL4ShaderTranslator shader_translator_; GL4Shader::GL4Shader(ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr, uint32_t dword_count) - : Shader(shader_type, data_hash, dword_ptr, dword_count), program_(0) {} + : Shader(shader_type, data_hash, dword_ptr, dword_count), + program_(0), + vao_(0) {} -GL4Shader::~GL4Shader() { glDeleteProgram(program_); } +GL4Shader::~GL4Shader() { + glDeleteProgram(program_); + glDeleteVertexArrays(1, &vao_); +} -const std::string header = - "#version 450\n" - "#extension all : warn\n" - "#extension GL_ARB_bindless_texture : require\n" - "#extension GL_ARB_explicit_uniform_location : require\n" - "#extension GL_ARB_shading_language_420pack : require\n" - "#extension GL_ARB_shader_storage_buffer_object : require\n" - "precision highp float;\n" - "precision highp int;\n" - "layout(std140, column_major) uniform;\n" - "layout(std430, column_major) buffer;\n" - "struct StateData {\n" - " vec4 window_offset;\n" - " vec4 window_scissor;\n" - " vec4 vtx_fmt;\n" - " vec4 viewport_offset;\n" - " vec4 viewport_scale;\n" - " vec4 alpha_test;\n" - " uvec2 texture_samplers[32];\n" - " vec4 float_consts[512];\n" - " uint fetch_consts[32 * 6];\n" - " int bool_consts[8];\n" - " int loop_consts[32];\n" - "};\n" - "struct VertexData {\n" - " vec4 o[16];\n" - "};\n" - "\n" - "layout(binding = 0) buffer State {\n" - " StateData state;\n" - "};\n"; +std::string GL4Shader::GetHeader() { + static const std::string header = + "#version 450\n" + "#extension all : warn\n" + "#extension GL_ARB_bindless_texture : require\n" + "#extension GL_ARB_explicit_uniform_location : require\n" + "#extension GL_ARB_shader_draw_parameters : require\n" + "#extension GL_ARB_shader_storage_buffer_object : require\n" + "#extension GL_ARB_shading_language_420pack : require\n" + "precision highp float;\n" + "precision highp int;\n" + "layout(std140, column_major) uniform;\n" + "layout(std430, column_major) buffer;\n" + "\n" + // 
This must match DrawBatcher::CommonHeader. + "struct StateData {\n" + " vec4 window_offset;\n" + " vec4 window_scissor;\n" + " vec4 viewport_offset;\n" + " vec4 viewport_scale;\n" + " vec4 vtx_fmt;\n" + " vec4 alpha_test;\n" + // TODO(benvanik): variable length. + " uvec2 texture_samplers[32];\n" + " vec4 float_consts[512];\n" + " int bool_consts[8];\n" + " int loop_consts[32];\n" + "};\n" + "layout(binding = 0) buffer State {\n" + " StateData states[];\n" + "};\n" + "\n" + "struct VertexData {\n" + " vec4 o[16];\n" + "};\n"; + return header; +} + +bool GL4Shader::PrepareVertexArrayObject() { + glCreateVertexArrays(1, &vao_); + + bool has_bindless_vbos = false; + if (FLAGS_vendor_gl_extensions && GLEW_NV_vertex_buffer_unified_memory) { + has_bindless_vbos = true; + // Nasty, but no DSA for this. + glBindVertexArray(vao_); + glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); + glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV); + } + + uint32_t el_index = 0; + for (uint32_t buffer_index = 0; buffer_index < buffer_inputs_.count; + ++buffer_index) { + const auto& desc = buffer_inputs_.descs[buffer_index]; + + for (uint32_t i = 0; i < desc.element_count; ++i, ++el_index) { + const auto& el = desc.elements[i]; + auto comp_count = GetVertexFormatComponentCount(el.format); + GLenum comp_type; + switch (el.format) { + case VertexFormat::k_8_8_8_8: + comp_type = el.is_signed ? GL_BYTE : GL_UNSIGNED_BYTE; + break; + case VertexFormat::k_2_10_10_10: + comp_type = el.is_signed ? GL_INT_2_10_10_10_REV + : GL_UNSIGNED_INT_2_10_10_10_REV; + break; + case VertexFormat::k_10_11_11: + assert_false(el.is_signed); + comp_type = GL_UNSIGNED_INT_10F_11F_11F_REV; + break; + /*case VertexFormat::k_11_11_10: + break;*/ + case VertexFormat::k_16_16: + comp_type = el.is_signed ? GL_SHORT : GL_UNSIGNED_SHORT; + break; + case VertexFormat::k_16_16_FLOAT: + comp_type = GL_HALF_FLOAT; + break; + case VertexFormat::k_16_16_16_16: + comp_type = el.is_signed ? 
GL_SHORT : GL_UNSIGNED_SHORT; + break; + case VertexFormat::k_16_16_16_16_FLOAT: + comp_type = GL_HALF_FLOAT; + break; + case VertexFormat::k_32: + comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; + break; + case VertexFormat::k_32_32: + comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; + break; + case VertexFormat::k_32_32_32_32: + comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; + break; + case VertexFormat::k_32_FLOAT: + comp_type = GL_FLOAT; + break; + case VertexFormat::k_32_32_FLOAT: + comp_type = GL_FLOAT; + break; + case VertexFormat::k_32_32_32_FLOAT: + comp_type = GL_FLOAT; + break; + case VertexFormat::k_32_32_32_32_FLOAT: + comp_type = GL_FLOAT; + break; + default: + assert_unhandled_case(el.format); + return false; + } + + glEnableVertexArrayAttrib(vao_, el_index); + if (has_bindless_vbos) { + // NOTE: MultiDrawIndirectBindlessMumble doesn't handle separate + // vertex bindings/formats. + glVertexAttribFormat(el_index, comp_count, comp_type, el.is_normalized, + el.offset_words * 4); + glVertexArrayVertexBuffer(vao_, el_index, 0, 0, desc.stride_words * 4); + } else { + glVertexArrayAttribBinding(vao_, el_index, buffer_index); + glVertexArrayAttribFormat(vao_, el_index, comp_count, comp_type, + el.is_normalized, el.offset_words * 4); + } + } + } + + if (has_bindless_vbos) { + glBindVertexArray(0); + } + + return true; +} bool GL4Shader::PrepareVertexShader( const xenos::xe_gpu_program_cntl_t& program_cntl) { @@ -68,8 +177,14 @@ bool GL4Shader::PrepareVertexShader( } has_prepared_ = true; + // Build static vertex array descriptor. 
+ if (!PrepareVertexArrayObject()) { + PLOGE("Unable to prepare vertex shader array object"); + return false; + } + std::string apply_transform = - "vec4 applyTransform(vec4 pos) {\n" + "vec4 applyTransform(const in StateData state, vec4 pos) {\n" " // Clip->NDC with perspective divide.\n" " // We do this here because it's programmable on the 360.\n" " float w = pos.w;\n" @@ -107,14 +222,15 @@ bool GL4Shader::PrepareVertexShader( " return pos;\n" "}\n"; std::string source = - header + apply_transform + + GetHeader() + apply_transform + "out gl_PerVertex {\n" " vec4 gl_Position;\n" " float gl_PointSize;\n" " float gl_ClipDistance[];\n" "};\n" - "layout(location = 0) out VertexData vtx;\n" - "void processVertex();\n" + "layout(location = 0) flat out uint draw_id;\n" + "layout(location = 1) out VertexData vtx;\n" + "void processVertex(const in StateData state);\n" "void main() {\n" + (alloc_counts().positions ? " gl_Position = vec4(0.0, 0.0, 0.0, 1.0);\n" : "") + @@ -122,8 +238,10 @@ bool GL4Shader::PrepareVertexShader( " for (int i = 0; i < vtx.o.length(); ++i) {\n" " vtx.o[i] = vec4(0.0, 0.0, 0.0, 0.0);\n" " }\n" - " processVertex();\n" - " gl_Position = applyTransform(gl_Position);\n" + " const StateData state = states[gl_DrawIDARB];\n" + " processVertex(state);\n" + " gl_Position = applyTransform(state, gl_Position);\n" + " draw_id = gl_DrawIDARB;\n" "}\n"; std::string translated_source = @@ -149,12 +267,14 @@ bool GL4Shader::PreparePixelShader( } has_prepared_ = true; - std::string source = header + - "layout(location = 0) in VertexData vtx;\n" + std::string source = GetHeader() + + "layout(location = 0) flat in uint draw_id;\n" + "layout(location = 1) in VertexData vtx;\n" "layout(location = 0) out vec4 oC[4];\n" - "void processFragment();\n" + "void processFragment(const in StateData state);\n" "void main() {\n" + - " processFragment();\n" + " const StateData state = states[draw_id];\n" + " processFragment(state);\n" "}\n"; std::string translated_source = diff 
--git a/src/xenia/gpu/gl4/gl4_shader.h b/src/xenia/gpu/gl4/gl4_shader.h index da3c3df78..1dac6b4c3 100644 --- a/src/xenia/gpu/gl4/gl4_shader.h +++ b/src/xenia/gpu/gl4/gl4_shader.h @@ -10,6 +10,8 @@ #ifndef XENIA_GPU_GL4_GL4_SHADER_H_ #define XENIA_GPU_GL4_GL4_SHADER_H_ +#include + #include #include #include @@ -25,14 +27,18 @@ class GL4Shader : public Shader { ~GL4Shader() override; GLuint program() const { return program_; } + GLuint vao() const { return vao_; } bool PrepareVertexShader(const xenos::xe_gpu_program_cntl_t& program_cntl); bool PreparePixelShader(const xenos::xe_gpu_program_cntl_t& program_cntl); protected: + std::string GetHeader(); + bool PrepareVertexArrayObject(); bool CompileProgram(std::string source); GLuint program_; + GLuint vao_; }; } // namespace gl4 diff --git a/src/xenia/gpu/gl4/gl4_shader_translator.cc b/src/xenia/gpu/gl4/gl4_shader_translator.cc index e314191ea..1b3aff02b 100644 --- a/src/xenia/gpu/gl4/gl4_shader_translator.cc +++ b/src/xenia/gpu/gl4/gl4_shader_translator.cc @@ -91,7 +91,7 @@ std::string GL4ShaderTranslator::TranslateVertexShader( const auto& alloc_counts = vertex_shader->alloc_counts(); // Vertex shader main() header. - Append("void processVertex() {\n"); + Append("void processVertex(const in StateData state) {\n"); // Add temporaries for any registers we may use. uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs; @@ -126,7 +126,7 @@ std::string GL4ShaderTranslator::TranslatePixelShader( // (and less than the number of required registers), things may die. // Pixel shader main() header. - Append("void processFragment() {\n"); + Append("void processFragment(const in StateData state) {\n"); // Add temporary registers. 
uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs; diff --git a/src/xenia/gpu/gl4/gl_context.cc b/src/xenia/gpu/gl4/gl_context.cc index d5e39fa34..3d4600514 100644 --- a/src/xenia/gpu/gl4/gl_context.cc +++ b/src/xenia/gpu/gl4/gl_context.cc @@ -132,12 +132,15 @@ std::unique_ptr GLContext::CreateShared() { GLContextLock context_lock(this); int context_flags = 0; + //int profile = WGL_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB; + int profile = WGL_CONTEXT_CORE_PROFILE_BIT_ARB; #if DEBUG context_flags |= WGL_CONTEXT_DEBUG_BIT_ARB; -#endif // DEBUG - int attrib_list[] = {WGL_CONTEXT_MAJOR_VERSION_ARB, 4, // - WGL_CONTEXT_MINOR_VERSION_ARB, 5, // - WGL_CONTEXT_FLAGS_ARB, context_flags, // +#endif // DEBUG + int attrib_list[] = {WGL_CONTEXT_MAJOR_VERSION_ARB, 4, // + WGL_CONTEXT_MINOR_VERSION_ARB, 5, // + WGL_CONTEXT_FLAGS_ARB, context_flags, // + WGL_CONTEXT_PROFILE_MASK_ARB, profile, // 0}; new_glrc = wglCreateContextAttribsARB(dc_, glrc_, attrib_list); if (!new_glrc) { diff --git a/src/xenia/gpu/gl4/sources.gypi b/src/xenia/gpu/gl4/sources.gypi index efc52f2b6..2f5c0db72 100644 --- a/src/xenia/gpu/gl4/sources.gypi +++ b/src/xenia/gpu/gl4/sources.gypi @@ -5,6 +5,8 @@ 'circular_buffer.h', 'command_processor.cc', 'command_processor.h', + 'draw_batcher.cc', + 'draw_batcher.h', 'gl4_gpu-private.h', 'gl4_gpu.cc', 'gl4_gpu.h',