diff --git a/src/poly/math.h b/src/poly/math.h index 0fbbb0702..49edd2dc0 100644 --- a/src/poly/math.h +++ b/src/poly/math.h @@ -36,7 +36,7 @@ T align(T value, T alignment) { // Rounds the given number up to the next highest multiple. template T round_up(T value, V multiple) { - return value ? (value + multiple - 1 - (value - 1) % multiple) : multiple; + return value ? (((value + multiple - 1) / multiple) * multiple) : multiple; } inline float saturate(float value) { diff --git a/src/xenia/gpu/gl4/circular_buffer.cc b/src/xenia/gpu/gl4/circular_buffer.cc index a7d456c5a..d2a342646 100644 --- a/src/xenia/gpu/gl4/circular_buffer.cc +++ b/src/xenia/gpu/gl4/circular_buffer.cc @@ -27,29 +27,41 @@ CircularBuffer::CircularBuffer(size_t capacity) gpu_base_(0), host_base_(nullptr) {} -CircularBuffer::~CircularBuffer() { - glUnmapNamedBuffer(buffer_); - glDeleteBuffers(1, &buffer_); -} +CircularBuffer::~CircularBuffer() { Shutdown(); } bool CircularBuffer::Initialize() { glCreateBuffers(1, &buffer_); glNamedBufferStorage(buffer_, capacity_, nullptr, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT); host_base_ = reinterpret_cast(glMapNamedBufferRange( - buffer_, 0, capacity_, GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | - GL_MAP_UNSYNCHRONIZED_BIT | - GL_MAP_PERSISTENT_BIT)); + buffer_, 0, capacity_, + GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_PERSISTENT_BIT)); assert_not_null(host_base_); if (!host_base_) { return false; } - glMakeNamedBufferResidentNV(buffer_, GL_WRITE_ONLY); - glGetNamedBufferParameterui64vNV(buffer_, GL_BUFFER_GPU_ADDRESS_NV, - &gpu_base_); + + if (GLEW_NV_shader_buffer_load) { + // To use this bindlessly we must make it resident. + glMakeNamedBufferResidentNV(buffer_, GL_WRITE_ONLY); + glGetNamedBufferParameterui64vNV(buffer_, GL_BUFFER_GPU_ADDRESS_NV, + &gpu_base_); + } return true; } +void CircularBuffer::Shutdown() { + if (!buffer_) { + return; + } + glUnmapNamedBuffer(buffer_); + if (GLEW_NV_shader_buffer_load) { + glMakeNamedBufferNonResidentNV(buffer_); + } + glDeleteBuffers(1, &buffer_); + buffer_ = 0; +} + CircularBuffer::Allocation CircularBuffer::Acquire(size_t length) { // Addresses must always be % 256. length = poly::round_up(length, 256); @@ -64,6 +76,7 @@ CircularBuffer::Allocation CircularBuffer::Acquire(size_t length) { Allocation allocation; allocation.host_ptr = host_base_ + write_head_; allocation.gpu_ptr = gpu_base_ + write_head_; + allocation.offset = write_head_; allocation.length = length; write_head_ += length; return allocation; diff --git a/src/xenia/gpu/gl4/circular_buffer.h b/src/xenia/gpu/gl4/circular_buffer.h index dde0e41d1..987ce746c 100644 --- a/src/xenia/gpu/gl4/circular_buffer.h +++ b/src/xenia/gpu/gl4/circular_buffer.h @@ -26,10 +26,12 @@ class CircularBuffer { struct Allocation { void* host_ptr; GLuint64 gpu_ptr; + size_t offset; size_t length; }; bool Initialize(); + void Shutdown(); GLuint handle() const { return buffer_; } diff --git a/src/xenia/gpu/gl4/command_processor.cc b/src/xenia/gpu/gl4/command_processor.cc index dd1fef43f..c54c541d8 100644 --- a/src/xenia/gpu/gl4/command_processor.cc +++ b/src/xenia/gpu/gl4/command_processor.cc @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include @@ -36,7 +38,7 @@ const GLuint kAnyTarget = UINT_MAX; // All uncached vertex/index data goes here. If it fills up we need to sync // with the GPU, so this should be large enough to prevent that in a normal // frame. 
-const size_t kScratchBufferCapacity = 64 * 1024 * 1024; +const size_t kScratchBufferCapacity = 256 * 1024 * 1024; CommandProcessor::CachedPipeline::CachedPipeline() = default; @@ -61,6 +63,7 @@ CommandProcessor::CommandProcessor(GL4GraphicsSystem* graphics_system) write_ptr_index_(0), bin_select_(0xFFFFFFFFull), bin_mask_(0xFFFFFFFFull), + has_bindless_vbos_(false), active_vertex_shader_(nullptr), active_pixel_shader_(nullptr), active_framebuffer_(nullptr), @@ -152,29 +155,34 @@ void CommandProcessor::WorkerMain() { } bool CommandProcessor::SetupGL() { - // Uniform buffer that stores the per-draw state (constants, etc). - glCreateBuffers(1, &uniform_data_buffer_); - glBindBuffer(GL_UNIFORM_BUFFER, uniform_data_buffer_); - glNamedBufferStorage(uniform_data_buffer_, 16 * 1024, nullptr, - GL_MAP_WRITE_BIT | GL_DYNAMIC_STORAGE_BIT); - // Circular buffer holding scratch vertex/index data. if (!scratch_buffer_.Initialize()) { PLOGE("Unable to initialize scratch buffer"); return false; } + // Texture cache that keeps track of any textures/samplers used. + if (!texture_cache_.Initialize(&scratch_buffer_)) { + PLOGE("Unable to initialize texture cache"); + return false; + } + GLuint vertex_array; glGenVertexArrays(1, &vertex_array); glBindVertexArray(vertex_array); - glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); - glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV); + + if (GLEW_NV_vertex_buffer_unified_memory) { + has_bindless_vbos_ = true; + glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); + glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV); + } return true; } void CommandProcessor::ShutdownGL() { - glDeleteBuffers(1, &uniform_data_buffer_); + texture_cache_.Shutdown(); + scratch_buffer_.Shutdown(); } void CommandProcessor::InitializeRingBuffer(uint32_t ptr, uint32_t page_count) { @@ -264,6 +272,7 @@ void CommandProcessor::PrepareForWait() { // make interrupt callbacks from the GPU so that we don't have to do a full // synchronize here. glFlush(); + glFinish(); if (FLAGS_thread_safe_gl) { context_->ClearCurrent(); @@ -1142,6 +1151,8 @@ void CommandProcessor::PrepareDraw(DrawCommand* draw_command) { // Generic stuff. cmd.start_index = regs[XE_GPU_REG_VGT_INDX_OFFSET].u32; cmd.base_vertex = 0; + + cmd.state_data = nullptr; } bool CommandProcessor::IssueDraw(DrawCommand* draw_command) { @@ -1158,6 +1169,18 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) { return IssueCopy(draw_command); } + // TODO(benvanik): actually cache things >_> + texture_cache_.Clear(); + + // Allocate a state data block. + // Everything the shaders access lives here. 
+ auto allocation = scratch_buffer_.Acquire(sizeof(UniformDataBlock)); + cmd.state_data = reinterpret_cast(allocation.host_ptr); + if (!cmd.state_data) { + PLOGE("Unable to allocate uniform data buffer"); + return false; + } + if (!UpdateRenderTargets(draw_command)) { PLOGE("Unable to setup render targets"); return false; @@ -1172,17 +1195,15 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) { PLOGE("Unable to setup render state"); return false; } - + if (!UpdateConstants(draw_command)) { + PLOGE("Unable to update shader constants"); + return false; + } if (!UpdateShaders(draw_command)) { PLOGE("Unable to prepare draw shaders"); return false; } - // if (!PopulateSamplers(draw_command)) { - // XELOGE("Unable to prepare draw samplers"); - // return false; - //} - if (!PopulateIndexBuffer(draw_command)) { PLOGE("Unable to setup index buffer"); return false; @@ -1191,6 +1212,10 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) { PLOGE("Unable to setup vertex buffers"); return false; } + if (!PopulateSamplers(draw_command)) { + PLOGE("Unable to prepare draw samplers"); + return false; + } GLenum prim_type = 0; switch (cmd.prim_type) { @@ -1228,6 +1253,7 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) { break; case PrimitiveType::kQuadList: prim_type = GL_LINES_ADJACENCY; + return false; /*if (vs->DemandGeometryShader(D3D11VertexShaderResource::QUAD_LIST_SHADER, &geometry_shader)) { @@ -1237,10 +1263,15 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) { default: case PrimitiveType::kUnknown0x07: prim_type = GL_POINTS; - XELOGE("D3D11: unsupported primitive type %d", cmd.prim_type); + XELOGE("unsupported primitive type %d", cmd.prim_type); break; } + // Commit the state buffer - nothing can change after this. + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, scratch_buffer_.handle(), + allocation.offset, allocation.length); + scratch_buffer_.Commit(std::move(allocation)); + // HACK HACK HACK glDisable(GL_DEPTH_TEST); @@ -1254,13 +1285,108 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) { prim_type, cmd.index_count, cmd.index_buffer.format == IndexFormat::kInt32 ? GL_UNSIGNED_INT : GL_UNSIGNED_SHORT, - reinterpret_cast(cmd.start_index * element_size), + reinterpret_cast(cmd.index_buffer.buffer_offset + + cmd.start_index * element_size), cmd.base_vertex); } else { // Auto draw. glDrawArrays(prim_type, cmd.start_index, cmd.index_count); } + // Hacky draw counter. + if (false) { + static int draw_count = 0; + glEnable(GL_SCISSOR_TEST); + glScissor(20, 0, 20, 20); + float red[] = {0, draw_count / 100.0f, 0, 1.0f}; + draw_count = (draw_count + 1) % 100; + glClearNamedFramebufferfv(active_framebuffer_->framebuffer, GL_COLOR, 0, + red); + glDisable(GL_SCISSOR_TEST); + } + + return true; +} + +bool CommandProcessor::UpdateRenderTargets(DrawCommand* draw_command) { + auto& regs = *register_file_; + + auto enable_mode = + static_cast(regs[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7); + + // RB_SURFACE_INFO + // http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html + uint32_t surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32; + uint32_t surface_pitch = surface_info & 0x3FFF; + auto surface_msaa = static_cast((surface_info >> 16) & 0x3); + + // Get/create all color render targets, if we are using them. + // In depth-only mode we don't need them. 
+ GLenum draw_buffers[4] = {GL_NONE, GL_NONE, GL_NONE, GL_NONE}; + GLuint color_targets[4] = {kAnyTarget, kAnyTarget, kAnyTarget, kAnyTarget}; + if (enable_mode == ModeControl::kColorDepth) { + uint32_t color_info[4] = { + regs[XE_GPU_REG_RB_COLOR_INFO].u32, regs[XE_GPU_REG_RB_COLOR1_INFO].u32, + regs[XE_GPU_REG_RB_COLOR2_INFO].u32, + regs[XE_GPU_REG_RB_COLOR3_INFO].u32, + }; + // A2XX_RB_COLOR_MASK_WRITE_* == D3DRS_COLORWRITEENABLE + uint32_t color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32; + for (int n = 0; n < poly::countof(color_info); n++) { + uint32_t write_mask = (color_mask >> (n * 4)) & 0xF; + if (!write_mask) { + // Unused, so keep disabled and set to wildcard so we'll take any + // framebuffer that has it. + continue; + } + uint32_t color_base = color_info[n] & 0xFFF; + auto color_format = + static_cast((color_info[n] >> 16) & 0xF); + color_targets[n] = GetColorRenderTarget(surface_pitch, surface_msaa, + color_base, color_format); + draw_buffers[n] = GL_COLOR_ATTACHMENT0 + n; + glColorMaski(n, !!(write_mask & 0x1), !!(write_mask & 0x2), + !!(write_mask & 0x4), !!(write_mask & 0x8)); + } + } + + // Get/create depth buffer, but only if we are going to use it. + uint32_t depth_control = regs[XE_GPU_REG_RB_DEPTHCONTROL].u32; + uint32_t stencil_ref_mask = regs[XE_GPU_REG_RB_STENCILREFMASK].u32; + bool uses_depth = + (depth_control & 0x00000002) || (depth_control & 0x00000004); + uint32_t stencil_write_mask = (stencil_ref_mask & 0x00FF0000) >> 16; + bool uses_stencil = (depth_control & 0x00000001) || (stencil_write_mask != 0); + GLuint depth_target = kAnyTarget; + if (uses_depth && uses_stencil) { + uint32_t depth_info = regs[XE_GPU_REG_RB_DEPTH_INFO].u32; + uint32_t depth_base = depth_info & 0xFFF; + auto depth_format = + static_cast((depth_info >> 16) & 0x1); + depth_target = GetDepthRenderTarget(surface_pitch, surface_msaa, depth_base, + depth_format); + // TODO(benvanik): when a game switches does it expect to keep the same + // depth buffer contents? + } + + // Get/create a framebuffer with the required targets. + // Note that none may be returned if we really don't need one. + auto cached_framebuffer = GetFramebuffer(color_targets, depth_target); + active_framebuffer_ = cached_framebuffer; + if (!active_framebuffer_) { + // Nothing to do. + return true; + } + + // Setup just the targets we want. + glNamedFramebufferDrawBuffers(cached_framebuffer->framebuffer, 4, + draw_buffers); + + // Make active. + // TODO(benvanik): can we do this all named? + // TODO(benvanik): do we want this on READ too? + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, cached_framebuffer->framebuffer); + return true; } @@ -1272,57 +1398,24 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) { auto& regs = *register_file_; - union float4 { - float v[4]; - struct { - float x, y, z, w; - }; - }; - struct UniformDataBlock { - float4 window_offset; // tx,ty,rt_w,rt_h - float4 window_scissor; // x0,y0,x1,y1 - float4 viewport_offset; // tx,ty,tz,? - float4 viewport_scale; // sx,sy,sz,? - // TODO(benvanik): vertex format xyzw? - - float4 alpha_test; // alpha test enable, func, ref, ? - - // Register data from 0x4000 to 0x4927. - // SHADER_CONSTANT_000_X... - float4 float_consts[512]; - // SHADER_CONSTANT_FETCH_00_0... - uint32_t fetch_consts[32 * 6]; - // SHADER_CONSTANT_BOOL_000_031... - int32_t bool_consts[8]; - // SHADER_CONSTANT_LOOP_00... 
- int32_t loop_consts[32]; - }; - static_assert(sizeof(UniformDataBlock) <= 16 * 1024, - "Need <=16k uniform data"); - - auto allocation = scratch_buffer_.Acquire(16 * 1024); - auto buffer_ptr = reinterpret_cast(allocation.host_ptr); - if (!buffer_ptr) { - PLOGE("Unable to allocate uniform data buffer"); - return false; - } + auto state_data = draw_command->state_data; // Window parameters. // See r200UpdateWindow: // https://github.com/freedreno/mesa/blob/master/src/mesa/drivers/dri/r200/r200_state.c uint32_t window_offset = regs[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32; - buffer_ptr->window_offset.x = float(window_offset & 0x7FFF); - buffer_ptr->window_offset.y = float((window_offset >> 16) & 0x7FFF); + state_data->window_offset.x = float(window_offset & 0x7FFF); + state_data->window_offset.y = float((window_offset >> 16) & 0x7FFF); uint32_t window_scissor_tl = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; uint32_t window_scissor_br = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; - buffer_ptr->window_scissor.x = float(window_scissor_tl & 0x7FFF); - buffer_ptr->window_scissor.y = float((window_scissor_tl >> 16) & 0x7FFF); - buffer_ptr->window_scissor.z = float(window_scissor_br & 0x7FFF); - buffer_ptr->window_scissor.w = float((window_scissor_br >> 16) & 0x7FFF); + state_data->window_scissor.x = float(window_scissor_tl & 0x7FFF); + state_data->window_scissor.y = float((window_scissor_tl >> 16) & 0x7FFF); + state_data->window_scissor.z = float(window_scissor_br & 0x7FFF); + state_data->window_scissor.w = float((window_scissor_br >> 16) & 0x7FFF); // HACK: no clue where to get these values. - buffer_ptr->window_offset.z = 1280; - buffer_ptr->window_offset.w = 720; + state_data->window_offset.z = 1280; + state_data->window_offset.w = 720; // Whether each of the viewport settings is enabled. // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf @@ -1338,20 +1431,20 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) { vport_yoffset_enable == vport_zoffset_enable); // Viewport scaling. Only enabled if the flags are all set. - buffer_ptr->viewport_scale.x = + state_data->viewport_scale.x = vport_xscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 : 1; // 640 - buffer_ptr->viewport_offset.x = vport_xoffset_enable + state_data->viewport_offset.x = vport_xoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 : 0; // 640 - buffer_ptr->viewport_scale.y = vport_yscale_enable + state_data->viewport_scale.y = vport_yscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 : 1; // -360 - buffer_ptr->viewport_offset.y = vport_yoffset_enable + state_data->viewport_offset.y = vport_yoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 : 0; // 360 - buffer_ptr->viewport_scale.z = + state_data->viewport_scale.z = vport_zscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 : 1; // 1 - buffer_ptr->viewport_offset.z = + state_data->viewport_offset.z = vport_zoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 : 0; // 0 // VTX_XY_FMT = true: the incoming X, Y have already been multiplied by 1/W0. // = false: multiply the X, Y coordinates by 1/W0. @@ -1365,15 +1458,6 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) { // TODO(benvanik): pass to shaders? disable transform? etc? glViewport(0, 0, 1280, 720); - // Copy over all constants. - // TODO(benvanik): partial updates, etc. We could use shader constant access - // knowledge that we get at compile time to only upload those constants - // required. 
- std::memcpy( - &buffer_ptr->float_consts, ®s[XE_GPU_REG_SHADER_CONSTANT_000_X].f32, - sizeof(buffer_ptr->float_consts) + sizeof(buffer_ptr->fetch_consts) + - sizeof(buffer_ptr->loop_consts) + sizeof(buffer_ptr->bool_consts)); - // Scissoring. int32_t screen_scissor_tl = regs[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL].u32; int32_t screen_scissor_br = regs[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR].u32; @@ -1424,10 +1508,10 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) { // Deprecated in GL, implemented in shader. // if(ALPHATESTENABLE && frag_out.a [<=/ALPHAFUNC] ALPHAREF) discard; uint32_t color_control = regs[XE_GPU_REG_RB_COLORCONTROL].u32; - buffer_ptr->alpha_test.x = + state_data->alpha_test.x = (color_control & 0x4) ? 1.0f : 0.0f; // ALPAHTESTENABLE - buffer_ptr->alpha_test.y = float(color_control & 0x3); // ALPHAFUNC - buffer_ptr->alpha_test.z = regs[XE_GPU_REG_RB_ALPHA_REF].f32; + state_data->alpha_test.y = float(color_control & 0x3); // ALPHAFUNC + state_data->alpha_test.z = regs[XE_GPU_REG_RB_ALPHA_REF].f32; static const GLenum blend_map[] = { /* 0 */ GL_ZERO, @@ -1575,91 +1659,23 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) { stencil_op_map[(depth_control & 0x0001C000) >> 14]); } - // Stash - program setup will bind this to uniforms. - draw_command->state_data_gpu_ptr = allocation.gpu_ptr; - scratch_buffer_.Commit(std::move(allocation)); - return true; } -bool CommandProcessor::UpdateRenderTargets(DrawCommand* draw_command) { +bool CommandProcessor::UpdateConstants(DrawCommand* draw_command) { auto& regs = *register_file_; + auto state_data = draw_command->state_data; - auto enable_mode = - static_cast(regs[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7); + // TODO(benvanik): partial updates, etc. We could use shader constant access + // knowledge that we get at compile time to only upload those constants + // required. If we did this as a variable length then we could really cut + // down on state block sizes. - // RB_SURFACE_INFO - // http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html - uint32_t surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32; - uint32_t surface_pitch = surface_info & 0x3FFF; - auto surface_msaa = static_cast((surface_info >> 16) & 0x3); - - // Get/create all color render targets, if we are using them. - // In depth-only mode we don't need them. - GLenum draw_buffers[4] = {GL_NONE, GL_NONE, GL_NONE, GL_NONE}; - GLuint color_targets[4] = {kAnyTarget, kAnyTarget, kAnyTarget, kAnyTarget}; - if (enable_mode == ModeControl::kColorDepth) { - uint32_t color_info[4] = { - regs[XE_GPU_REG_RB_COLOR_INFO].u32, regs[XE_GPU_REG_RB_COLOR1_INFO].u32, - regs[XE_GPU_REG_RB_COLOR2_INFO].u32, - regs[XE_GPU_REG_RB_COLOR3_INFO].u32, - }; - // A2XX_RB_COLOR_MASK_WRITE_* == D3DRS_COLORWRITEENABLE - uint32_t color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32; - for (int n = 0; n < poly::countof(color_info); n++) { - uint32_t write_mask = (color_mask >> (n * 4)) & 0xF; - if (!write_mask) { - // Unused, so keep disabled and set to wildcard so we'll take any - // framebuffer that has it. - continue; - } - uint32_t color_base = color_info[n] & 0xFFF; - auto color_format = - static_cast((color_info[n] >> 16) & 0xF); - color_targets[n] = GetColorRenderTarget(surface_pitch, surface_msaa, - color_base, color_format); - draw_buffers[n] = GL_COLOR_ATTACHMENT0 + n; - glColorMaski(n, !!(write_mask & 0x1), !!(write_mask & 0x2), - !!(write_mask & 0x4), !!(write_mask & 0x8)); - } - } - - // Get/create depth buffer, but only if we are going to use it. 
- uint32_t depth_control = regs[XE_GPU_REG_RB_DEPTHCONTROL].u32; - uint32_t stencil_ref_mask = regs[XE_GPU_REG_RB_STENCILREFMASK].u32; - bool uses_depth = - (depth_control & 0x00000002) || (depth_control & 0x00000004); - uint32_t stencil_write_mask = (stencil_ref_mask & 0x00FF0000) >> 16; - bool uses_stencil = (depth_control & 0x00000001) || (stencil_write_mask != 0); - GLuint depth_target = kAnyTarget; - if (uses_depth && uses_stencil) { - uint32_t depth_info = regs[XE_GPU_REG_RB_DEPTH_INFO].u32; - uint32_t depth_base = depth_info & 0xFFF; - auto depth_format = - static_cast((depth_info >> 16) & 0x1); - depth_target = GetDepthRenderTarget(surface_pitch, surface_msaa, depth_base, - depth_format); - // TODO(benvanik): when a game switches does it expect to keep the same - // depth buffer contents? - } - - // Get/create a framebuffer with the required targets. - // Note that none may be returned if we really don't need one. - auto cached_framebuffer = GetFramebuffer(color_targets, depth_target); - active_framebuffer_ = cached_framebuffer; - if (!active_framebuffer_) { - // Nothing to do. - return true; - } - - // Setup just the targets we want. - glNamedFramebufferDrawBuffers(cached_framebuffer->framebuffer, 4, - draw_buffers); - - // Make active. - // TODO(benvanik): can we do this all named? - // TODO(benvanik): do we want this on READ too? - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, cached_framebuffer->framebuffer); + // Copy over all constants. + std::memcpy( + &state_data->float_consts, ®s[XE_GPU_REG_SHADER_CONSTANT_000_X].f32, + sizeof(state_data->float_consts) + sizeof(state_data->fetch_consts) + + sizeof(state_data->loop_consts) + sizeof(state_data->bool_consts)); return true; } @@ -1718,28 +1734,10 @@ bool CommandProcessor::UpdateShaders(DrawCommand* draw_command) { glUseProgramStages(pipeline, GL_GEOMETRY_SHADER_BIT, geometry_program); glUseProgramStages(pipeline, GL_FRAGMENT_SHADER_BIT, fragment_program); - // HACK: layout(location=0) on a bindless uniform crashes nvidia driver. - GLint vertex_state_loc = glGetUniformLocation(vertex_program, "state"); - assert_true(vertex_state_loc == 0); - GLint geometry_state_loc = - geometry_program ? glGetUniformLocation(geometry_program, "state") : -1; - assert_true(geometry_state_loc == -1 || geometry_state_loc == 0); - GLint fragment_state_loc = glGetUniformLocation(fragment_program, "state"); - assert_true(fragment_state_loc == -1 || fragment_state_loc == 0); - cached_pipeline->handles.default_pipeline = pipeline; } - // TODO(benvanik): do we need to do this for all stages if the locations - // match? - glProgramUniformHandleui64ARB(vertex_program, 0, cmd.state_data_gpu_ptr); - /*if (geometry_program && geometry_state_loc != -1) { - glProgramUniformHandleui64ARB(geometry_program, 0, cmd.state_data_gpu_ptr); - }*/ - /*if (fragment_state_loc != -1) { - glProgramUniformHandleui64ARB(fragment_program, 0, - cmd.state_data_gpu_ptr); - }*/ + // NOTE: we don't yet have our state data pointer - that comes at the end. glBindProgramPipeline(cached_pipeline->handles.default_pipeline); @@ -1759,10 +1757,10 @@ bool CommandProcessor::PopulateIndexBuffer(DrawCommand* draw_command) { assert_true(info.endianness == Endian::k8in16 || info.endianness == Endian::k8in32); - auto allocation = scratch_buffer_.Acquire(cmd.index_count * - (info.format == IndexFormat::kInt32 - ? sizeof(uint32_t) - : sizeof(uint16_t))); + size_t total_size = + cmd.index_count * (info.format == IndexFormat::kInt32 ? 
sizeof(uint32_t) + : sizeof(uint16_t)); + auto allocation = scratch_buffer_.Acquire(total_size); if (info.format == IndexFormat::kInt32) { poly::copy_and_swap_32_aligned( @@ -1776,9 +1774,14 @@ bool CommandProcessor::PopulateIndexBuffer(DrawCommand* draw_command) { cmd.index_count); } - glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, allocation.gpu_ptr, - allocation.length); - + if (has_bindless_vbos_) { + glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, allocation.gpu_ptr, + allocation.length); + } else { + // Offset is used in glDrawElements. + cmd.index_buffer.buffer_offset = allocation.offset; + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, scratch_buffer_.handle()); + } scratch_buffer_.Commit(std::move(allocation)); return true; @@ -1792,7 +1795,8 @@ bool CommandProcessor::PopulateVertexBuffers(DrawCommand* draw_command) { const auto& buffer_inputs = active_vertex_shader_->buffer_inputs(); - for (size_t n = 0; n < buffer_inputs.count; n++) { + uint32_t el_index = 0; + for (uint32_t n = 0; n < buffer_inputs.count; n++) { const auto& desc = buffer_inputs.descs[n]; int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (desc.fetch_slot / 3) * 6; @@ -1826,7 +1830,11 @@ bool CommandProcessor::PopulateVertexBuffers(DrawCommand* draw_command) { reinterpret_cast(membase_ + (fetch->address << 2)), fetch->size); - uint32_t el_index = 0; + if (!has_bindless_vbos_) { + glBindVertexBuffer(n, scratch_buffer_.handle(), allocation.offset, + desc.stride_words * 4); + } + for (uint32_t i = 0; i < desc.element_count; ++i) { const auto& el = desc.elements[i]; auto comp_count = GetVertexFormatComponentCount(el.format); @@ -1882,13 +1890,19 @@ bool CommandProcessor::PopulateVertexBuffers(DrawCommand* draw_command) { assert_unhandled_case(el.format); break; } - size_t offset = el.offset_words * sizeof(uint32_t); glEnableVertexAttribArray(el_index); - glVertexAttribFormatNV(el_index, comp_count, comp_type, el.is_normalized, - desc.stride_words * sizeof(uint32_t)); - glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, el_index, - allocation.gpu_ptr + offset, - allocation.length - offset); + if (has_bindless_vbos_) { + glVertexAttribFormatNV(el_index, comp_count, comp_type, + el.is_normalized, + desc.stride_words * sizeof(uint32_t)); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, el_index, + allocation.gpu_ptr + (el.offset_words * 4), + allocation.length - (el.offset_words * 4)); + } else { + glVertexAttribBinding(el_index, n); + glVertexAttribFormat(el_index, comp_count, comp_type, el.is_normalized, + el.offset_words * 4); + } ++el_index; } @@ -1899,6 +1913,82 @@ bool CommandProcessor::PopulateVertexBuffers(DrawCommand* draw_command) { return true; } +bool CommandProcessor::PopulateSamplers(DrawCommand* draw_command) { + SCOPE_profile_cpu_f("gpu"); + + auto& regs = *register_file_; + + // VS and PS samplers are shared, but may be used exclusively. + // We walk each and setup lazily. + bool has_setup_sampler[32] = {false}; + + // Vertex texture samplers. + const auto& vertex_sampler_inputs = active_vertex_shader_->sampler_inputs(); + for (size_t i = 0; i < vertex_sampler_inputs.count; ++i) { + const auto& desc = vertex_sampler_inputs.descs[i]; + if (has_setup_sampler[desc.fetch_slot]) { + continue; + } + has_setup_sampler[desc.fetch_slot] = true; + if (!PopulateSampler(draw_command, desc)) { + return false; + } + } + + // Pixel shader texture sampler. 
+ const auto& pixel_sampler_inputs = active_pixel_shader_->sampler_inputs(); + for (size_t i = 0; i < pixel_sampler_inputs.count; ++i) { + const auto& desc = pixel_sampler_inputs.descs[i]; + if (has_setup_sampler[desc.fetch_slot]) { + continue; + } + has_setup_sampler[desc.fetch_slot] = true; + if (!PopulateSampler(draw_command, desc)) { + return false; + } + } + + return true; +} + +bool CommandProcessor::PopulateSampler(DrawCommand* draw_command, + const Shader::SamplerDesc& desc) { + auto& regs = *register_file_; + int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + desc.fetch_slot * 6; + auto group = reinterpret_cast(®s.values[r]); + auto& fetch = group->texture_fetch; + + // ? + assert_true(fetch.type == 0x2); + + TextureInfo texture_info; + if (!TextureInfo::Prepare(fetch, &texture_info)) { + XELOGE("Unable to parse texture fetcher info"); + return false; // invalid texture used + } + SamplerInfo sampler_info; + if (!SamplerInfo::Prepare(fetch, desc.tex_fetch, &sampler_info)) { + XELOGE("Unable to parse sampler info"); + return false; // invalid texture used + } + + uint32_t guest_base = fetch.address << 12; + void* host_base = membase_ + guest_base; + auto entry_view = texture_cache_.Demand(host_base, texture_info.input_length, + texture_info, sampler_info); + if (!entry_view) { + // Unable to create/fetch/etc. + XELOGE("Failed to demand texture"); + return false; + } + + // Shaders will use bindless to fetch right from it. + draw_command->state_data->texture_samplers[desc.fetch_slot] = + entry_view->texture_sampler_handle; + + return true; +} + bool CommandProcessor::IssueCopy(DrawCommand* draw_command) { auto& regs = *register_file_; @@ -2045,7 +2135,7 @@ bool CommandProcessor::IssueCopy(DrawCommand* draw_command) { case CopyCommand::kConstantOne: case CopyCommand::kNull: default: - assert_unhandled_case(copy_command); + // assert_unhandled_case(copy_command); return false; } glBindFramebuffer(GL_READ_FRAMEBUFFER, 0); diff --git a/src/xenia/gpu/gl4/command_processor.h b/src/xenia/gpu/gl4/command_processor.h index ff8441215..0cbfaec1b 100644 --- a/src/xenia/gpu/gl4/command_processor.h +++ b/src/xenia/gpu/gl4/command_processor.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,39 @@ struct SwapParameters { GLenum attachment; }; +// This must match the layout in gl4_shader.cc. +struct UniformDataBlock { + union float4 { + float v[4]; + struct { + float x, y, z, w; + }; + }; + + float4 window_offset; // tx,ty,rt_w,rt_h + float4 window_scissor; // x0,y0,x1,y1 + float4 viewport_offset; // tx,ty,tz,? + float4 viewport_scale; // sx,sy,sz,? + // TODO(benvanik): vertex format xyzw? + + float4 alpha_test; // alpha test enable, func, ref, ? + + // TODO(benvanik): overlay with fetch_consts below? + uint64_t texture_samplers[32]; + + // Register data from 0x4000 to 0x4927. + // SHADER_CONSTANT_000_X... + float4 float_consts[512]; + // SHADER_CONSTANT_FETCH_00_0... + uint32_t fetch_consts[32 * 6]; + // SHADER_CONSTANT_BOOL_000_031... + int32_t bool_consts[8]; + // SHADER_CONSTANT_LOOP_00... + int32_t loop_consts[32]; +}; +static_assert(sizeof(UniformDataBlock) <= 16 * 1024, + "Need <=16k uniform data"); + // TODO(benvanik): move more of the enums in here? struct DrawCommand { PrimitiveType prim_type; @@ -54,6 +88,7 @@ struct DrawCommand { size_t size; xenos::Endian endianness; xenos::IndexFormat format; + size_t buffer_offset; } index_buffer; // Texture samplers. 
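For orientation, this is the per-draw flow the new UniformDataBlock supports, consolidated from IssueDraw()/UpdateState()/UpdateConstants() in command_processor.cc into one illustrative sketch; it reuses the patch's CircularBuffer and UniformDataBlock types and is not itself part of the patch:

```cpp
// Carve a state block out of the persistently mapped scratch buffer.
CircularBuffer::Allocation alloc =
    scratch_buffer_.Acquire(sizeof(UniformDataBlock));
auto* state = reinterpret_cast<UniformDataBlock*>(alloc.host_ptr);

// UpdateState()/UpdateConstants()/PopulateSamplers() write into *state here:
// viewport/window parameters, alpha test, float/fetch/bool/loop constants,
// and the bindless texture_samplers handles.

// Expose exactly this sub-range to the shaders as SSBO binding 0, which is
// what "layout(binding = 0) buffer State { StateData state; };" in
// gl4_shader.cc reads from.
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, scratch_buffer_.handle(),
                  alloc.offset, alloc.length);

// Commit() retires the allocation; with GL_MAP_FLUSH_EXPLICIT_BIT set on the
// mapping, this is presumably also where the written range gets flushed.
scratch_buffer_.Commit(std::move(alloc));
```

From there the shaders read the block directly (state.float_consts[n], state.texture_samplers[i]), replacing the old GL_NV_shader_buffer_load "state->" pointer path.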
@@ -63,11 +98,9 @@ struct DrawCommand { // SamplerStateResource* sampler_state; }; SamplerInput vertex_shader_samplers[32]; - size_t vertex_shader_sampler_count; SamplerInput pixel_shader_samplers[32]; - size_t pixel_shader_sampler_count; - GLuint64 state_data_gpu_ptr; + UniformDataBlock* state_data; }; class CommandProcessor { @@ -195,11 +228,15 @@ class CommandProcessor { void PrepareDraw(DrawCommand* draw_command); bool IssueDraw(DrawCommand* draw_command); - bool UpdateState(DrawCommand* draw_command); bool UpdateRenderTargets(DrawCommand* draw_command); + bool UpdateState(DrawCommand* draw_command); + bool UpdateConstants(DrawCommand* draw_command); bool UpdateShaders(DrawCommand* draw_command); bool PopulateIndexBuffer(DrawCommand* draw_command); bool PopulateVertexBuffers(DrawCommand* draw_command); + bool PopulateSamplers(DrawCommand* draw_command); + bool PopulateSampler(DrawCommand* draw_command, + const Shader::SamplerDesc& desc); bool IssueCopy(DrawCommand* draw_command); CachedFramebuffer* GetFramebuffer(GLuint color_targets[4], @@ -237,7 +274,7 @@ class CommandProcessor { uint64_t bin_select_; uint64_t bin_mask_; - GLuint uniform_data_buffer_; + bool has_bindless_vbos_; std::vector> all_shaders_; std::unordered_map shader_cache_; @@ -251,7 +288,7 @@ class CommandProcessor { std::vector cached_depth_render_targets_; std::vector> all_pipelines_; std::unordered_map cached_pipelines_; - + TextureCache texture_cache_; CircularBuffer scratch_buffer_; DrawCommand draw_command_; diff --git a/src/xenia/gpu/gl4/gl4_gpu-private.h b/src/xenia/gpu/gl4/gl4_gpu-private.h index da11370f5..2b24c7f26 100644 --- a/src/xenia/gpu/gl4/gl4_gpu-private.h +++ b/src/xenia/gpu/gl4/gl4_gpu-private.h @@ -17,6 +17,9 @@ DECLARE_bool(thread_safe_gl); +DECLARE_bool(gl_debug_output); +DECLARE_bool(gl_debug_output_synchronous); + namespace xe { namespace gpu { namespace gl4 { diff --git a/src/xenia/gpu/gl4/gl4_gpu.cc b/src/xenia/gpu/gl4/gl4_gpu.cc index 6a3fe49b8..c27a4fb75 100644 --- a/src/xenia/gpu/gl4/gl4_gpu.cc +++ b/src/xenia/gpu/gl4/gl4_gpu.cc @@ -15,6 +15,10 @@ DEFINE_bool(thread_safe_gl, false, "Only allow one GL context to be active at a time."); +DEFINE_bool(gl_debug_output, false, "Dump ARB_debug_output to stderr."); +DEFINE_bool(gl_debug_output_synchronous, true, + "ARB_debug_output will synchronize to be thread safe."); + namespace xe { namespace gpu { namespace gl4 { diff --git a/src/xenia/gpu/gl4/gl4_shader.cc b/src/xenia/gpu/gl4/gl4_shader.cc index 2994ab627..a6b2df2ae 100644 --- a/src/xenia/gpu/gl4/gl4_shader.cc +++ b/src/xenia/gpu/gl4/gl4_shader.cc @@ -35,7 +35,6 @@ const std::string header = "#extension GL_ARB_explicit_uniform_location : require\n" "#extension GL_ARB_shading_language_420pack : require\n" "#extension GL_ARB_shader_storage_buffer_object : require\n" - "#extension GL_NV_shader_buffer_load : require\n" "precision highp float;\n" "precision highp int;\n" "layout(std140, column_major) uniform;\n" @@ -46,6 +45,7 @@ const std::string header = " vec4 viewport_offset;\n" " vec4 viewport_scale;\n" " vec4 alpha_test;\n" + " uvec2 texture_samplers[32];\n" " vec4 float_consts[512];\n" " uint fetch_consts[32 * 6];\n" " int bool_consts[8];\n" @@ -55,7 +55,9 @@ const std::string header = " vec4 o[16];\n" "};\n" "\n" - "uniform StateData* state;\n"; + "layout(binding = 0) buffer State {\n" + " StateData state;\n" + "};\n"; bool GL4Shader::PrepareVertexShader( const xenos::xe_gpu_program_cntl_t& program_cntl) { @@ -69,20 +71,20 @@ bool GL4Shader::PrepareVertexShader( // TODO(benvanik): 
piecewise viewport_enable -> offset/scale logic. " if (false) {\n" " } else {\n" - /*" pos.xy = pos.xy / vec2(state->window_offset.z / 2.0, " - "-state->window_offset.w / 2.0) + vec2(-1.0, 1.0);\n" + /*" pos.xy = pos.xy / vec2(state.window_offset.z / 2.0, " + "-state.window_offset.w / 2.0) + vec2(-1.0, 1.0);\n" " pos.zw = vec2(0.0, 1.0);\n"*/ " pos.xy = pos.xy / vec2(1280.0 / 2.0, " "-720.0 / 2.0) + vec2(-1.0, 1.0);\n" " //pos.zw = vec2(0.0, 1.0);\n" " }\n" - " pos.x = pos.x * state->viewport_scale.x + \n" - " state->viewport_offset.x;\n" - " pos.y = pos.y * state->viewport_scale.y + \n" - " state->viewport_offset.y;\n" - " pos.z = pos.z * state->viewport_scale.z + \n" - " state->viewport_offset.z;\n" - " pos.xy += state->window_offset.xy;\n" + " pos.x = pos.x * state.viewport_scale.x + \n" + " state.viewport_offset.x;\n" + " pos.y = pos.y * state.viewport_scale.y + \n" + " state.viewport_offset.y;\n" + " pos.z = pos.z * state.viewport_scale.z + \n" + " state.viewport_offset.z;\n" + " pos.xy += state.window_offset.xy;\n" " return pos;\n" "}\n"; std::string source = @@ -105,6 +107,8 @@ bool GL4Shader::PrepareVertexShader( " gl_Position = applyViewport(gl_Position);\n" "}\n"; + // glGetTextureSamplerHandleARB() + std::string translated_source = shader_translator_.TranslateVertexShader(this, program_cntl); if (translated_source.empty()) { @@ -135,9 +139,9 @@ bool GL4Shader::PreparePixelShader( "void processFragment();\n" "void main() {\n" " for (int i = 0; i < oC.length(); ++i) {\n" - " oC[i] = vec4(0.0, 0.0, 0.0, 0.0);\n" + " oC[i] = vec4(1.0, 0.0, 0.0, 1.0);\n" " }\n" + - (program_cntl.ps_export_depth ? " gl_FragDepth = 0.0\n" : "") + + (program_cntl.ps_export_depth ? " gl_FragDepth = 0.0;\n" : "") + " processFragment();\n" "}\n"; diff --git a/src/xenia/gpu/gl4/gl4_shader_translator.cc b/src/xenia/gpu/gl4/gl4_shader_translator.cc index f0b0c5bed..9ff76d3f0 100644 --- a/src/xenia/gpu/gl4/gl4_shader_translator.cc +++ b/src/xenia/gpu/gl4/gl4_shader_translator.cc @@ -28,25 +28,21 @@ static const char chan_names[] = { const char* GetVertexFormatTypeName(const GL4Shader::BufferDescElement& el) { switch (el.format) { case VertexFormat::k_32: - return el.is_signed ? "int" : "uint"; case VertexFormat::k_32_FLOAT: return "float"; case VertexFormat::k_16_16: case VertexFormat::k_32_32: - return el.is_signed ? "ivec2" : "uvec2"; case VertexFormat::k_16_16_FLOAT: case VertexFormat::k_32_32_FLOAT: return "vec2"; case VertexFormat::k_10_11_11: case VertexFormat::k_11_11_10: - return "int3"; // ? case VertexFormat::k_32_32_32_FLOAT: return "vec3"; case VertexFormat::k_8_8_8_8: case VertexFormat::k_2_10_10_10: case VertexFormat::k_16_16_16_16: case VertexFormat::k_32_32_32_32: - return el.is_signed ? "ivec4" : "uvec4"; case VertexFormat::k_16_16_16_16_FLOAT: case VertexFormat::k_32_32_32_32_FLOAT: return "vec4"; @@ -58,14 +54,13 @@ const char* GetVertexFormatTypeName(const GL4Shader::BufferDescElement& el) { } GL4ShaderTranslator::GL4ShaderTranslator() - : output_(kOutputCapacity), tex_fetch_index_(0), dwords_(nullptr) {} + : output_(kOutputCapacity), dwords_(nullptr) {} GL4ShaderTranslator::~GL4ShaderTranslator() = default; void GL4ShaderTranslator::Reset(GL4Shader* shader) { output_.Reset(); shader_type_ = shader->type(); - tex_fetch_index_ = 0; dwords_ = shader->data(); } @@ -76,8 +71,6 @@ std::string GL4ShaderTranslator::TranslateVertexShader( // Normal shaders only, for now. 
assert_true(program_cntl.vs_export_mode == 0); - AppendTextureHeader(vertex_shader->sampler_inputs()); - // Add vertex shader input. uint32_t el_index = 0; const auto& buffer_inputs = vertex_shader->buffer_inputs(); @@ -102,7 +95,7 @@ std::string GL4ShaderTranslator::TranslateVertexShader( // Add temporaries for any registers we may use. uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs; for (uint32_t n = 0; n <= temp_regs; n++) { - Append(" vec4 r%d = state->float_consts[%d];\n", n, n); + Append(" vec4 r%d = state.float_consts[%d];\n", n, n); } Append(" vec4 t;\n"); @@ -129,15 +122,13 @@ std::string GL4ShaderTranslator::TranslatePixelShader( // If the same PS is used with different VS that output different amounts // (and less than the number of required registers), things may die. - AppendTextureHeader(pixel_shader->sampler_inputs()); - // Pixel shader main() header. Append("void processFragment() {\n"); // Add temporary registers. uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs; for (uint32_t n = 0; n <= std::max(15u, temp_regs); n++) { - Append(" vec4 r%d = state->float_consts[%d];\n", n, n + 256); + Append(" vec4 r%d = state.float_consts[%d];\n", n, n + 256); } Append(" vec4 t;\n"); Append(" float s;\n"); // scalar result (used for RETAIN_PREV) @@ -161,42 +152,6 @@ std::string GL4ShaderTranslator::TranslatePixelShader( return output_.to_string(); } -void GL4ShaderTranslator::AppendTextureHeader( - const GL4Shader::SamplerInputs& sampler_inputs) { - bool fetch_setup[32] = {false}; - - // 1 texture per constant slot, 1 sampler per fetch. - for (uint32_t n = 0; n < sampler_inputs.count; n++) { - const auto& input = sampler_inputs.descs[n]; - const auto& fetch = input.tex_fetch; - - // Add texture, if needed. - if (!fetch_setup[fetch.const_idx]) { - fetch_setup[fetch.const_idx] = true; - const char* texture_type = nullptr; - switch (fetch.dimension) { - case DIMENSION_1D: - texture_type = "Texture1D"; - break; - default: - case DIMENSION_2D: - texture_type = "Texture2D"; - break; - case DIMENSION_3D: - texture_type = "Texture3D"; - break; - case DIMENSION_CUBE: - texture_type = "TextureCube"; - break; - } - Append("%s x_texture_%d;\n", texture_type, fetch.const_idx); - } - - // Add sampler. - Append("SamplerState x_sampler_%d;\n", n); - } -} - void GL4ShaderTranslator::AppendSrcReg(uint32_t num, uint32_t type, uint32_t swiz, uint32_t negate, uint32_t abs_constants) { @@ -217,7 +172,7 @@ void GL4ShaderTranslator::AppendSrcReg(uint32_t num, uint32_t type, if (abs_constants) { Append("abs("); } - Append("state->float_consts[%u]", is_pixel_shader() ? num + 256 : num); + Append("state.float_consts[%u]", is_pixel_shader() ? num + 256 : num); if (abs_constants) { Append(")"); } @@ -258,9 +213,12 @@ void GL4ShaderTranslator::AppendDestRegName(uint32_t num, uint32_t dst_exp) { case 0: Append("oC[0]"); break; + case 61: + // Write to t, as we need to splice just x out of it. + Append("t"); + break; default: // TODO(benvanik): other render targets? - // TODO(benvanik): depth? assert_always(); break; } @@ -282,7 +240,10 @@ void GL4ShaderTranslator::AppendDestReg(uint32_t num, uint32_t mask, void GL4ShaderTranslator::AppendDestRegPost(uint32_t num, uint32_t mask, uint32_t dst_exp) { - if (mask != 0xF) { + if (num == 61) { + // gl_FragDepth handling to just get x from the temp result. + Append(" gl_FragDepth = t.x;\n"); + } else if (mask != 0xF) { // Masking. 
Append(" "); AppendDestRegName(num, dst_exp); @@ -399,7 +360,7 @@ bool GL4ShaderTranslator::TranslateALU_ADDv(const instr_alu_t& alu) { alu.abs_constants); Append(")"); if (alu.vector_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); @@ -685,7 +646,7 @@ bool GL4ShaderTranslator::TranslateALU_DOT4v(const instr_alu_t& alu) { if (alu.vector_clamp) { Append(", 0.0, 1.0)"); } - Append(";\n"); + Append(".xxxx;\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); return true; } @@ -706,7 +667,7 @@ bool GL4ShaderTranslator::TranslateALU_DOT3v(const instr_alu_t& alu) { if (alu.vector_clamp) { Append(", 0.0, 1.0)"); } - Append(";\n"); + Append(".xxxx;\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); return true; } @@ -730,7 +691,7 @@ bool GL4ShaderTranslator::TranslateALU_DOT2ADDv(const instr_alu_t& alu) { if (alu.vector_clamp) { Append(", 0.0, 1.0)"); } - Append(";\n"); + Append(".xxxx;\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); return true; } @@ -1402,20 +1363,27 @@ bool GL4ShaderTranslator::TranslateVertexFetch(const instr_fetch_vtx_t* vtx, bool GL4ShaderTranslator::TranslateTextureFetch(const instr_fetch_tex_t* tex, int sync) { int src_component_count = 0; + const char* sampler_type; switch (tex->dimension) { case DIMENSION_1D: src_component_count = 1; + sampler_type = "sampler1D"; break; - default: case DIMENSION_2D: src_component_count = 2; + sampler_type = "sampler2D"; break; case DIMENSION_3D: src_component_count = 3; + sampler_type = "sampler3D"; break; case DIMENSION_CUBE: src_component_count = 3; + sampler_type = "samplerCube"; break; + default: + assert_unhandled_case(tex->dimension); + return false; } // Disassemble. @@ -1500,10 +1468,10 @@ bool GL4ShaderTranslator::TranslateTextureFetch(const instr_fetch_tex_t* tex, Append("\n"); // Translate. - Append(" t = "); - Append("x_texture_%d.Sample(x_sampler_%d, r%u.", tex->const_idx, - tex_fetch_index_++, // hacky way to line up to tex buffers - tex->src_reg); + // TODO(benvanik): if sampler == null, set to invalid color. + Append(" t = texture("); + Append("%s(state.texture_samplers[%d])", sampler_type, tex->const_idx & 0xF); + Append(", r%u.", tex->src_reg); src_swiz = tex->src_swiz; for (int i = 0; i < src_component_count; i++) { Append("%c", chan_names[src_swiz & 0x3]); @@ -1511,6 +1479,26 @@ bool GL4ShaderTranslator::TranslateTextureFetch(const instr_fetch_tex_t* tex, } Append(");\n"); + // Output texture coordinates as color. + // TODO(benvanik): only if texture is invalid? + // Append(" t = vec4(r%u.", tex->src_reg); + // src_swiz = tex->src_swiz; + // for (int i = 0; i < src_component_count; i++) { + // Append("%c", chan_names[src_swiz & 0x3]); + // src_swiz >>= 2; + //} + // switch (src_component_count) { + // case 1: + // Append(", 0.0, 0.0, 1.0);\n"); + // break; + // case 2: + // Append(", 0.0, 1.0);\n"); + // break; + // case 3: + // Append(", 1.0);\n"); + // break; + //} + Append(" r%u.xyzw = vec4(", tex->dst_reg); uint32_t dst_swiz = tex->dst_swiz; for (int i = 0; i < 4; i++) { @@ -1524,6 +1512,7 @@ bool GL4ShaderTranslator::TranslateTextureFetch(const instr_fetch_tex_t* tex, } else if ((dst_swiz & 0x7) == 6) { // ? 
Append("?"); + assert_always(); } else if ((dst_swiz & 0x7) == 7) { Append("r%u.%c", tex->dst_reg, chan_names[i]); } else { diff --git a/src/xenia/gpu/gl4/gl4_shader_translator.h b/src/xenia/gpu/gl4/gl4_shader_translator.h index 64da30b04..984483744 100644 --- a/src/xenia/gpu/gl4/gl4_shader_translator.h +++ b/src/xenia/gpu/gl4/gl4_shader_translator.h @@ -39,7 +39,6 @@ class GL4ShaderTranslator { protected: ShaderType shader_type_; - uint32_t tex_fetch_index_; const uint32_t* dwords_; static const int kOutputCapacity = 64 * 1024; @@ -56,8 +55,6 @@ class GL4ShaderTranslator { va_end(args); } - void AppendTextureHeader(const GL4Shader::SamplerInputs& sampler_inputs); - void AppendSrcReg(uint32_t num, uint32_t type, uint32_t swiz, uint32_t negate, uint32_t abs); void AppendDestRegName(uint32_t num, uint32_t dst_exp); diff --git a/src/xenia/gpu/gl4/gl_context.cc b/src/xenia/gpu/gl4/gl_context.cc index 3f4f48a01..bd1e85160 100644 --- a/src/xenia/gpu/gl4/gl_context.cc +++ b/src/xenia/gpu/gl4/gl_context.cc @@ -115,6 +115,8 @@ bool GLContext::Initialize(HWND hwnd) { // Clearing errors. } + SetupDebugging(); + ClearCurrent(); return true; @@ -160,11 +162,120 @@ std::unique_ptr GLContext::CreateShared() { return nullptr; } + SetupDebugging(); + new_context->ClearCurrent(); return new_context; } +void GLContext::DebugMessage(GLenum source, GLenum type, GLuint id, + GLenum severity, GLsizei length, + const GLchar* message) { + const char* source_name = nullptr; + switch (source) { + case GL_DEBUG_SOURCE_API_ARB: + source_name = "OpenGL"; + break; + case GL_DEBUG_SOURCE_WINDOW_SYSTEM_ARB: + source_name = "Windows"; + break; + case GL_DEBUG_SOURCE_SHADER_COMPILER_ARB: + source_name = "Shader Compiler"; + break; + case GL_DEBUG_SOURCE_THIRD_PARTY_ARB: + source_name = "Third Party"; + break; + case GL_DEBUG_SOURCE_APPLICATION_ARB: + source_name = "Application"; + break; + case GL_DEBUG_SOURCE_OTHER_ARB: + source_name = "Other"; + break; + default: + source_name = "(unknown source)"; + break; + } + + const char* type_name = nullptr; + switch (type) { + case GL_DEBUG_TYPE_ERROR: + type_name = "error"; + break; + case GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR: + type_name = "deprecated behavior"; + break; + case GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR: + type_name = "undefined behavior"; + break; + case GL_DEBUG_TYPE_PORTABILITY: + type_name = "portability"; + break; + case GL_DEBUG_TYPE_PERFORMANCE: + type_name = "performance"; + break; + case GL_DEBUG_TYPE_OTHER: + type_name = "message"; + break; + case GL_DEBUG_TYPE_MARKER: + type_name = "marker"; + break; + case GL_DEBUG_TYPE_PUSH_GROUP: + type_name = "push group"; + break; + case GL_DEBUG_TYPE_POP_GROUP: + type_name = "pop group"; + break; + default: + type_name = "(unknown type)"; + break; + } + + const char* severity_name = nullptr; + switch (severity) { + case GL_DEBUG_SEVERITY_HIGH_ARB: + severity_name = "high"; + break; + case GL_DEBUG_SEVERITY_MEDIUM_ARB: + severity_name = "medium"; + break; + case GL_DEBUG_SEVERITY_LOW_ARB: + severity_name = "low"; + break; + case GL_DEBUG_SEVERITY_NOTIFICATION: + severity_name = "notification"; + break; + default: + severity_name = "(unknown severity)"; + break; + } + + XELOGE("GL4 %s: %s(%s) %d: %s", source_name, type_name, severity_name, id, + message); +} + +void GLAPIENTRY +GLContext::DebugMessageThunk(GLenum source, GLenum type, GLuint id, + GLenum severity, GLsizei length, + const GLchar* message, GLvoid* user_param) { + reinterpret_cast(user_param) + ->DebugMessage(source, type, id, severity, length, message); +} + +void 
GLContext::SetupDebugging() { + if (!FLAGS_gl_debug_output) { + return; + } + glEnable(GL_DEBUG_OUTPUT); + if (FLAGS_gl_debug_output_synchronous) { + glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); + } + glDebugMessageControl(GL_DONT_CARE, GL_DONT_CARE, GL_DONT_CARE, 0, NULL, + GL_TRUE); + glDebugMessageCallback(reinterpret_cast(&DebugMessageThunk), + this); +} + bool GLContext::MakeCurrent() { if (FLAGS_thread_safe_gl) { global_gl_mutex_.lock(); diff --git a/src/xenia/gpu/gl4/gl_context.h b/src/xenia/gpu/gl4/gl_context.h index 68386b773..196c99a15 100644 --- a/src/xenia/gpu/gl4/gl_context.h +++ b/src/xenia/gpu/gl4/gl_context.h @@ -35,6 +35,13 @@ class GLContext { void ClearCurrent(); private: + void SetupDebugging(); + void DebugMessage(GLenum source, GLenum type, GLuint id, GLenum severity, + GLsizei length, const GLchar* message); + static void GLAPIENTRY + DebugMessageThunk(GLenum source, GLenum type, GLuint id, GLenum severity, + GLsizei length, const GLchar* message, GLvoid* user_param); + HWND hwnd_; HDC dc_; HGLRC glrc_; diff --git a/src/xenia/gpu/gl4/sources.gypi b/src/xenia/gpu/gl4/sources.gypi index 3f0c349ce..bdeeae80a 100644 --- a/src/xenia/gpu/gl4/sources.gypi +++ b/src/xenia/gpu/gl4/sources.gypi @@ -16,6 +16,8 @@ 'gl4_shader_translator.h', 'gl_context.cc', 'gl_context.h', + 'texture_cache.cc', + 'texture_cache.h', ], 'conditions': [ diff --git a/src/xenia/gpu/gl4/texture_cache.cc b/src/xenia/gpu/gl4/texture_cache.cc new file mode 100644 index 000000000..0aff172bd --- /dev/null +++ b/src/xenia/gpu/gl4/texture_cache.cc @@ -0,0 +1,497 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include +#include + +namespace xe { +namespace gpu { +namespace gl4 { + +using namespace xe::gpu::xenos; + +extern "C" GLEWContext* glewGetContext(); +extern "C" WGLEWContext* wglewGetContext(); + +TextureCache::TextureCache() { + // +} + +TextureCache::~TextureCache() { Shutdown(); } + +bool TextureCache::Initialize(CircularBuffer* scratch_buffer) { + scratch_buffer_ = scratch_buffer; + return true; +} + +void TextureCache::Shutdown() { + Clear(); + // +} + +void TextureCache::Clear() { + for (auto& entry : entries_) { + for (auto& view : entry.views) { + glMakeTextureHandleNonResidentARB(view.texture_sampler_handle); + glDeleteSamplers(1, &view.sampler); + } + glDeleteTextures(1, &entry.base_texture); + } + entries_.clear(); +} + +TextureCache::EntryView* TextureCache::Demand(void* host_base, size_t length, + const TextureInfo& texture_info, + const SamplerInfo& sampler_info) { + entries_.emplace_back(Entry()); + auto& entry = entries_.back(); + entry.texture_info = texture_info; + + GLenum target; + switch (texture_info.dimension) { + case Dimension::k1D: + target = GL_TEXTURE_1D; + break; + case Dimension::k2D: + target = GL_TEXTURE_2D; + break; + case Dimension::k3D: + target = GL_TEXTURE_3D; + break; + case Dimension::kCube: + target = GL_TEXTURE_CUBE_MAP; + break; + } + + // Setup the base texture. + glCreateTextures(target, 1, &entry.base_texture); + if (!SetupTexture(entry.base_texture, texture_info)) { + PLOGE("Unable to setup texture parameters"); + return false; + } + + // Upload/convert. 
+ bool uploaded = false; + switch (texture_info.dimension) { + case Dimension::k2D: + uploaded = UploadTexture2D(entry.base_texture, host_base, length, + texture_info, sampler_info); + break; + case Dimension::k1D: + case Dimension::k3D: + case Dimension::kCube: + assert_unhandled_case(texture_info.dimension); + return false; + } + if (!uploaded) { + PLOGE("Failed to convert/upload texture"); + return false; + } + + entry.views.emplace_back(EntryView()); + auto& entry_view = entry.views.back(); + entry_view.sampler_info = sampler_info; + + // Setup the sampler. + glCreateSamplers(1, &entry_view.sampler); + if (!SetupSampler(entry_view.sampler, texture_info, sampler_info)) { + PLOGE("Unable to setup texture sampler parameters"); + return false; + } + + // Get the uvec2 handle to the texture/sampler pair and make it resident. + // The handle can be passed directly to the shader. + entry_view.texture_sampler_handle = + glGetTextureSamplerHandleARB(entry.base_texture, entry_view.sampler); + if (!entry_view.texture_sampler_handle) { + return nullptr; + } + glMakeTextureHandleResidentARB(entry_view.texture_sampler_handle); + + return &entry_view; +} + +bool TextureCache::SetupTexture(GLuint texture, + const TextureInfo& texture_info) { + // TODO(benvanik): texture mip levels. + glTextureParameteri(texture, GL_TEXTURE_BASE_LEVEL, 0); + glTextureParameteri(texture, GL_TEXTURE_MAX_LEVEL, 1); + + // Pre-shader swizzle. + // TODO(benvanik): can this be dynamic? Maybe per view? + // We may have to emulate this in the shader. + uint32_t swizzle_r = texture_info.swizzle & 0x7; + uint32_t swizzle_g = (texture_info.swizzle >> 3) & 0x7; + uint32_t swizzle_b = (texture_info.swizzle >> 6) & 0x7; + uint32_t swizzle_a = (texture_info.swizzle >> 9) & 0x7; + static const GLenum swizzle_map[] = { + GL_RED, GL_GREEN, GL_BLUE, GL_ALPHA, GL_ZERO, GL_ONE, + }; + glTextureParameteri(texture, GL_TEXTURE_SWIZZLE_R, swizzle_map[swizzle_r]); + glTextureParameteri(texture, GL_TEXTURE_SWIZZLE_G, swizzle_map[swizzle_g]); + glTextureParameteri(texture, GL_TEXTURE_SWIZZLE_B, swizzle_map[swizzle_b]); + glTextureParameteri(texture, GL_TEXTURE_SWIZZLE_A, swizzle_map[swizzle_a]); + + return true; +} + +bool TextureCache::SetupSampler(GLuint sampler, const TextureInfo& texture_info, + const SamplerInfo& sampler_info) { + // TODO(benvanik): border color from texture fetch. + GLfloat border_color[4] = {0.0f}; + glSamplerParameterfv(sampler, GL_TEXTURE_BORDER_COLOR, border_color); + + // TODO(benvanik): setup LODs for mipmapping. + glSamplerParameterf(sampler, GL_TEXTURE_LOD_BIAS, 0.0f); + glSamplerParameterf(sampler, GL_TEXTURE_MIN_LOD, 0.0f); + glSamplerParameterf(sampler, GL_TEXTURE_MAX_LOD, 0.0f); + + // Texture wrapping modes. + // TODO(benvanik): not sure if the middle ones are correct. + static const GLenum wrap_map[] = { + GL_REPEAT, // + GL_MIRRORED_REPEAT, // + GL_CLAMP_TO_EDGE, // + GL_MIRROR_CLAMP_TO_EDGE, // + GL_CLAMP_TO_BORDER, // ? + GL_MIRROR_CLAMP_TO_BORDER_EXT, // ? + GL_CLAMP_TO_BORDER, // + GL_MIRROR_CLAMP_TO_BORDER_EXT, // + }; + glSamplerParameteri(sampler, GL_TEXTURE_WRAP_S, + wrap_map[sampler_info.clamp_u]); + glSamplerParameteri(sampler, GL_TEXTURE_WRAP_T, + wrap_map[sampler_info.clamp_v]); + glSamplerParameteri(sampler, GL_TEXTURE_WRAP_R, + wrap_map[sampler_info.clamp_w]); + + // Texture level filtering. 
+ GLenum min_filter; + switch (sampler_info.min_filter) { + case ucode::TEX_FILTER_POINT: + switch (sampler_info.mip_filter) { + case ucode::TEX_FILTER_BASEMAP: + min_filter = GL_NEAREST; + break; + case ucode::TEX_FILTER_POINT: + // min_filter = GL_NEAREST_MIPMAP_NEAREST; + min_filter = GL_NEAREST; + break; + case ucode::TEX_FILTER_LINEAR: + // min_filter = GL_NEAREST_MIPMAP_LINEAR; + min_filter = GL_NEAREST; + break; + default: + assert_unhandled_case(sampler_info.mip_filter); + return false; + } + break; + case ucode::TEX_FILTER_LINEAR: + switch (sampler_info.mip_filter) { + case ucode::TEX_FILTER_BASEMAP: + min_filter = GL_LINEAR; + break; + case ucode::TEX_FILTER_POINT: + // min_filter = GL_LINEAR_MIPMAP_NEAREST; + min_filter = GL_LINEAR; + break; + case ucode::TEX_FILTER_LINEAR: + // min_filter = GL_LINEAR_MIPMAP_LINEAR; + min_filter = GL_LINEAR; + break; + default: + assert_unhandled_case(sampler_info.mip_filter); + return false; + } + break; + default: + assert_unhandled_case(sampler_info.min_filter); + return false; + } + GLenum mag_filter; + switch (sampler_info.mag_filter) { + case ucode::TEX_FILTER_POINT: + mag_filter = GL_NEAREST; + break; + case ucode::TEX_FILTER_LINEAR: + mag_filter = GL_LINEAR; + break; + default: + assert_unhandled_case(mag_filter); + return false; + } + glSamplerParameteri(sampler, GL_TEXTURE_MIN_FILTER, min_filter); + glSamplerParameteri(sampler, GL_TEXTURE_MAG_FILTER, mag_filter); + + // TODO(benvanik): anisotropic filtering. + // GL_TEXTURE_MAX_ANISOTROPY_EXT + + return true; +} + +void TextureSwap(Endian endianness, void* dest, const void* src, + size_t length) { + switch (endianness) { + case Endian::k8in16: + poly::copy_and_swap_16_aligned(reinterpret_cast(dest), + reinterpret_cast(src), + length / 2); + break; + case Endian::k8in32: + poly::copy_and_swap_32_aligned(reinterpret_cast(dest), + reinterpret_cast(src), + length / 4); + break; + case Endian::k16in32: + // TODO(benvanik): make more efficient. + /*for (uint32_t i = 0; i < length; i += 4, src += 4, dest += 4) { + uint32_t value = *(uint32_t*)src; + *(uint32_t*)dest = ((value >> 16) & 0xFFFF) | (value << 16); + }*/ + assert_always("16in32 not supported"); + break; + default: + case Endian::kUnspecified: + std::memcpy(dest, src, length); + break; + } +} + +bool TextureCache::UploadTexture2D(GLuint texture, void* host_base, + size_t length, + const TextureInfo& texture_info, + const SamplerInfo& sampler_info) { + assert_true(length == texture_info.input_length); + + GLenum internal_format = GL_RGBA8; + GLenum format = GL_RGBA; + GLenum type = GL_UNSIGNED_BYTE; + // https://code.google.com/p/glsnewton/source/browse/trunk/Source/uDDSLoader.pas?r=62 + // http://dench.flatlib.jp/opengl/textures + // http://fossies.org/linux/WebKit/Source/ThirdParty/ANGLE/src/libGLESv2/formatutils.cpp + switch (texture_info.format) { + case TextureFormat::k_8: + internal_format = GL_R8; + format = GL_R; + type = GL_UNSIGNED_BYTE; + break; + case TextureFormat::k_1_5_5_5: + internal_format = GL_RGB5_A1; + format = GL_BGRA; + type = GL_UNSIGNED_SHORT_1_5_5_5_REV; + break; + case TextureFormat::k_5_6_5: + internal_format = GL_RGB565; + format = GL_RGB; + type = GL_UNSIGNED_SHORT_5_6_5; + break; + case TextureFormat::k_2_10_10_10: + case TextureFormat::k_2_10_10_10_AS_16_16_16_16: + internal_format = GL_RGB10_A2; + format = GL_RGBA; + type = GL_UNSIGNED_INT_2_10_10_10_REV; + break; + case TextureFormat::k_10_11_11: + case TextureFormat::k_10_11_11_AS_16_16_16_16: + // ? 
+ internal_format = GL_R11F_G11F_B10F; + format = GL_RGB; + type = GL_UNSIGNED_INT_10F_11F_11F_REV; + break; + case TextureFormat::k_11_11_10: + case TextureFormat::k_11_11_10_AS_16_16_16_16: + internal_format = GL_R11F_G11F_B10F; + format = GL_RGB; + type = GL_UNSIGNED_INT_10F_11F_11F_REV; + break; + case TextureFormat::k_8_8_8_8: + case TextureFormat::k_8_8_8_8_AS_16_16_16_16: + internal_format = GL_RGBA8; + format = GL_RGBA; + type = GL_UNSIGNED_BYTE; + break; + case TextureFormat::k_4_4_4_4: + internal_format = GL_RGBA4; + format = GL_RGBA; + type = GL_UNSIGNED_SHORT_4_4_4_4; + break; + case TextureFormat::k_16_FLOAT: + internal_format = GL_R16F; + format = GL_RED; + type = GL_HALF_FLOAT; + break; + case TextureFormat::k_16_16_FLOAT: + internal_format = GL_RG16F; + format = GL_RG; + type = GL_HALF_FLOAT; + break; + case TextureFormat::k_16_16_16_16_FLOAT: + internal_format = GL_RGBA16F; + format = GL_RGBA; + type = GL_HALF_FLOAT; + break; + case TextureFormat::k_32_FLOAT: + internal_format = GL_R32F; + format = GL_R; + type = GL_FLOAT; + break; + case TextureFormat::k_32_32_FLOAT: + internal_format = GL_RG32F; + format = GL_RG; + type = GL_FLOAT; + break; + case TextureFormat::k_32_32_32_FLOAT: + internal_format = GL_RGB32F; + format = GL_RGB; + type = GL_FLOAT; + break; + case TextureFormat::k_32_32_32_32_FLOAT: + internal_format = GL_RGBA32F; + format = GL_RGBA; + type = GL_FLOAT; + break; + case TextureFormat::k_DXT1: + case TextureFormat::k_DXT1_AS_16_16_16_16: + // or GL_COMPRESSED_RGB_S3TC_DXT1_EXT? + internal_format = format = GL_COMPRESSED_RGBA_S3TC_DXT1_EXT; + break; + case TextureFormat::k_DXT2_3: + case TextureFormat::k_DXT2_3_AS_16_16_16_16: + internal_format = format = GL_COMPRESSED_RGBA_S3TC_DXT3_EXT; + break; + case TextureFormat::k_DXT4_5: + case TextureFormat::k_DXT4_5_AS_16_16_16_16: + internal_format = format = GL_COMPRESSED_RGBA_S3TC_DXT5_EXT; + break; + case TextureFormat::k_24_8: + internal_format = GL_DEPTH24_STENCIL8; + format = GL_DEPTH_STENCIL; + type = GL_UNSIGNED_INT_24_8; + break; + case TextureFormat::k_24_8_FLOAT: + internal_format = GL_DEPTH24_STENCIL8; + format = GL_DEPTH_STENCIL; + type = GL_FLOAT_32_UNSIGNED_INT_24_8_REV; + break; + default: + case TextureFormat::k_1_REVERSE: + case TextureFormat::k_1: + case TextureFormat::k_6_5_5: + case TextureFormat::k_8_A: + case TextureFormat::k_8_B: + case TextureFormat::k_8_8: + case TextureFormat::k_Cr_Y1_Cb_Y0: + case TextureFormat::k_Y1_Cr_Y0_Cb: + case TextureFormat::k_8_8_8_8_A: + case TextureFormat::k_16: + case TextureFormat::k_16_16: + case TextureFormat::k_16_16_16_16: + case TextureFormat::k_16_EXPAND: + case TextureFormat::k_16_16_EXPAND: + case TextureFormat::k_16_16_16_16_EXPAND: + case TextureFormat::k_32_32: + case TextureFormat::k_32_32_32_32: + case TextureFormat::k_32_AS_8: + case TextureFormat::k_32_AS_8_8: + case TextureFormat::k_16_MPEG: + case TextureFormat::k_16_16_MPEG: + case TextureFormat::k_8_INTERLACED: + case TextureFormat::k_32_AS_8_INTERLACED: + case TextureFormat::k_32_AS_8_8_INTERLACED: + case TextureFormat::k_16_INTERLACED: + case TextureFormat::k_16_MPEG_INTERLACED: + case TextureFormat::k_16_16_MPEG_INTERLACED: + case TextureFormat::k_DXN: + case TextureFormat::k_DXT3A: + case TextureFormat::k_DXT5A: + case TextureFormat::k_CTX1: + case TextureFormat::k_DXT3A_AS_1_1_1_1: + assert_unhandled_case(texture_info.format); + return false; + } + + size_t unpack_length = texture_info.input_length; + glTextureStorage2D(texture, 1, internal_format, + texture_info.size_2d.output_width, 
+                     texture_info.size_2d.output_height);
+  assert_true(unpack_length % 4 == 0);
+
+  auto allocation = scratch_buffer_->Acquire(unpack_length);
+
+  if (!texture_info.is_tiled) {
+    TextureSwap(texture_info.endianness, allocation.host_ptr, host_base,
+                unpack_length);
+    /*const uint8_t* src = reinterpret_cast<const uint8_t*>(host_base);
+    uint8_t* dest = reinterpret_cast<uint8_t*>(allocation.host_ptr);
+    for (uint32_t y = 0; y < texture_info.size_2d.block_height; y++) {
+      for (uint32_t x = 0; x < texture_info.size_2d.logical_pitch;
+           x += texture_info.texel_pitch) {
+        TextureSwap(texture_info.endianness, dest + x, src + x,
+                    texture_info.texel_pitch);
+      }
+      src += texture_info.size_2d.input_pitch;
+      dest += texture_info.size_2d.input_pitch;
+    }*/
+    // std::memcpy(dest, src, unpack_length);
+  } else {
+    uint8_t* src = reinterpret_cast<uint8_t*>(host_base);
+    uint8_t* dest = reinterpret_cast<uint8_t*>(allocation.host_ptr);
+    uint32_t output_pitch =
+        (texture_info.size_2d.output_width / texture_info.block_size) *
+        texture_info.texel_pitch;
+    auto bpp =
+        (texture_info.texel_pitch >> 2) +
+        ((texture_info.texel_pitch >> 1) >> (texture_info.texel_pitch >> 2));
+    for (uint32_t y = 0, output_base_offset = 0;
+         y < texture_info.size_2d.block_height;
+         y++, output_base_offset += output_pitch) {
+      auto input_base_offset = TextureInfo::TiledOffset2DOuter(
+          y, (texture_info.size_2d.input_width / texture_info.block_size), bpp);
+      for (uint32_t x = 0, output_offset = output_base_offset;
+           x < texture_info.size_2d.block_width;
+           x++, output_offset += texture_info.texel_pitch) {
+        auto input_offset =
+            TextureInfo::TiledOffset2DInner(x, y, bpp, input_base_offset) >>
+            bpp;
+        TextureSwap(texture_info.endianness, dest + output_offset,
+                    src + input_offset * texture_info.texel_pitch,
+                    texture_info.texel_pitch);
+      }
+    }
+  }
+  size_t unpack_offset = allocation.offset;
+  scratch_buffer_->Commit(std::move(allocation));
+
+  // glPixelStorei(GL_UNPACK_SWAP_BYTES, GL_TRUE);
+  // glPixelStorei(GL_UNPACK_ALIGNMENT, texture_info.texel_pitch);
+  glPixelStorei(GL_UNPACK_ROW_LENGTH, texture_info.size_2d.input_width);
+  glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, scratch_buffer_->handle());
+  if (texture_info.is_compressed) {
+    glCompressedTextureSubImage2D(texture, 0, 0, 0,
+                                  texture_info.size_2d.output_width,
+                                  texture_info.size_2d.output_height, format,
+                                  static_cast<GLsizei>(unpack_length),
+                                  reinterpret_cast<void*>(unpack_offset));
+  } else {
+    glTextureSubImage2D(texture, 0, 0, 0, texture_info.size_2d.output_width,
+                        texture_info.size_2d.output_height, format, type,
+                        reinterpret_cast<void*>(unpack_offset));
+  }
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+  return true;
+}
+
+}  // namespace gl4
+}  // namespace gpu
+}  // namespace xe
diff --git a/src/xenia/gpu/gl4/texture_cache.h b/src/xenia/gpu/gl4/texture_cache.h
new file mode 100644
index 000000000..f4816d981
--- /dev/null
+++ b/src/xenia/gpu/gl4/texture_cache.h
@@ -0,0 +1,65 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved. *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_GPU_GL4_TEXTURE_CACHE_H_
+#define XENIA_GPU_GL4_TEXTURE_CACHE_H_
+
+#include <vector>
+
+#include <xenia/gpu/gl4/circular_buffer.h>
+#include <xenia/gpu/gl4/gl_context.h>
+#include <xenia/gpu/sampler_info.h>
+#include <xenia/gpu/texture_info.h>
+
+namespace xe {
+namespace gpu {
+namespace gl4 {
+
+class TextureCache {
+ public:
+  struct EntryView {
+    SamplerInfo sampler_info;
+    GLuint sampler;
+    GLuint64 texture_sampler_handle;
+  };
+  struct Entry {
+    TextureInfo texture_info;
+    GLuint base_texture;
+    std::vector<EntryView> views;
+  };
+
+  TextureCache();
+  ~TextureCache();
+
+  bool Initialize(CircularBuffer* scratch_buffer);
+  void Shutdown();
+  void Clear();
+
+  EntryView* Demand(void* host_base, size_t length,
+                    const TextureInfo& texture_info,
+                    const SamplerInfo& sampler_info);
+
+ private:
+  bool SetupTexture(GLuint texture, const TextureInfo& texture_info);
+  bool SetupSampler(GLuint sampler, const TextureInfo& texture_info,
+                    const SamplerInfo& sampler_info);
+
+  bool UploadTexture2D(GLuint texture, void* host_base, size_t length,
+                       const TextureInfo& texture_info,
+                       const SamplerInfo& sampler_info);
+
+  CircularBuffer* scratch_buffer_;
+  std::vector<Entry> entries_;
+};
+
+}  // namespace gl4
+}  // namespace gpu
+}  // namespace xe
+
+#endif  // XENIA_GPU_GL4_TEXTURE_CACHE_H_
diff --git a/src/xenia/gpu/gl4/wgl_control.cc b/src/xenia/gpu/gl4/wgl_control.cc
index 5571a1784..032ed7c9d 100644
--- a/src/xenia/gpu/gl4/wgl_control.cc
+++ b/src/xenia/gpu/gl4/wgl_control.cc
@@ -74,17 +74,32 @@ LRESULT WGLControl::WndProc(HWND hWnd, UINT message, WPARAM wParam,
                             LPARAM lParam) {
   switch (message) {
     case WM_PAINT: {
-      GLContextLock context_lock(&context_);
-      // TODO(benvanik): is viewport needed?
-      glViewport(0, 0, width_, height_);
-      float clear_color[] = {rand() / (float)RAND_MAX, 1.0f, 0, 1.0f};
-      glClearNamedFramebufferfv(0, GL_COLOR, 0, clear_color);
-      if (current_paint_callback_) {
-        current_paint_callback_();
-        current_paint_callback_ = nullptr;
+      {
+        GLContextLock context_lock(&context_);
+        wglSwapIntervalEXT(0);
+
+        // TODO(benvanik): is viewport needed?
+        glViewport(0, 0, width_, height_);
+        float clear_color[] = {rand() / (float)RAND_MAX, 1.0f, 0, 1.0f};
+        glClearNamedFramebufferfv(0, GL_COLOR, 0, clear_color);
+
+        if (current_paint_callback_) {
+          current_paint_callback_();
+          current_paint_callback_ = nullptr;
+        }
+
+        // TODO(benvanik): profiler present.
+        // Profiler::Present();
+
+        // Hacky swap timer.
+        static int swap_count = 0;
+        glEnable(GL_SCISSOR_TEST);
+        glScissor(0, 0, 20, 20);
+        float red[] = {swap_count / 60.0f, 0, 0, 1.0f};
+        swap_count = (swap_count + 1) % 60;
+        glClearNamedFramebufferfv(0, GL_COLOR, 0, red);
+        glDisable(GL_SCISSOR_TEST);
       }
-      // TODO(benvanik): profiler present.
-      // Profiler::Present();
       SwapBuffers(context_.dc());
     } break;
   }
diff --git a/src/xenia/gpu/sampler_info.cc b/src/xenia/gpu/sampler_info.cc
new file mode 100644
index 000000000..f260f7bfc
--- /dev/null
+++ b/src/xenia/gpu/sampler_info.cc
@@ -0,0 +1,31 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved. *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include <xenia/gpu/sampler_info.h>
+
+namespace xe {
+namespace gpu {
+
+bool SamplerInfo::Prepare(const xenos::xe_gpu_texture_fetch_t& fetch,
+                          const ucode::instr_fetch_tex_t& fetch_instr,
+                          SamplerInfo* out_info) {
+  out_info->min_filter = static_cast<ucode::instr_tex_filter_t>(
+      fetch_instr.min_filter == 3 ? fetch.min_filter : fetch_instr.min_filter);
+  out_info->mag_filter = static_cast<ucode::instr_tex_filter_t>(
+      fetch_instr.mag_filter == 3 ? fetch.mag_filter : fetch_instr.mag_filter);
+  out_info->mip_filter = static_cast<ucode::instr_tex_filter_t>(
+      fetch_instr.mip_filter == 3 ? fetch.mip_filter : fetch_instr.mip_filter);
+  out_info->clamp_u = fetch.clamp_x;
+  out_info->clamp_v = fetch.clamp_y;
+  out_info->clamp_w = fetch.clamp_z;
+  return true;
+}
+
+}  // namespace gpu
+}  // namespace xe
diff --git a/src/xenia/gpu/sampler_info.h b/src/xenia/gpu/sampler_info.h
new file mode 100644
index 000000000..9aa764117
--- /dev/null
+++ b/src/xenia/gpu/sampler_info.h
@@ -0,0 +1,41 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved. *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_GPU_SAMPLER_INFO_H_
+#define XENIA_GPU_SAMPLER_INFO_H_
+
+#include <xenia/gpu/ucode.h>
+#include <xenia/gpu/xenos.h>
+
+namespace xe {
+namespace gpu {
+
+struct SamplerInfo {
+  ucode::instr_tex_filter_t min_filter;
+  ucode::instr_tex_filter_t mag_filter;
+  ucode::instr_tex_filter_t mip_filter;
+  uint32_t clamp_u;
+  uint32_t clamp_v;
+  uint32_t clamp_w;
+
+  static bool Prepare(const xenos::xe_gpu_texture_fetch_t& fetch,
+                      const ucode::instr_fetch_tex_t& fetch_instr,
+                      SamplerInfo* out_info);
+
+  bool operator==(const SamplerInfo& other) const {
+    return min_filter == other.min_filter && mag_filter == other.mag_filter &&
+           mip_filter == other.mip_filter && clamp_u == other.clamp_u &&
+           clamp_v == other.clamp_v && clamp_w == other.clamp_w;
+  }
+};
+
+}  // namespace gpu
+}  // namespace xe
+
+#endif  // XENIA_GPU_SAMPLER_INFO_H_
diff --git a/src/xenia/gpu/shader.cc b/src/xenia/gpu/shader.cc
index d2cb0bd5d..bbc84f0d6 100644
--- a/src/xenia/gpu/shader.cc
+++ b/src/xenia/gpu/shader.cc
@@ -172,6 +172,8 @@ void Shader::GatherVertexFetch(const instr_fetch_vtx_t* vtx) {
     return;
   }
 
+  assert_true(vtx->const_index <= 0x1F);
+
   uint32_t fetch_slot = vtx->const_index * 3 + vtx->const_index_sel;
   auto& inputs = buffer_inputs_;
   BufferDescElement* el = nullptr;
@@ -240,10 +242,12 @@ void Shader::GatherVertexFetch(const instr_fetch_vtx_t* vtx) {
 
 void Shader::GatherTextureFetch(const instr_fetch_tex_t* tex) {
   // TODO(benvanik): check dest_swiz to see if we are writing anything.
+  assert_true(tex->const_idx < 0x1F);
+
   assert_true(sampler_inputs_.count + 1 < poly::countof(sampler_inputs_.descs));
   auto& input = sampler_inputs_.descs[sampler_inputs_.count++];
   input.input_index = sampler_inputs_.count - 1;
-  input.fetch_slot = tex->const_idx & 0xF;  // ?
+  input.fetch_slot = tex->const_idx & 0xF;  // ??????????????????????????????
   input.tex_fetch = *tex;
 
   // Format mangling, size estimation, etc.
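
Side note on the filter plumbing above: SamplerInfo::Prepare treats a per-instruction filter value of 3 as "defer to the fetch constant" and lets any other value override it. The standalone sketch below only mirrors that selection expression; ResolveFilter is a hypothetical name used for illustration and does not exist in the tree.

// Hedged sketch: mirrors the `value == 3 ? fetch : instruction` selection in
// SamplerInfo::Prepare above. Not part of the diff.
#include <cassert>
#include <cstdint>

static uint32_t ResolveFilter(uint32_t instr_value, uint32_t fetch_value) {
  // 3 is the sentinel a texture fetch instruction uses to defer to the fetch
  // constant; any other value is an explicit per-instruction override.
  return instr_value == 3 ? fetch_value : instr_value;
}

int main() {
  assert(ResolveFilter(3, 1) == 1);  // instruction defers to the fetch constant
  assert(ResolveFilter(0, 1) == 0);  // explicit instruction value wins
  return 0;
}
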
diff --git a/src/xenia/gpu/sources.gypi b/src/xenia/gpu/sources.gypi
index 416884d5d..ec144c8af 100644
--- a/src/xenia/gpu/sources.gypi
+++ b/src/xenia/gpu/sources.gypi
@@ -9,8 +9,12 @@
     'register_file.cc',
     'register_file.h',
     'register_table.inc',
+    'sampler_info.cc',
+    'sampler_info.h',
     'shader.cc',
     'shader.h',
+    'texture_info.cc',
+    'texture_info.h',
     'ucode.h',
     'ucode_disassembler.cc',
     'ucode_disassembler.h',
diff --git a/src/xenia/gpu/texture_info.cc b/src/xenia/gpu/texture_info.cc
new file mode 100644
index 000000000..8b0aaecae
--- /dev/null
+++ b/src/xenia/gpu/texture_info.cc
@@ -0,0 +1,239 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved. *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include <xenia/gpu/texture_info.h>
+
+#include <poly/math.h>
+
+namespace xe {
+namespace gpu {
+
+using namespace xe::gpu::ucode;
+using namespace xe::gpu::xenos;
+
+bool TextureInfo::Prepare(const xe_gpu_texture_fetch_t& fetch,
+                          TextureInfo* out_info) {
+  // http://msdn.microsoft.com/en-us/library/windows/desktop/cc308051(v=vs.85).aspx
+  // a2xx_sq_surfaceformat
+
+  auto& info = *out_info;
+  info.swizzle = fetch.swizzle;
+
+  info.dimension = static_cast<Dimension>(fetch.dimension);
+  switch (info.dimension) {
+    case Dimension::k1D:
+      info.width = fetch.size_1d.width;
+      break;
+    case Dimension::k2D:
+      info.width = fetch.size_2d.width;
+      info.height = fetch.size_2d.height;
+      break;
+    case Dimension::k3D:
+    case Dimension::kCube:
+      info.width = fetch.size_3d.width;
+      info.height = fetch.size_3d.height;
+      info.depth = fetch.size_3d.depth;
+      break;
+  }
+  info.endianness = static_cast<Endian>(fetch.endianness);
+
+  info.block_size = 0;
+  info.texel_pitch = 0;
+  info.is_tiled = fetch.tiled;
+  info.is_compressed = false;
+  info.input_length = 0;
+  info.format = static_cast<TextureFormat>(fetch.format);
+  switch (fetch.format) {
+    case FMT_8:
+      info.block_size = 1;
+      info.texel_pitch = 1;
+      break;
+    case FMT_1_5_5_5:
+      info.block_size = 1;
+      info.texel_pitch = 2;
+      break;
+    case FMT_8_8_8_8:
+    case FMT_8_8_8_8_AS_16_16_16_16:
+      info.block_size = 1;
+      info.texel_pitch = 4;
+      break;
+    case FMT_4_4_4_4:
+      info.block_size = 1;
+      info.texel_pitch = 2;
+      break;
+    case FMT_16_16_16_16_FLOAT:
+      info.block_size = 1;
+      info.texel_pitch = 8;
+      break;
+    case FMT_32_FLOAT:
+      info.block_size = 1;
+      info.texel_pitch = 4;
+      break;
+    case FMT_DXT1:
+      info.block_size = 4;
+      info.texel_pitch = 8;
+      info.is_compressed = true;
+      break;
+    case FMT_DXT2_3:
+    case FMT_DXT4_5:
+      info.block_size = 4;
+      info.texel_pitch = 16;
+      info.is_compressed = true;
+      break;
+    case FMT_DXT1_AS_16_16_16_16:
+      // TODO(benvanik): conversion?
+      info.block_size = 4;
+      info.texel_pitch = 8;
+      info.is_compressed = true;
+      break;
+    case FMT_DXT2_3_AS_16_16_16_16:
+    case FMT_DXT4_5_AS_16_16_16_16:
+      // TODO(benvanik): conversion?
+ info.block_size = 4; + info.texel_pitch = 16; + info.is_compressed = true; + break; + case FMT_1_REVERSE: + case FMT_1: + case FMT_5_6_5: + case FMT_6_5_5: + case FMT_2_10_10_10: + case FMT_8_A: + case FMT_8_B: + case FMT_8_8: + case FMT_Cr_Y1_Cb_Y0: + case FMT_Y1_Cr_Y0_Cb: + case FMT_5_5_5_1: + case FMT_8_8_8_8_A: + case FMT_10_11_11: + case FMT_11_11_10: + case FMT_24_8: + case FMT_24_8_FLOAT: + case FMT_16: + case FMT_16_16: + case FMT_16_16_16_16: + case FMT_16_EXPAND: + case FMT_16_16_EXPAND: + case FMT_16_16_16_16_EXPAND: + case FMT_16_FLOAT: + case FMT_16_16_FLOAT: + case FMT_32: + case FMT_32_32: + case FMT_32_32_32_32: + case FMT_32_32_FLOAT: + case FMT_32_32_32_32_FLOAT: + case FMT_32_AS_8: + case FMT_32_AS_8_8: + case FMT_16_MPEG: + case FMT_16_16_MPEG: + case FMT_8_INTERLACED: + case FMT_32_AS_8_INTERLACED: + case FMT_32_AS_8_8_INTERLACED: + case FMT_16_INTERLACED: + case FMT_16_MPEG_INTERLACED: + case FMT_16_16_MPEG_INTERLACED: + case FMT_DXN: + case FMT_2_10_10_10_AS_16_16_16_16: + case FMT_10_11_11_AS_16_16_16_16: + case FMT_11_11_10_AS_16_16_16_16: + case FMT_32_32_32_FLOAT: + case FMT_DXT3A: + case FMT_DXT5A: + case FMT_CTX1: + case FMT_DXT3A_AS_1_1_1_1: + PLOGE("Unhandled texture format"); + return false; + default: + assert_unhandled_case(fetch.format); + return false; + } + + // Must be called here when we know the format. + switch (info.dimension) { + case Dimension::k1D: + info.CalculateTextureSizes1D(fetch); + break; + case Dimension::k2D: + info.CalculateTextureSizes2D(fetch); + break; + case Dimension::k3D: + // TODO(benvanik): calculate size. + return false; + case Dimension::kCube: + // TODO(benvanik): calculate size. + return false; + } + + return true; +} + +void TextureInfo::CalculateTextureSizes1D(const xe_gpu_texture_fetch_t& fetch) { + // ? + size_1d.width = fetch.size_1d.width; +} + +void TextureInfo::CalculateTextureSizes2D(const xe_gpu_texture_fetch_t& fetch) { + size_2d.logical_width = 1 + fetch.size_2d.width; + size_2d.logical_height = 1 + fetch.size_2d.height; + + size_2d.block_width = size_2d.logical_width / block_size; + size_2d.block_height = size_2d.logical_height / block_size; + + if (!is_compressed) { + // must be 32x32 but also must have a pitch that is a multiple of 256 bytes + uint32_t bytes_per_block = block_size * block_size * texel_pitch; + uint32_t width_multiple = 32; + if (bytes_per_block) { + uint32_t minimum_multiple = 256 / bytes_per_block; + if (width_multiple < minimum_multiple) { + width_multiple = minimum_multiple; + } + } + size_2d.input_width = poly::round_up(size_2d.logical_width, width_multiple); + size_2d.input_height = poly::round_up(size_2d.logical_height, 32); + size_2d.output_width = size_2d.logical_width; + size_2d.output_height = size_2d.logical_height; + } else { + // must be 128x128 + size_2d.input_width = poly::round_up(size_2d.logical_width, 128); + size_2d.input_height = poly::round_up(size_2d.logical_height, 128); + size_2d.output_width = poly::next_pow2(size_2d.logical_width); + size_2d.output_height = poly::next_pow2(size_2d.logical_height); + } + + size_2d.logical_pitch = (size_2d.logical_width / block_size) * texel_pitch; + size_2d.input_pitch = (size_2d.input_width / block_size) * texel_pitch; + + if (!is_tiled) { + input_length = size_2d.block_height * size_2d.logical_pitch; + } else { + input_length = size_2d.block_height * size_2d.logical_pitch; // ? 
+  }
+}
+
+// https://code.google.com/p/crunch/source/browse/trunk/inc/crn_decomp.h#4104
+uint32_t TextureInfo::TiledOffset2DOuter(uint32_t y, uint32_t width,
+                                         uint32_t log_bpp) {
+  uint32_t macro = ((y >> 5) * (width >> 5)) << (log_bpp + 7);
+  uint32_t micro = ((y & 6) << 2) << log_bpp;
+  return macro + ((micro & ~15) << 1) + (micro & 15) +
+         ((y & 8) << (3 + log_bpp)) + ((y & 1) << 4);
+}
+
+uint32_t TextureInfo::TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp,
+                                         uint32_t base_offset) {
+  uint32_t macro = (x >> 5) << (bpp + 7);
+  uint32_t micro = (x & 7) << bpp;
+  uint32_t offset = base_offset + (macro + ((micro & ~15) << 1) + (micro & 15));
+  return ((offset & ~511) << 3) + ((offset & 448) << 2) + (offset & 63) +
+         ((y & 16) << 7) + (((((y & 8) >> 2) + (x >> 3)) & 3) << 6);
+}
+
+}  // namespace gpu
+}  // namespace xe
diff --git a/src/xenia/gpu/texture_info.h b/src/xenia/gpu/texture_info.h
new file mode 100644
index 000000000..2cda83426
--- /dev/null
+++ b/src/xenia/gpu/texture_info.h
@@ -0,0 +1,140 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved. *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_GPU_TEXTURE_INFO_H_
+#define XENIA_GPU_TEXTURE_INFO_H_
+
+#include <xenia/gpu/ucode.h>
+#include <xenia/gpu/xenos.h>
+
+namespace xe {
+namespace gpu {
+
+// a2xx_sq_surfaceformat
+enum class TextureFormat : uint32_t {
+  k_1_REVERSE = 0,
+  k_1 = 1,
+  k_8 = 2,
+  k_1_5_5_5 = 3,
+  k_5_6_5 = 4,
+  k_6_5_5 = 5,
+  k_8_8_8_8 = 6,
+  k_2_10_10_10 = 7,
+  k_8_A = 8,
+  k_8_B = 9,
+  k_8_8 = 10,
+  k_Cr_Y1_Cb_Y0 = 11,
+  k_Y1_Cr_Y0_Cb = 12,
+  // ? hole
+  k_8_8_8_8_A = 14,
+  k_4_4_4_4 = 15,
+  k_10_11_11 = 16,
+  k_11_11_10 = 17,
+  k_DXT1 = 18,
+  k_DXT2_3 = 19,
+  k_DXT4_5 = 20,
+  // ?
hole + k_24_8 = 22, + k_24_8_FLOAT = 23, + k_16 = 24, + k_16_16 = 25, + k_16_16_16_16 = 26, + k_16_EXPAND = 27, + k_16_16_EXPAND = 28, + k_16_16_16_16_EXPAND = 29, + k_16_FLOAT = 30, + k_16_16_FLOAT = 31, + k_16_16_16_16_FLOAT = 32, + k_32 = 33, + k_32_32 = 34, + k_32_32_32_32 = 35, + k_32_FLOAT = 36, + k_32_32_FLOAT = 37, + k_32_32_32_32_FLOAT = 38, + k_32_AS_8 = 39, + k_32_AS_8_8 = 40, + k_16_MPEG = 41, + k_16_16_MPEG = 42, + k_8_INTERLACED = 43, + k_32_AS_8_INTERLACED = 44, + k_32_AS_8_8_INTERLACED = 45, + k_16_INTERLACED = 46, + k_16_MPEG_INTERLACED = 47, + k_16_16_MPEG_INTERLACED = 48, + k_DXN = 49, + k_8_8_8_8_AS_16_16_16_16 = 50, + k_DXT1_AS_16_16_16_16 = 51, + k_DXT2_3_AS_16_16_16_16 = 52, + k_DXT4_5_AS_16_16_16_16 = 53, + k_2_10_10_10_AS_16_16_16_16 = 54, + k_10_11_11_AS_16_16_16_16 = 55, + k_11_11_10_AS_16_16_16_16 = 56, + k_32_32_32_FLOAT = 57, + k_DXT3A = 58, + k_DXT5A = 59, + k_CTX1 = 60, + k_DXT3A_AS_1_1_1_1 = 61, + + kUnknown = 0xFFFFFFFFu, +}; + +struct TextureInfo { + uint32_t swizzle; + Dimension dimension; + uint32_t width; + uint32_t height; + uint32_t depth; + uint32_t block_size; + uint32_t texel_pitch; + xenos::Endian endianness; + bool is_tiled; + bool is_compressed; + uint32_t input_length; + + TextureFormat format; + + union { + struct { + uint32_t width; + } size_1d; + struct { + uint32_t logical_width; + uint32_t logical_height; + uint32_t block_width; + uint32_t block_height; + uint32_t input_width; + uint32_t input_height; + uint32_t output_width; + uint32_t output_height; + uint32_t logical_pitch; + uint32_t input_pitch; + } size_2d; + struct { + } size_3d; + struct { + } size_cube; + }; + + static bool Prepare(const xenos::xe_gpu_texture_fetch_t& fetch, + TextureInfo* out_info); + + static uint32_t TiledOffset2DOuter(uint32_t y, uint32_t width, + uint32_t log_bpp); + static uint32_t TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp, + uint32_t base_offset); + + private: + void CalculateTextureSizes1D(const xenos::xe_gpu_texture_fetch_t& fetch); + void CalculateTextureSizes2D(const xenos::xe_gpu_texture_fetch_t& fetch); +}; + +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_TEXTURE_INFO_H_ diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index f23ec50f4..e89e4ba97 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -35,6 +35,13 @@ enum class PrimitiveType : uint32_t { kQuadList = 0x0D, }; +enum class Dimension : uint32_t { + k1D = 0, + k2D = 1, + k3D = 2, + kCube = 3, +}; + namespace xenos { typedef enum {
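
For reference, the tiled addressing helpers declared above can be exercised in isolation. The sketch below copies the two formulas from texture_info.cc verbatim and walks a few blocks the same way UploadTexture2D derives its source offsets; the width and bytes-per-texel values are arbitrary sample inputs, not anything dictated by the diff.

// Hedged, standalone illustration of the 2D tiling math. The two helpers are
// copies of TextureInfo::TiledOffset2DOuter/Inner above.
#include <cstdint>
#include <cstdio>

static uint32_t TiledOffset2DOuter(uint32_t y, uint32_t width,
                                   uint32_t log_bpp) {
  uint32_t macro = ((y >> 5) * (width >> 5)) << (log_bpp + 7);
  uint32_t micro = ((y & 6) << 2) << log_bpp;
  return macro + ((micro & ~15) << 1) + (micro & 15) +
         ((y & 8) << (3 + log_bpp)) + ((y & 1) << 4);
}

static uint32_t TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp,
                                   uint32_t base_offset) {
  uint32_t macro = (x >> 5) << (bpp + 7);
  uint32_t micro = (x & 7) << bpp;
  uint32_t offset = base_offset + (macro + ((micro & ~15) << 1) + (micro & 15));
  return ((offset & ~511) << 3) + ((offset & 448) << 2) + (offset & 63) +
         ((y & 16) << 7) + (((((y & 8) >> 2) + (x >> 3)) & 3) << 6);
}

int main() {
  // Sample values: 4 bytes per texel (log_bpp == 2) and a 128-texel-wide
  // padded input surface, matching how UploadTexture2D combines the two
  // helpers (outer per row, inner per column, result shifted by log_bpp).
  const uint32_t log_bpp = 2;
  const uint32_t input_width = 128;
  for (uint32_t y = 0; y < 4; ++y) {
    uint32_t row_base = TiledOffset2DOuter(y, input_width, log_bpp);
    for (uint32_t x = 0; x < 4; ++x) {
      uint32_t texel_index =
          TiledOffset2DInner(x, y, log_bpp, row_base) >> log_bpp;
      std::printf("linear (%u,%u) -> tiled texel index %u\n", x, y,
                  texel_index);
    }
  }
  return 0;
}
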