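Textures and such: add a texture cache and per-draw sampler setup, bind per-draw state as a shader storage buffer, and guard the NVIDIA bindless paths behind extension checks.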

2014-12-30 14:10:30 -08:00 · 2014-12-30 14:10:30 -08:00 · dfc260b86e
parent 5b2672a1b8
commit dfc260b86e
23 changed files with 1598 additions and 296 deletions
--- a/src/poly/math.h
+++ b/src/poly/math.h
@ -36,7 +36,7 @@ T align(T value, T alignment) {
 // Rounds the given number up to the next highest multiple.
 template <typename T, typename V>
 T round_up(T value, V multiple) {
-  return value ? (value + multiple - 1 - (value - 1) % multiple) : multiple;
+  return value ? (((value + multiple - 1) / multiple) * multiple) : multiple;
 }

 inline float saturate(float value) {
--- a/src/xenia/gpu/gl4/circular_buffer.cc
+++ b/src/xenia/gpu/gl4/circular_buffer.cc
@ -27,29 +27,41 @@ CircularBuffer::CircularBuffer(size_t capacity)
      gpu_base_(0),
      host_base_(nullptr) {}

-CircularBuffer::~CircularBuffer() {
-  glUnmapNamedBuffer(buffer_);
-  glDeleteBuffers(1, &buffer_);
-}
+CircularBuffer::~CircularBuffer() { Shutdown(); }

 bool CircularBuffer::Initialize() {
  glCreateBuffers(1, &buffer_);
  glNamedBufferStorage(buffer_, capacity_, nullptr,
                       GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT);
  host_base_ = reinterpret_cast<uint8_t*>(glMapNamedBufferRange(
-      buffer_, 0, capacity_, GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT |
-                                 GL_MAP_UNSYNCHRONIZED_BIT |
-                                 GL_MAP_PERSISTENT_BIT));
+      buffer_, 0, capacity_,
+      GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_PERSISTENT_BIT));
  assert_not_null(host_base_);
  if (!host_base_) {
    return false;
  }
-  glMakeNamedBufferResidentNV(buffer_, GL_WRITE_ONLY);
-  glGetNamedBufferParameterui64vNV(buffer_, GL_BUFFER_GPU_ADDRESS_NV,
-                                   &gpu_base_);
+
+  if (GLEW_NV_shader_buffer_load) {
+    // To use this bindlessly we must make it resident.
+    glMakeNamedBufferResidentNV(buffer_, GL_WRITE_ONLY);
+    glGetNamedBufferParameterui64vNV(buffer_, GL_BUFFER_GPU_ADDRESS_NV,
+                                     &gpu_base_);
+  }
  return true;
 }

+// Releases the GL buffer created by Initialize(). Returns immediately when
+// buffer_ is 0, so it may be called repeatedly (the destructor calls it too).
+void CircularBuffer::Shutdown() {
+  if (!buffer_) {
+    return;
+  }
+  // host_base_ is only non-null once Initialize() has mapped the buffer (and
+  // residency is only requested after that), so skip the teardown calls that
+  // would otherwise just raise GL errors after a failed Initialize().
+  if (host_base_) {
+    glUnmapNamedBuffer(buffer_);
+    if (GLEW_NV_shader_buffer_load) {
+      glMakeNamedBufferNonResidentNV(buffer_);
+    }
+  }
+  glDeleteBuffers(1, &buffer_);
+  buffer_ = 0;
+  // The mapping and GPU address are meaningless once the buffer is deleted;
+  // clear them so stale values cannot leak into a later allocation.
+  host_base_ = nullptr;
+  gpu_base_ = 0;
+}
+
 CircularBuffer::Allocation CircularBuffer::Acquire(size_t length) {
  // Addresses must always be % 256.
  length = poly::round_up(length, 256);
@ -64,6 +76,7 @@ CircularBuffer::Allocation CircularBuffer::Acquire(size_t length) {
  Allocation allocation;
  allocation.host_ptr = host_base_ + write_head_;
  allocation.gpu_ptr = gpu_base_ + write_head_;
+  allocation.offset = write_head_;
  allocation.length = length;
  write_head_ += length;
  return allocation;
--- a/src/xenia/gpu/gl4/circular_buffer.h
+++ b/src/xenia/gpu/gl4/circular_buffer.h
@ -26,10 +26,12 @@ class CircularBuffer {
  struct Allocation {
    void* host_ptr;
    GLuint64 gpu_ptr;
+    size_t offset;
    size_t length;
  };

  bool Initialize();
+  void Shutdown();

  GLuint handle() const { return buffer_; }

--- a/src/xenia/gpu/gl4/command_processor.cc
+++ b/src/xenia/gpu/gl4/command_processor.cc
@ -16,6 +16,8 @@
 #include <xenia/gpu/gl4/gl4_gpu-private.h>
 #include <xenia/gpu/gl4/gl4_graphics_system.h>
 #include <xenia/gpu/gpu-private.h>
+#include <xenia/gpu/sampler_info.h>
+#include <xenia/gpu/texture_info.h>
 #include <xenia/gpu/xenos.h>

 #include <third_party/xxhash/xxhash.h>
@ -36,7 +38,7 @@ const GLuint kAnyTarget = UINT_MAX;
 // All uncached vertex/index data goes here. If it fills up we need to sync
 // with the GPU, so this should be large enough to prevent that in a normal
 // frame.
-const size_t kScratchBufferCapacity = 64 * 1024 * 1024;
+const size_t kScratchBufferCapacity = 256 * 1024 * 1024;

 CommandProcessor::CachedPipeline::CachedPipeline() = default;

@ -61,6 +63,7 @@ CommandProcessor::CommandProcessor(GL4GraphicsSystem* graphics_system)
      write_ptr_index_(0),
      bin_select_(0xFFFFFFFFull),
      bin_mask_(0xFFFFFFFFull),
+      has_bindless_vbos_(false),
      active_vertex_shader_(nullptr),
      active_pixel_shader_(nullptr),
      active_framebuffer_(nullptr),
@ -152,29 +155,34 @@ void CommandProcessor::WorkerMain() {
 }

 bool CommandProcessor::SetupGL() {
-  // Uniform buffer that stores the per-draw state (constants, etc).
-  glCreateBuffers(1, &uniform_data_buffer_);
-  glBindBuffer(GL_UNIFORM_BUFFER, uniform_data_buffer_);
-  glNamedBufferStorage(uniform_data_buffer_, 16 * 1024, nullptr,
-                       GL_MAP_WRITE_BIT | GL_DYNAMIC_STORAGE_BIT);
-
  // Circular buffer holding scratch vertex/index data.
  if (!scratch_buffer_.Initialize()) {
    PLOGE("Unable to initialize scratch buffer");
    return false;
  }

+  // Texture cache that keeps track of any textures/samplers used.
+  if (!texture_cache_.Initialize(&scratch_buffer_)) {
+    PLOGE("Unable to initialize texture cache");
+    return false;
+  }
+
  GLuint vertex_array;
  glGenVertexArrays(1, &vertex_array);
  glBindVertexArray(vertex_array);
-  glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
-  glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
+
+  if (GLEW_NV_vertex_buffer_unified_memory) {
+    has_bindless_vbos_ = true;
+    glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
+    glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
+  }

  return true;
 }

 void CommandProcessor::ShutdownGL() {
-  glDeleteBuffers(1, &uniform_data_buffer_);
+  texture_cache_.Shutdown();
+  scratch_buffer_.Shutdown();
 }

 void CommandProcessor::InitializeRingBuffer(uint32_t ptr, uint32_t page_count) {
@ -264,6 +272,7 @@ void CommandProcessor::PrepareForWait() {
  // make interrupt callbacks from the GPU so that we don't have to do a full
  // synchronize here.
  glFlush();
+  glFinish();

  if (FLAGS_thread_safe_gl) {
    context_->ClearCurrent();
@ -1142,6 +1151,8 @@ void CommandProcessor::PrepareDraw(DrawCommand* draw_command) {
  // Generic stuff.
  cmd.start_index = regs[XE_GPU_REG_VGT_INDX_OFFSET].u32;
  cmd.base_vertex = 0;
+
+  cmd.state_data = nullptr;
 }

 bool CommandProcessor::IssueDraw(DrawCommand* draw_command) {
@ -1158,6 +1169,18 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) {
    return IssueCopy(draw_command);
  }

+  // TODO(benvanik): actually cache things >_>
+  texture_cache_.Clear();
+
+  // Allocate a state data block.
+  // Everything the shaders access lives here.
+  auto allocation = scratch_buffer_.Acquire(sizeof(UniformDataBlock));
+  cmd.state_data = reinterpret_cast<UniformDataBlock*>(allocation.host_ptr);
+  if (!cmd.state_data) {
+    PLOGE("Unable to allocate uniform data buffer");
+    return false;
+  }
+
  if (!UpdateRenderTargets(draw_command)) {
    PLOGE("Unable to setup render targets");
    return false;
@ -1172,17 +1195,15 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) {
    PLOGE("Unable to setup render state");
    return false;
  }
-
+  if (!UpdateConstants(draw_command)) {
+    PLOGE("Unable to update shader constants");
+    return false;
+  }
  if (!UpdateShaders(draw_command)) {
    PLOGE("Unable to prepare draw shaders");
    return false;
  }

-  // if (!PopulateSamplers(draw_command)) {
-  //  XELOGE("Unable to prepare draw samplers");
-  //  return false;
-  //}
-
  if (!PopulateIndexBuffer(draw_command)) {
    PLOGE("Unable to setup index buffer");
    return false;
@ -1191,6 +1212,10 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) {
    PLOGE("Unable to setup vertex buffers");
    return false;
  }
+  if (!PopulateSamplers(draw_command)) {
+    PLOGE("Unable to prepare draw samplers");
+    return false;
+  }

  GLenum prim_type = 0;
  switch (cmd.prim_type) {
@ -1228,6 +1253,7 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) {
      break;
    case PrimitiveType::kQuadList:
      prim_type = GL_LINES_ADJACENCY;
+      return false;
      /*if
      (vs->DemandGeometryShader(D3D11VertexShaderResource::QUAD_LIST_SHADER,
                                   &geometry_shader)) {
@ -1237,10 +1263,15 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) {
    default:
    case PrimitiveType::kUnknown0x07:
      prim_type = GL_POINTS;
-      XELOGE("D3D11: unsupported primitive type %d", cmd.prim_type);
+      XELOGE("unsupported primitive type %d", cmd.prim_type);
      break;
  }

+  // Commit the state buffer - nothing can change after this.
+  glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, scratch_buffer_.handle(),
+                    allocation.offset, allocation.length);
+  scratch_buffer_.Commit(std::move(allocation));
+
  // HACK HACK HACK
  glDisable(GL_DEPTH_TEST);

@ -1254,13 +1285,108 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) {
        prim_type, cmd.index_count,
        cmd.index_buffer.format == IndexFormat::kInt32 ? GL_UNSIGNED_INT
                                                       : GL_UNSIGNED_SHORT,
-        reinterpret_cast<void*>(cmd.start_index * element_size),
+        reinterpret_cast<void*>(cmd.index_buffer.buffer_offset +
+                                cmd.start_index * element_size),
        cmd.base_vertex);
  } else {
    // Auto draw.
    glDrawArrays(prim_type, cmd.start_index, cmd.index_count);
  }

+  // Hacky draw counter.
+  if (false) {
+    static int draw_count = 0;
+    glEnable(GL_SCISSOR_TEST);
+    glScissor(20, 0, 20, 20);
+    float red[] = {0, draw_count / 100.0f, 0, 1.0f};
+    draw_count = (draw_count + 1) % 100;
+    glClearNamedFramebufferfv(active_framebuffer_->framebuffer, GL_COLOR, 0,
+                              red);
+    glDisable(GL_SCISSOR_TEST);
+  }
+
+  return true;
+}
+
+// Resolves the color and depth render targets referenced by the current
+// register state, gets/creates a framebuffer holding them, and binds it as the
+// GL draw framebuffer.
+// Returns true on success, including the case where no framebuffer is needed
+// (active_framebuffer_ is then null).
+bool CommandProcessor::UpdateRenderTargets(DrawCommand* draw_command) {
+  auto& regs = *register_file_;
+
+  auto enable_mode =
+      static_cast<ModeControl>(regs[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7);
+
+  // RB_SURFACE_INFO
+  // http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html
+  uint32_t surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32;
+  uint32_t surface_pitch = surface_info & 0x3FFF;
+  auto surface_msaa = static_cast<MsaaSamples>((surface_info >> 16) & 0x3);
+
+  // Get/create all color render targets, if we are using them.
+  // In depth-only mode we don't need them.
+  GLenum draw_buffers[4] = {GL_NONE, GL_NONE, GL_NONE, GL_NONE};
+  GLuint color_targets[4] = {kAnyTarget, kAnyTarget, kAnyTarget, kAnyTarget};
+  if (enable_mode == ModeControl::kColorDepth) {
+    uint32_t color_info[4] = {
+        regs[XE_GPU_REG_RB_COLOR_INFO].u32, regs[XE_GPU_REG_RB_COLOR1_INFO].u32,
+        regs[XE_GPU_REG_RB_COLOR2_INFO].u32,
+        regs[XE_GPU_REG_RB_COLOR3_INFO].u32,
+    };
+    // A2XX_RB_COLOR_MASK_WRITE_* == D3DRS_COLORWRITEENABLE
+    // Each target owns one 4-bit write mask nibble.
+    uint32_t color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32;
+    for (int n = 0; n < poly::countof(color_info); n++) {
+      uint32_t write_mask = (color_mask >> (n * 4)) & 0xF;
+      if (!write_mask) {
+        // Unused, so keep disabled and set to wildcard so we'll take any
+        // framebuffer that has it.
+        continue;
+      }
+      uint32_t color_base = color_info[n] & 0xFFF;
+      auto color_format =
+          static_cast<ColorRenderTargetFormat>((color_info[n] >> 16) & 0xF);
+      color_targets[n] = GetColorRenderTarget(surface_pitch, surface_msaa,
+                                              color_base, color_format);
+      draw_buffers[n] = GL_COLOR_ATTACHMENT0 + n;
+      glColorMaski(n, !!(write_mask & 0x1), !!(write_mask & 0x2),
+                   !!(write_mask & 0x4), !!(write_mask & 0x8));
+    }
+  }
+
+  // Get/create depth buffer, but only if we are going to use it.
+  uint32_t depth_control = regs[XE_GPU_REG_RB_DEPTHCONTROL].u32;
+  uint32_t stencil_ref_mask = regs[XE_GPU_REG_RB_STENCILREFMASK].u32;
+  bool uses_depth =
+      (depth_control & 0x00000002) || (depth_control & 0x00000004);
+  uint32_t stencil_write_mask = (stencil_ref_mask & 0x00FF0000) >> 16;
+  bool uses_stencil = (depth_control & 0x00000001) || (stencil_write_mask != 0);
+  GLuint depth_target = kAnyTarget;
+  // The depth/stencil surface is required as soon as either depth or stencil
+  // is in use; requiring both left depth-only (and stencil-only) draws
+  // without a depth target.
+  if (uses_depth || uses_stencil) {
+    uint32_t depth_info = regs[XE_GPU_REG_RB_DEPTH_INFO].u32;
+    uint32_t depth_base = depth_info & 0xFFF;
+    auto depth_format =
+        static_cast<DepthRenderTargetFormat>((depth_info >> 16) & 0x1);
+    depth_target = GetDepthRenderTarget(surface_pitch, surface_msaa, depth_base,
+                                        depth_format);
+    // TODO(benvanik): when a game switches does it expect to keep the same
+    //     depth buffer contents?
+  }
+
+  // Get/create a framebuffer with the required targets.
+  // Note that none may be returned if we really don't need one.
+  auto cached_framebuffer = GetFramebuffer(color_targets, depth_target);
+  active_framebuffer_ = cached_framebuffer;
+  if (!active_framebuffer_) {
+    // Nothing to do.
+    return true;
+  }
+
+  // Setup just the targets we want.
+  glNamedFramebufferDrawBuffers(cached_framebuffer->framebuffer, 4,
+                                draw_buffers);
+
+  // Make active.
+  // TODO(benvanik): can we do this all named?
+  // TODO(benvanik): do we want this on READ too?
+  glBindFramebuffer(GL_DRAW_FRAMEBUFFER, cached_framebuffer->framebuffer);
+
+  return true;
+}

@ -1272,57 +1398,24 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {

  auto& regs = *register_file_;

-  union float4 {
-    float v[4];
-    struct {
-      float x, y, z, w;
-    };
-  };
-  struct UniformDataBlock {
-    float4 window_offset;    // tx,ty,rt_w,rt_h
-    float4 window_scissor;   // x0,y0,x1,y1
-    float4 viewport_offset;  // tx,ty,tz,?
-    float4 viewport_scale;   // sx,sy,sz,?
-    // TODO(benvanik): vertex format xyzw?
-
-    float4 alpha_test;  // alpha test enable, func, ref, ?
-
-    // Register data from 0x4000 to 0x4927.
-    // SHADER_CONSTANT_000_X...
-    float4 float_consts[512];
-    // SHADER_CONSTANT_FETCH_00_0...
-    uint32_t fetch_consts[32 * 6];
-    // SHADER_CONSTANT_BOOL_000_031...
-    int32_t bool_consts[8];
-    // SHADER_CONSTANT_LOOP_00...
-    int32_t loop_consts[32];
-  };
-  static_assert(sizeof(UniformDataBlock) <= 16 * 1024,
-                "Need <=16k uniform data");
-
-  auto allocation = scratch_buffer_.Acquire(16 * 1024);
-  auto buffer_ptr = reinterpret_cast<UniformDataBlock*>(allocation.host_ptr);
-  if (!buffer_ptr) {
-    PLOGE("Unable to allocate uniform data buffer");
-    return false;
-  }
+  auto state_data = draw_command->state_data;

  // Window parameters.
  // See r200UpdateWindow:
  // https://github.com/freedreno/mesa/blob/master/src/mesa/drivers/dri/r200/r200_state.c
  uint32_t window_offset = regs[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32;
-  buffer_ptr->window_offset.x = float(window_offset & 0x7FFF);
-  buffer_ptr->window_offset.y = float((window_offset >> 16) & 0x7FFF);
+  state_data->window_offset.x = float(window_offset & 0x7FFF);
+  state_data->window_offset.y = float((window_offset >> 16) & 0x7FFF);
  uint32_t window_scissor_tl = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32;
  uint32_t window_scissor_br = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32;
-  buffer_ptr->window_scissor.x = float(window_scissor_tl & 0x7FFF);
-  buffer_ptr->window_scissor.y = float((window_scissor_tl >> 16) & 0x7FFF);
-  buffer_ptr->window_scissor.z = float(window_scissor_br & 0x7FFF);
-  buffer_ptr->window_scissor.w = float((window_scissor_br >> 16) & 0x7FFF);
+  state_data->window_scissor.x = float(window_scissor_tl & 0x7FFF);
+  state_data->window_scissor.y = float((window_scissor_tl >> 16) & 0x7FFF);
+  state_data->window_scissor.z = float(window_scissor_br & 0x7FFF);
+  state_data->window_scissor.w = float((window_scissor_br >> 16) & 0x7FFF);

  // HACK: no clue where to get these values.
-  buffer_ptr->window_offset.z = 1280;
-  buffer_ptr->window_offset.w = 720;
+  state_data->window_offset.z = 1280;
+  state_data->window_offset.w = 720;

  // Whether each of the viewport settings is enabled.
  // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
@ -1338,20 +1431,20 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
              vport_yoffset_enable == vport_zoffset_enable);

  // Viewport scaling. Only enabled if the flags are all set.
-  buffer_ptr->viewport_scale.x =
+  state_data->viewport_scale.x =
      vport_xscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 : 1;  // 640
-  buffer_ptr->viewport_offset.x = vport_xoffset_enable
+  state_data->viewport_offset.x = vport_xoffset_enable
                                      ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32
                                      : 0;  // 640
-  buffer_ptr->viewport_scale.y = vport_yscale_enable
+  state_data->viewport_scale.y = vport_yscale_enable
                                     ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32
                                     : 1;  // -360
-  buffer_ptr->viewport_offset.y = vport_yoffset_enable
+  state_data->viewport_offset.y = vport_yoffset_enable
                                      ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
                                      : 0;  // 360
-  buffer_ptr->viewport_scale.z =
+  state_data->viewport_scale.z =
      vport_zscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 : 1;  // 1
-  buffer_ptr->viewport_offset.z =
+  state_data->viewport_offset.z =
      vport_zoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 : 0;  // 0
  // VTX_XY_FMT = true: the incoming X, Y have already been multiplied by 1/W0.
  //            = false: multiply the X, Y coordinates by 1/W0.
@ -1365,15 +1458,6 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
  // TODO(benvanik): pass to shaders? disable transform? etc?
  glViewport(0, 0, 1280, 720);

-  // Copy over all constants.
-  // TODO(benvanik): partial updates, etc. We could use shader constant access
-  // knowledge that we get at compile time to only upload those constants
-  // required.
-  std::memcpy(
-      &buffer_ptr->float_consts, &regs[XE_GPU_REG_SHADER_CONSTANT_000_X].f32,
-      sizeof(buffer_ptr->float_consts) + sizeof(buffer_ptr->fetch_consts) +
-          sizeof(buffer_ptr->loop_consts) + sizeof(buffer_ptr->bool_consts));
-
  // Scissoring.
  int32_t screen_scissor_tl = regs[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL].u32;
  int32_t screen_scissor_br = regs[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR].u32;
@ -1424,10 +1508,10 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
  // Deprecated in GL, implemented in shader.
  // if(ALPHATESTENABLE && frag_out.a [<=/ALPHAFUNC] ALPHAREF) discard;
  uint32_t color_control = regs[XE_GPU_REG_RB_COLORCONTROL].u32;
-  buffer_ptr->alpha_test.x =
+  state_data->alpha_test.x =
      (color_control & 0x4) ? 1.0f : 0.0f;                // ALPAHTESTENABLE
-  buffer_ptr->alpha_test.y = float(color_control & 0x3);  // ALPHAFUNC
-  buffer_ptr->alpha_test.z = regs[XE_GPU_REG_RB_ALPHA_REF].f32;
+  state_data->alpha_test.y = float(color_control & 0x3);  // ALPHAFUNC
+  state_data->alpha_test.z = regs[XE_GPU_REG_RB_ALPHA_REF].f32;

  static const GLenum blend_map[] = {
      /*  0 */ GL_ZERO,
@ -1575,91 +1659,23 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
                stencil_op_map[(depth_control & 0x0001C000) >> 14]);
  }

-  // Stash - program setup will bind this to uniforms.
-  draw_command->state_data_gpu_ptr = allocation.gpu_ptr;
-  scratch_buffer_.Commit(std::move(allocation));
-
  return true;
 }

-bool CommandProcessor::UpdateRenderTargets(DrawCommand* draw_command) {
+bool CommandProcessor::UpdateConstants(DrawCommand* draw_command) {
  auto& regs = *register_file_;
+  auto state_data = draw_command->state_data;

-  auto enable_mode =
-      static_cast<ModeControl>(regs[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7);
+  // TODO(benvanik): partial updates, etc. We could use shader constant access
+  // knowledge that we get at compile time to only upload those constants
+  // required. If we did this as a variable length then we could really cut
+  // down on state block sizes.

-  // RB_SURFACE_INFO
-  // http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html
-  uint32_t surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32;
-  uint32_t surface_pitch = surface_info & 0x3FFF;
-  auto surface_msaa = static_cast<MsaaSamples>((surface_info >> 16) & 0x3);
-
-  // Get/create all color render targets, if we are using them.
-  // In depth-only mode we don't need them.
-  GLenum draw_buffers[4] = {GL_NONE, GL_NONE, GL_NONE, GL_NONE};
-  GLuint color_targets[4] = {kAnyTarget, kAnyTarget, kAnyTarget, kAnyTarget};
-  if (enable_mode == ModeControl::kColorDepth) {
-    uint32_t color_info[4] = {
-        regs[XE_GPU_REG_RB_COLOR_INFO].u32, regs[XE_GPU_REG_RB_COLOR1_INFO].u32,
-        regs[XE_GPU_REG_RB_COLOR2_INFO].u32,
-        regs[XE_GPU_REG_RB_COLOR3_INFO].u32,
-    };
-    // A2XX_RB_COLOR_MASK_WRITE_* == D3DRS_COLORWRITEENABLE
-    uint32_t color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32;
-    for (int n = 0; n < poly::countof(color_info); n++) {
-      uint32_t write_mask = (color_mask >> (n * 4)) & 0xF;
-      if (!write_mask) {
-        // Unused, so keep disabled and set to wildcard so we'll take any
-        // framebuffer that has it.
-        continue;
-      }
-      uint32_t color_base = color_info[n] & 0xFFF;
-      auto color_format =
-          static_cast<ColorRenderTargetFormat>((color_info[n] >> 16) & 0xF);
-      color_targets[n] = GetColorRenderTarget(surface_pitch, surface_msaa,
-                                              color_base, color_format);
-      draw_buffers[n] = GL_COLOR_ATTACHMENT0 + n;
-      glColorMaski(n, !!(write_mask & 0x1), !!(write_mask & 0x2),
-                   !!(write_mask & 0x4), !!(write_mask & 0x8));
-    }
-  }
-
-  // Get/create depth buffer, but only if we are going to use it.
-  uint32_t depth_control = regs[XE_GPU_REG_RB_DEPTHCONTROL].u32;
-  uint32_t stencil_ref_mask = regs[XE_GPU_REG_RB_STENCILREFMASK].u32;
-  bool uses_depth =
-      (depth_control & 0x00000002) || (depth_control & 0x00000004);
-  uint32_t stencil_write_mask = (stencil_ref_mask & 0x00FF0000) >> 16;
-  bool uses_stencil = (depth_control & 0x00000001) || (stencil_write_mask != 0);
-  GLuint depth_target = kAnyTarget;
-  if (uses_depth && uses_stencil) {
-    uint32_t depth_info = regs[XE_GPU_REG_RB_DEPTH_INFO].u32;
-    uint32_t depth_base = depth_info & 0xFFF;
-    auto depth_format =
-        static_cast<DepthRenderTargetFormat>((depth_info >> 16) & 0x1);
-    depth_target = GetDepthRenderTarget(surface_pitch, surface_msaa, depth_base,
-                                        depth_format);
-    // TODO(benvanik): when a game switches does it expect to keep the same
-    //     depth buffer contents?
-  }
-
-  // Get/create a framebuffer with the required targets.
-  // Note that none may be returned if we really don't need one.
-  auto cached_framebuffer = GetFramebuffer(color_targets, depth_target);
-  active_framebuffer_ = cached_framebuffer;
-  if (!active_framebuffer_) {
-    // Nothing to do.
-    return true;
-  }
-
-  // Setup just the targets we want.
-  glNamedFramebufferDrawBuffers(cached_framebuffer->framebuffer, 4,
-                                draw_buffers);
-
-  // Make active.
-  // TODO(benvanik): can we do this all named?
-  // TODO(benvanik): do we want this on READ too?
-  glBindFramebuffer(GL_DRAW_FRAMEBUFFER, cached_framebuffer->framebuffer);
+  // Copy over all constants.
+  std::memcpy(
+      &state_data->float_consts, &regs[XE_GPU_REG_SHADER_CONSTANT_000_X].f32,
+      sizeof(state_data->float_consts) + sizeof(state_data->fetch_consts) +
+          sizeof(state_data->loop_consts) + sizeof(state_data->bool_consts));

  return true;
 }
@ -1718,28 +1734,10 @@ bool CommandProcessor::UpdateShaders(DrawCommand* draw_command) {
    glUseProgramStages(pipeline, GL_GEOMETRY_SHADER_BIT, geometry_program);
    glUseProgramStages(pipeline, GL_FRAGMENT_SHADER_BIT, fragment_program);

-    // HACK: layout(location=0) on a bindless uniform crashes nvidia driver.
-    GLint vertex_state_loc = glGetUniformLocation(vertex_program, "state");
-    assert_true(vertex_state_loc == 0);
-    GLint geometry_state_loc =
-        geometry_program ? glGetUniformLocation(geometry_program, "state") : -1;
-    assert_true(geometry_state_loc == -1 || geometry_state_loc == 0);
-    GLint fragment_state_loc = glGetUniformLocation(fragment_program, "state");
-    assert_true(fragment_state_loc == -1 || fragment_state_loc == 0);
-
    cached_pipeline->handles.default_pipeline = pipeline;
  }

-  // TODO(benvanik): do we need to do this for all stages if the locations
-  // match?
-  glProgramUniformHandleui64ARB(vertex_program, 0, cmd.state_data_gpu_ptr);
-  /*if (geometry_program && geometry_state_loc != -1) {
-    glProgramUniformHandleui64ARB(geometry_program, 0, cmd.state_data_gpu_ptr);
-  }*/
-  /*if (fragment_state_loc != -1) {
-    glProgramUniformHandleui64ARB(fragment_program, 0,
-                                  cmd.state_data_gpu_ptr);
-  }*/
+  // NOTE: we don't yet have our state data pointer - that comes at the end.

  glBindProgramPipeline(cached_pipeline->handles.default_pipeline);

@ -1759,10 +1757,10 @@ bool CommandProcessor::PopulateIndexBuffer(DrawCommand* draw_command) {
  assert_true(info.endianness == Endian::k8in16 ||
              info.endianness == Endian::k8in32);

-  auto allocation = scratch_buffer_.Acquire(cmd.index_count *
-                                            (info.format == IndexFormat::kInt32
-                                                 ? sizeof(uint32_t)
-                                                 : sizeof(uint16_t)));
+  size_t total_size =
+      cmd.index_count * (info.format == IndexFormat::kInt32 ? sizeof(uint32_t)
+                                                            : sizeof(uint16_t));
+  auto allocation = scratch_buffer_.Acquire(total_size);

  if (info.format == IndexFormat::kInt32) {
    poly::copy_and_swap_32_aligned(
@ -1776,9 +1774,14 @@ bool CommandProcessor::PopulateIndexBuffer(DrawCommand* draw_command) {
        cmd.index_count);
  }

-  glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, allocation.gpu_ptr,
-                         allocation.length);
-
+  if (has_bindless_vbos_) {
+    glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, allocation.gpu_ptr,
+                           allocation.length);
+  } else {
+    // Offset is used in glDrawElements.
+    cmd.index_buffer.buffer_offset = allocation.offset;
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, scratch_buffer_.handle());
+  }
  scratch_buffer_.Commit(std::move(allocation));

  return true;
@ -1792,7 +1795,8 @@ bool CommandProcessor::PopulateVertexBuffers(DrawCommand* draw_command) {

  const auto& buffer_inputs = active_vertex_shader_->buffer_inputs();

-  for (size_t n = 0; n < buffer_inputs.count; n++) {
+  uint32_t el_index = 0;
+  for (uint32_t n = 0; n < buffer_inputs.count; n++) {
    const auto& desc = buffer_inputs.descs[n];

    int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (desc.fetch_slot / 3) * 6;
@ -1826,7 +1830,11 @@ bool CommandProcessor::PopulateVertexBuffers(DrawCommand* draw_command) {
        reinterpret_cast<const uint32_t*>(membase_ + (fetch->address << 2)),
        fetch->size);

-    uint32_t el_index = 0;
+    if (!has_bindless_vbos_) {
+      glBindVertexBuffer(n, scratch_buffer_.handle(), allocation.offset,
+                         desc.stride_words * 4);
+    }
+
    for (uint32_t i = 0; i < desc.element_count; ++i) {
      const auto& el = desc.elements[i];
      auto comp_count = GetVertexFormatComponentCount(el.format);
@ -1882,13 +1890,19 @@ bool CommandProcessor::PopulateVertexBuffers(DrawCommand* draw_command) {
          assert_unhandled_case(el.format);
          break;
      }
-      size_t offset = el.offset_words * sizeof(uint32_t);
      glEnableVertexAttribArray(el_index);
-      glVertexAttribFormatNV(el_index, comp_count, comp_type, el.is_normalized,
-                             desc.stride_words * sizeof(uint32_t));
-      glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, el_index,
-                             allocation.gpu_ptr + offset,
-                             allocation.length - offset);
+      if (has_bindless_vbos_) {
+        glVertexAttribFormatNV(el_index, comp_count, comp_type,
+                               el.is_normalized,
+                               desc.stride_words * sizeof(uint32_t));
+        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, el_index,
+                               allocation.gpu_ptr + (el.offset_words * 4),
+                               allocation.length - (el.offset_words * 4));
+      } else {
+        glVertexAttribBinding(el_index, n);
+        glVertexAttribFormat(el_index, comp_count, comp_type, el.is_normalized,
+                             el.offset_words * 4);
+      }
      ++el_index;
    }

@ -1899,6 +1913,82 @@ bool CommandProcessor::PopulateVertexBuffers(DrawCommand* draw_command) {
  return true;
 }

+// Sets up every texture sampler referenced by the active vertex and pixel
+// shaders for this draw. Returns false as soon as one sampler fails.
+bool CommandProcessor::PopulateSamplers(DrawCommand* draw_command) {
+  SCOPE_profile_cpu_f("gpu");
+
+  // VS and PS samplers are shared, but may be used exclusively.
+  // We walk each and setup lazily: a fetch slot is populated at most once per
+  // draw no matter how many shader inputs reference it.
+  bool has_setup_sampler[32] = {false};
+  auto setup_once = [this, draw_command, &has_setup_sampler](
+                        const Shader::SamplerDesc& desc) -> bool {
+    if (has_setup_sampler[desc.fetch_slot]) {
+      return true;
+    }
+    has_setup_sampler[desc.fetch_slot] = true;
+    return PopulateSampler(draw_command, desc);
+  };
+
+  // Vertex texture samplers.
+  const auto& vertex_sampler_inputs = active_vertex_shader_->sampler_inputs();
+  for (size_t i = 0; i < vertex_sampler_inputs.count; ++i) {
+    if (!setup_once(vertex_sampler_inputs.descs[i])) {
+      return false;
+    }
+  }
+
+  // Pixel shader texture samplers.
+  const auto& pixel_sampler_inputs = active_pixel_shader_->sampler_inputs();
+  for (size_t i = 0; i < pixel_sampler_inputs.count; ++i) {
+    if (!setup_once(pixel_sampler_inputs.descs[i])) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Prepares the texture and sampler for one shader sampler input and stores
+// the resulting handle in the draw's state block, indexed by fetch slot.
+// Returns false if the fetch constant cannot be parsed or the texture cache
+// cannot provide an entry.
+bool CommandProcessor::PopulateSampler(DrawCommand* draw_command,
+                                       const Shader::SamplerDesc& desc) {
+  // Each fetch slot occupies 6 registers starting at FETCH_00_0.
+  const int fetch_reg =
+      XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + desc.fetch_slot * 6;
+  auto fetch_group = reinterpret_cast<const xe_gpu_fetch_group_t*>(
+      &register_file_->values[fetch_reg]);
+  auto& fetch = fetch_group->texture_fetch;
+
+  // NOTE(review): presumably 0x2 marks a texture fetch constant - confirm.
+  assert_true(fetch.type == 0x2);
+
+  TextureInfo tex_info;
+  if (!TextureInfo::Prepare(fetch, &tex_info)) {
+    XELOGE("Unable to parse texture fetcher info");
+    return false;  // invalid texture used
+  }
+
+  SamplerInfo smp_info;
+  if (!SamplerInfo::Prepare(fetch, desc.tex_fetch, &smp_info)) {
+    XELOGE("Unable to parse sampler info");
+    return false;  // invalid sampler used
+  }
+
+  // Translate the guest address from the fetch constant into a host pointer.
+  uint32_t guest_base = fetch.address << 12;
+  void* host_base = membase_ + guest_base;
+
+  auto view = texture_cache_.Demand(host_base, tex_info.input_length, tex_info,
+                                    smp_info);
+  if (!view) {
+    // Unable to create/fetch/etc.
+    XELOGE("Failed to demand texture");
+    return false;
+  }
+
+  // Shaders will use bindless to fetch right from it.
+  draw_command->state_data->texture_samplers[desc.fetch_slot] =
+      view->texture_sampler_handle;
+
+  return true;
+}
+
 bool CommandProcessor::IssueCopy(DrawCommand* draw_command) {
  auto& regs = *register_file_;

@ -2045,7 +2135,7 @@ bool CommandProcessor::IssueCopy(DrawCommand* draw_command) {
    case CopyCommand::kConstantOne:
    case CopyCommand::kNull:
    default:
-      assert_unhandled_case(copy_command);
+      // assert_unhandled_case(copy_command);
      return false;
  }
  glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
--- a/src/xenia/gpu/gl4/command_processor.h
+++ b/src/xenia/gpu/gl4/command_processor.h
@ -20,6 +20,7 @@
 #include <xenia/gpu/gl4/circular_buffer.h>
 #include <xenia/gpu/gl4/gl_context.h>
 #include <xenia/gpu/gl4/gl4_shader.h>
+#include <xenia/gpu/gl4/texture_cache.h>
 #include <xenia/gpu/register_file.h>
 #include <xenia/gpu/xenos.h>
 #include <xenia/memory.h>
@ -40,6 +41,39 @@ struct SwapParameters {
  GLenum attachment;
 };

+// CPU-side image of the per-draw state block read by the generated shaders.
+// This must match the layout in gl4_shader.cc.
+// NOTE(review): the GLSL side declares texture_samplers as uvec2[32] and
+// fetch_consts as uint[32 * 6]. Under std140 rules array elements are padded
+// to 16 bytes, which would not match the 8-/4-byte strides of the C++ arrays
+// below - confirm which block layout is actually in effect for the buffer.
+struct UniformDataBlock {
+  // Four floats addressable either as an array or by component name.
+  union float4 {
+    float v[4];
+    struct {
+      float x, y, z, w;
+    };
+  };
+
+  float4 window_offset;    // tx,ty,rt_w,rt_h
+  float4 window_scissor;   // x0,y0,x1,y1
+  float4 viewport_offset;  // tx,ty,tz,?
+  float4 viewport_scale;   // sx,sy,sz,?
+  // TODO(benvanik): vertex format xyzw?
+
+  float4 alpha_test;  // alpha test enable, func, ref, ?
+
+  // Texture/sampler handles, indexed by fetch slot (written by
+  // CommandProcessor::PopulateSampler).
+  // TODO(benvanik): overlay with fetch_consts below?
+  uint64_t texture_samplers[32];
+
+  // Register data from 0x4000 to 0x4927.
+  // SHADER_CONSTANT_000_X...
+  float4 float_consts[512];
+  // SHADER_CONSTANT_FETCH_00_0...
+  uint32_t fetch_consts[32 * 6];
+  // SHADER_CONSTANT_BOOL_000_031...
+  int32_t bool_consts[8];
+  // SHADER_CONSTANT_LOOP_00...
+  int32_t loop_consts[32];
+};
+// Compile-time guard that the whole block stays within 16KB.
+static_assert(sizeof(UniformDataBlock) <= 16 * 1024,
+              "Need <=16k uniform data");
+
 // TODO(benvanik): move more of the enums in here?
 struct DrawCommand {
  PrimitiveType prim_type;
@ -54,6 +88,7 @@ struct DrawCommand {
    size_t size;
    xenos::Endian endianness;
    xenos::IndexFormat format;
+    size_t buffer_offset;
  } index_buffer;

  // Texture samplers.
@ -63,11 +98,9 @@ struct DrawCommand {
    // SamplerStateResource* sampler_state;
  };
  SamplerInput vertex_shader_samplers[32];
-  size_t vertex_shader_sampler_count;
  SamplerInput pixel_shader_samplers[32];
-  size_t pixel_shader_sampler_count;

-  GLuint64 state_data_gpu_ptr;
+  UniformDataBlock* state_data;
 };

 class CommandProcessor {
@ -195,11 +228,15 @@ class CommandProcessor {

  void PrepareDraw(DrawCommand* draw_command);
  bool IssueDraw(DrawCommand* draw_command);
-  bool UpdateState(DrawCommand* draw_command);
  bool UpdateRenderTargets(DrawCommand* draw_command);
+  bool UpdateState(DrawCommand* draw_command);
+  bool UpdateConstants(DrawCommand* draw_command);
  bool UpdateShaders(DrawCommand* draw_command);
  bool PopulateIndexBuffer(DrawCommand* draw_command);
  bool PopulateVertexBuffers(DrawCommand* draw_command);
+  bool PopulateSamplers(DrawCommand* draw_command);
+  bool PopulateSampler(DrawCommand* draw_command,
+                       const Shader::SamplerDesc& desc);
  bool IssueCopy(DrawCommand* draw_command);

  CachedFramebuffer* GetFramebuffer(GLuint color_targets[4],
@ -237,7 +274,7 @@ class CommandProcessor {
  uint64_t bin_select_;
  uint64_t bin_mask_;

-  GLuint uniform_data_buffer_;
+  bool has_bindless_vbos_;

  std::vector<std::unique_ptr<GL4Shader>> all_shaders_;
  std::unordered_map<uint64_t, GL4Shader*> shader_cache_;
@ -251,7 +288,7 @@ class CommandProcessor {
  std::vector<CachedDepthRenderTarget> cached_depth_render_targets_;
  std::vector<std::unique_ptr<CachedPipeline>> all_pipelines_;
  std::unordered_map<uint64_t, CachedPipeline*> cached_pipelines_;
-
+  TextureCache texture_cache_;
  CircularBuffer scratch_buffer_;

  DrawCommand draw_command_;
--- a/src/xenia/gpu/gl4/gl4_gpu-private.h
+++ b/src/xenia/gpu/gl4/gl4_gpu-private.h
@ -17,6 +17,9 @@

 DECLARE_bool(thread_safe_gl);

+// GL debug output flags; defined in gl4_gpu.cc and consumed by
+// GLContext::SetupDebugging.
+DECLARE_bool(gl_debug_output);
+DECLARE_bool(gl_debug_output_synchronous);
+
 namespace xe {
 namespace gpu {
 namespace gl4 {
--- a/src/xenia/gpu/gl4/gl4_gpu.cc
+++ b/src/xenia/gpu/gl4/gl4_gpu.cc
@ -15,6 +15,10 @@
 DEFINE_bool(thread_safe_gl, false,
            "Only allow one GL context to be active at a time.");

+// When set, GLContext::SetupDebugging enables GL_DEBUG_OUTPUT and registers a
+// message callback on each context it sets up.
+DEFINE_bool(gl_debug_output, false, "Dump ARB_debug_output to stderr.");
+// When set (and gl_debug_output is on), GL_DEBUG_OUTPUT_SYNCHRONOUS is also
+// enabled.
+DEFINE_bool(gl_debug_output_synchronous, true,
+            "ARB_debug_output will synchronize to be thread safe.");
+
 namespace xe {
 namespace gpu {
 namespace gl4 {
--- a/src/xenia/gpu/gl4/gl4_shader.cc
+++ b/src/xenia/gpu/gl4/gl4_shader.cc
@ -35,7 +35,6 @@ const std::string header =
    "#extension GL_ARB_explicit_uniform_location : require\n"
    "#extension GL_ARB_shading_language_420pack : require\n"
    "#extension GL_ARB_shader_storage_buffer_object : require\n"
-    "#extension GL_NV_shader_buffer_load : require\n"
    "precision highp float;\n"
    "precision highp int;\n"
    "layout(std140, column_major) uniform;\n"
@ -46,6 +45,7 @@ const std::string header =
    "  vec4 viewport_offset;\n"
    "  vec4 viewport_scale;\n"
    "  vec4 alpha_test;\n"
+    "  uvec2 texture_samplers[32];\n"
    "  vec4 float_consts[512];\n"
    "  uint fetch_consts[32 * 6];\n"
    "  int bool_consts[8];\n"
@ -55,7 +55,9 @@ const std::string header =
    "  vec4 o[16];\n"
    "};\n"
    "\n"
-    "uniform StateData* state;\n";
+    "layout(binding = 0) buffer State {\n"
+    "  StateData state;\n"
+    "};\n";

 bool GL4Shader::PrepareVertexShader(
    const xenos::xe_gpu_program_cntl_t& program_cntl) {
@ -69,20 +71,20 @@ bool GL4Shader::PrepareVertexShader(
      // TODO(benvanik): piecewise viewport_enable -> offset/scale logic.
      "  if (false) {\n"
      "  } else {\n"
-      /*"    pos.xy = pos.xy / vec2(state->window_offset.z / 2.0, "
-      "-state->window_offset.w / 2.0) + vec2(-1.0, 1.0);\n"
+      /*"    pos.xy = pos.xy / vec2(state.window_offset.z / 2.0, "
+      "-state.window_offset.w / 2.0) + vec2(-1.0, 1.0);\n"
      "    pos.zw = vec2(0.0, 1.0);\n"*/
      "    pos.xy = pos.xy / vec2(1280.0 / 2.0, "
      "-720.0 / 2.0) + vec2(-1.0, 1.0);\n"
      "    //pos.zw = vec2(0.0, 1.0);\n"
      "  }\n"
-      "  pos.x = pos.x * state->viewport_scale.x + \n"
-      "      state->viewport_offset.x;\n"
-      "  pos.y = pos.y * state->viewport_scale.y + \n"
-      "      state->viewport_offset.y;\n"
-      "  pos.z = pos.z * state->viewport_scale.z + \n"
-      "      state->viewport_offset.z;\n"
-      "  pos.xy += state->window_offset.xy;\n"
+      "  pos.x = pos.x * state.viewport_scale.x + \n"
+      "      state.viewport_offset.x;\n"
+      "  pos.y = pos.y * state.viewport_scale.y + \n"
+      "      state.viewport_offset.y;\n"
+      "  pos.z = pos.z * state.viewport_scale.z + \n"
+      "      state.viewport_offset.z;\n"
+      "  pos.xy += state.window_offset.xy;\n"
      "  return pos;\n"
      "}\n";
  std::string source =
@ -105,6 +107,8 @@ bool GL4Shader::PrepareVertexShader(
      "  gl_Position = applyViewport(gl_Position);\n"
      "}\n";

+  // glGetTextureSamplerHandleARB()
+
  std::string translated_source =
      shader_translator_.TranslateVertexShader(this, program_cntl);
  if (translated_source.empty()) {
@ -135,9 +139,9 @@ bool GL4Shader::PreparePixelShader(
      "void processFragment();\n"
      "void main() {\n"
      "  for (int i = 0; i < oC.length(); ++i) {\n"
-      "    oC[i] = vec4(0.0, 0.0, 0.0, 0.0);\n"
+      "    oC[i] = vec4(1.0, 0.0, 0.0, 1.0);\n"
      "  }\n" +
-      (program_cntl.ps_export_depth ? "  gl_FragDepth = 0.0\n" : "") +
+      (program_cntl.ps_export_depth ? "  gl_FragDepth = 0.0;\n" : "") +
      "  processFragment();\n"
      "}\n";

--- a/src/xenia/gpu/gl4/gl4_shader_translator.cc
+++ b/src/xenia/gpu/gl4/gl4_shader_translator.cc
@ -28,25 +28,21 @@ static const char chan_names[] = {
 const char* GetVertexFormatTypeName(const GL4Shader::BufferDescElement& el) {
  switch (el.format) {
    case VertexFormat::k_32:
-      return el.is_signed ? "int" : "uint";
    case VertexFormat::k_32_FLOAT:
      return "float";
    case VertexFormat::k_16_16:
    case VertexFormat::k_32_32:
-      return el.is_signed ? "ivec2" : "uvec2";
    case VertexFormat::k_16_16_FLOAT:
    case VertexFormat::k_32_32_FLOAT:
      return "vec2";
    case VertexFormat::k_10_11_11:
    case VertexFormat::k_11_11_10:
-      return "int3";  // ?
    case VertexFormat::k_32_32_32_FLOAT:
      return "vec3";
    case VertexFormat::k_8_8_8_8:
    case VertexFormat::k_2_10_10_10:
    case VertexFormat::k_16_16_16_16:
    case VertexFormat::k_32_32_32_32:
-      return el.is_signed ? "ivec4" : "uvec4";
    case VertexFormat::k_16_16_16_16_FLOAT:
    case VertexFormat::k_32_32_32_32_FLOAT:
      return "vec4";
@ -58,14 +54,13 @@ const char* GetVertexFormatTypeName(const GL4Shader::BufferDescElement& el) {
 }

 GL4ShaderTranslator::GL4ShaderTranslator()
-    : output_(kOutputCapacity), tex_fetch_index_(0), dwords_(nullptr) {}
+    : output_(kOutputCapacity), dwords_(nullptr) {}

 GL4ShaderTranslator::~GL4ShaderTranslator() = default;

 void GL4ShaderTranslator::Reset(GL4Shader* shader) {
  output_.Reset();
  shader_type_ = shader->type();
-  tex_fetch_index_ = 0;
  dwords_ = shader->data();
 }

@ -76,8 +71,6 @@ std::string GL4ShaderTranslator::TranslateVertexShader(
  // Normal shaders only, for now.
  assert_true(program_cntl.vs_export_mode == 0);

-  AppendTextureHeader(vertex_shader->sampler_inputs());
-
  // Add vertex shader input.
  uint32_t el_index = 0;
  const auto& buffer_inputs = vertex_shader->buffer_inputs();
@ -102,7 +95,7 @@ std::string GL4ShaderTranslator::TranslateVertexShader(
  // Add temporaries for any registers we may use.
  uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs;
  for (uint32_t n = 0; n <= temp_regs; n++) {
-    Append("  vec4 r%d = state->float_consts[%d];\n", n, n);
+    Append("  vec4 r%d = state.float_consts[%d];\n", n, n);
  }
  Append("  vec4 t;\n");

@ -129,15 +122,13 @@ std::string GL4ShaderTranslator::TranslatePixelShader(
  // If the same PS is used with different VS that output different amounts
  // (and less than the number of required registers), things may die.

-  AppendTextureHeader(pixel_shader->sampler_inputs());
-
  // Pixel shader main() header.
  Append("void processFragment() {\n");

  // Add temporary registers.
  uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs;
  for (uint32_t n = 0; n <= std::max(15u, temp_regs); n++) {
-    Append("  vec4 r%d = state->float_consts[%d];\n", n, n + 256);
+    Append("  vec4 r%d = state.float_consts[%d];\n", n, n + 256);
  }
  Append("  vec4 t;\n");
  Append("  float s;\n");  // scalar result (used for RETAIN_PREV)
@ -161,42 +152,6 @@ std::string GL4ShaderTranslator::TranslatePixelShader(
  return output_.to_string();
 }

-void GL4ShaderTranslator::AppendTextureHeader(
-    const GL4Shader::SamplerInputs& sampler_inputs) {
-  bool fetch_setup[32] = {false};
-
-  // 1 texture per constant slot, 1 sampler per fetch.
-  for (uint32_t n = 0; n < sampler_inputs.count; n++) {
-    const auto& input = sampler_inputs.descs[n];
-    const auto& fetch = input.tex_fetch;
-
-    // Add texture, if needed.
-    if (!fetch_setup[fetch.const_idx]) {
-      fetch_setup[fetch.const_idx] = true;
-      const char* texture_type = nullptr;
-      switch (fetch.dimension) {
-        case DIMENSION_1D:
-          texture_type = "Texture1D";
-          break;
-        default:
-        case DIMENSION_2D:
-          texture_type = "Texture2D";
-          break;
-        case DIMENSION_3D:
-          texture_type = "Texture3D";
-          break;
-        case DIMENSION_CUBE:
-          texture_type = "TextureCube";
-          break;
-      }
-      Append("%s x_texture_%d;\n", texture_type, fetch.const_idx);
-    }
-
-    // Add sampler.
-    Append("SamplerState x_sampler_%d;\n", n);
-  }
-}
-
 void GL4ShaderTranslator::AppendSrcReg(uint32_t num, uint32_t type,
                                       uint32_t swiz, uint32_t negate,
                                       uint32_t abs_constants) {
@ -217,7 +172,7 @@ void GL4ShaderTranslator::AppendSrcReg(uint32_t num, uint32_t type,
    if (abs_constants) {
      Append("abs(");
    }
-    Append("state->float_consts[%u]", is_pixel_shader() ? num + 256 : num);
+    Append("state.float_consts[%u]", is_pixel_shader() ? num + 256 : num);
    if (abs_constants) {
      Append(")");
    }
@ -258,9 +213,12 @@ void GL4ShaderTranslator::AppendDestRegName(uint32_t num, uint32_t dst_exp) {
          case 0:
            Append("oC[0]");
            break;
+          case 61:
+            // Write to t, as we need to splice just x out of it.
+            Append("t");
+            break;
          default:
            // TODO(benvanik): other render targets?
-            // TODO(benvanik): depth?
            assert_always();
            break;
        }
@ -282,7 +240,10 @@ void GL4ShaderTranslator::AppendDestReg(uint32_t num, uint32_t mask,

 void GL4ShaderTranslator::AppendDestRegPost(uint32_t num, uint32_t mask,
                                            uint32_t dst_exp) {
-  if (mask != 0xF) {
+  if (num == 61) {
+    // gl_FragDepth handling to just get x from the temp result.
+    Append("  gl_FragDepth = t.x;\n");
+  } else if (mask != 0xF) {
    // Masking.
    Append("  ");
    AppendDestRegName(num, dst_exp);
@ -399,7 +360,7 @@ bool GL4ShaderTranslator::TranslateALU_ADDv(const instr_alu_t& alu) {
               alu.abs_constants);
  Append(")");
  if (alu.vector_clamp) {
-    Append(")");
+    Append(", 0.0, 1.0)");
  }
  Append(";\n");
  AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data);
@ -685,7 +646,7 @@ bool GL4ShaderTranslator::TranslateALU_DOT4v(const instr_alu_t& alu) {
  if (alu.vector_clamp) {
    Append(", 0.0, 1.0)");
  }
-  Append(";\n");
+  Append(".xxxx;\n");
  AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data);
  return true;
 }
@ -706,7 +667,7 @@ bool GL4ShaderTranslator::TranslateALU_DOT3v(const instr_alu_t& alu) {
  if (alu.vector_clamp) {
    Append(", 0.0, 1.0)");
  }
-  Append(";\n");
+  Append(".xxxx;\n");
  AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data);
  return true;
 }
@ -730,7 +691,7 @@ bool GL4ShaderTranslator::TranslateALU_DOT2ADDv(const instr_alu_t& alu) {
  if (alu.vector_clamp) {
    Append(", 0.0, 1.0)");
  }
-  Append(";\n");
+  Append(".xxxx;\n");
  AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data);
  return true;
 }
@ -1402,20 +1363,27 @@ bool GL4ShaderTranslator::TranslateVertexFetch(const instr_fetch_vtx_t* vtx,
 bool GL4ShaderTranslator::TranslateTextureFetch(const instr_fetch_tex_t* tex,
                                                int sync) {
  int src_component_count = 0;
+  const char* sampler_type;
  switch (tex->dimension) {
    case DIMENSION_1D:
      src_component_count = 1;
+      sampler_type = "sampler1D";
      break;
-    default:
    case DIMENSION_2D:
      src_component_count = 2;
+      sampler_type = "sampler2D";
      break;
    case DIMENSION_3D:
      src_component_count = 3;
+      sampler_type = "sampler3D";
      break;
    case DIMENSION_CUBE:
      src_component_count = 3;
+      sampler_type = "samplerCube";
      break;
+    default:
+      assert_unhandled_case(tex->dimension);
+      return false;
  }

  // Disassemble.
@ -1500,10 +1468,10 @@ bool GL4ShaderTranslator::TranslateTextureFetch(const instr_fetch_tex_t* tex,
  Append("\n");

  // Translate.
-  Append("  t = ");
-  Append("x_texture_%d.Sample(x_sampler_%d, r%u.", tex->const_idx,
-         tex_fetch_index_++,  // hacky way to line up to tex buffers
-         tex->src_reg);
+  // TODO(benvanik): if sampler == null, set to invalid color.
+  Append("  t = texture(");
+  Append("%s(state.texture_samplers[%d])", sampler_type, tex->const_idx & 0xF);
+  Append(", r%u.", tex->src_reg);
  src_swiz = tex->src_swiz;
  for (int i = 0; i < src_component_count; i++) {
    Append("%c", chan_names[src_swiz & 0x3]);
@ -1511,6 +1479,26 @@ bool GL4ShaderTranslator::TranslateTextureFetch(const instr_fetch_tex_t* tex,
  }
  Append(");\n");

+  // Output texture coordinates as color.
+  // TODO(benvanik): only if texture is invalid?
+  // Append("  t = vec4(r%u.", tex->src_reg);
+  // src_swiz = tex->src_swiz;
+  // for (int i = 0; i < src_component_count; i++) {
+  //  Append("%c", chan_names[src_swiz & 0x3]);
+  //  src_swiz >>= 2;
+  //}
+  // switch (src_component_count) {
+  //  case 1:
+  //    Append(", 0.0, 0.0, 1.0);\n");
+  //    break;
+  //  case 2:
+  //    Append(", 0.0, 1.0);\n");
+  //    break;
+  //  case 3:
+  //    Append(", 1.0);\n");
+  //    break;
+  //}
+
  Append("  r%u.xyzw = vec4(", tex->dst_reg);
  uint32_t dst_swiz = tex->dst_swiz;
  for (int i = 0; i < 4; i++) {
@ -1524,6 +1512,7 @@ bool GL4ShaderTranslator::TranslateTextureFetch(const instr_fetch_tex_t* tex,
    } else if ((dst_swiz & 0x7) == 6) {
      // ?
      Append("?");
+      assert_always();
    } else if ((dst_swiz & 0x7) == 7) {
      Append("r%u.%c", tex->dst_reg, chan_names[i]);
    } else {
--- a/src/xenia/gpu/gl4/gl4_shader_translator.h
+++ b/src/xenia/gpu/gl4/gl4_shader_translator.h
@ -39,7 +39,6 @@ class GL4ShaderTranslator {

 protected:
  ShaderType shader_type_;
-  uint32_t tex_fetch_index_;
  const uint32_t* dwords_;

  static const int kOutputCapacity = 64 * 1024;
@ -56,8 +55,6 @@ class GL4ShaderTranslator {
    va_end(args);
  }

-  void AppendTextureHeader(const GL4Shader::SamplerInputs& sampler_inputs);
-
  void AppendSrcReg(uint32_t num, uint32_t type, uint32_t swiz, uint32_t negate,
                    uint32_t abs);
  void AppendDestRegName(uint32_t num, uint32_t dst_exp);
--- a/src/xenia/gpu/gl4/gl_context.cc
+++ b/src/xenia/gpu/gl4/gl_context.cc
@ -115,6 +115,8 @@ bool GLContext::Initialize(HWND hwnd) {
    // Clearing errors.
  }

+  SetupDebugging();
+
  ClearCurrent();

  return true;
@ -160,11 +162,120 @@ std::unique_ptr<GLContext> GLContext::CreateShared() {
    return nullptr;
  }

+  SetupDebugging();
+
  new_context->ClearCurrent();

  return new_context;
 }

+void GLContext::DebugMessage(GLenum source, GLenum type, GLuint id,
+                             GLenum severity, GLsizei length,
+                             const GLchar* message) {
+  const char* source_name = nullptr;
+  switch (source) {
+    case GL_DEBUG_SOURCE_API_ARB:
+      source_name = "OpenGL";
+      break;
+    case GL_DEBUG_SOURCE_WINDOW_SYSTEM_ARB:
+      source_name = "Windows";
+      break;
+    case GL_DEBUG_SOURCE_SHADER_COMPILER_ARB:
+      source_name = "Shader Compiler";
+      break;
+    case GL_DEBUG_SOURCE_THIRD_PARTY_ARB:
+      source_name = "Third Party";
+      break;
+    case GL_DEBUG_SOURCE_APPLICATION_ARB:
+      source_name = "Application";
+      break;
+    case GL_DEBUG_SOURCE_OTHER_ARB:
+      source_name = "Other";
+      break;
+    default:
+      source_name = "(unknown source)";
+      break;
+  }
+
+  const char* type_name = nullptr;
+  switch (type) {
+    case GL_DEBUG_TYPE_ERROR:
+      type_name = "error";
+      break;
+    case GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR:
+      type_name = "deprecated behavior";
+      break;
+    case GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR:
+      type_name = "undefined behavior";
+      break;
+    case GL_DEBUG_TYPE_PORTABILITY:
+      type_name = "portability";
+      break;
+    case GL_DEBUG_TYPE_PERFORMANCE:
+      type_name = "performance";
+      break;
+    case GL_DEBUG_TYPE_OTHER:
+      type_name = "message";
+      break;
+    case GL_DEBUG_TYPE_MARKER:
+      type_name = "marker";
+      break;
+    case GL_DEBUG_TYPE_PUSH_GROUP:
+      type_name = "push group";
+      break;
+    case GL_DEBUG_TYPE_POP_GROUP:
+      type_name = "pop group";
+      break;
+    default:
+      type_name = "(unknown type)";
+      break;
+  }
+
+  const char* severity_name = nullptr;
+  switch (severity) {
+    case GL_DEBUG_SEVERITY_HIGH_ARB:
+      severity_name = "high";
+      break;
+    case GL_DEBUG_SEVERITY_MEDIUM_ARB:
+      severity_name = "medium";
+      break;
+    case GL_DEBUG_SEVERITY_LOW_ARB:
+      severity_name = "low";
+      break;
+    case GL_DEBUG_SEVERITY_NOTIFICATION:
+      severity_name = "notification";
+      break;
+    default:
+      severity_name = "(unknown severity)";
+      break;
+  }
+
+  XELOGE("GL4 %s: %s(%s) %d: %s", source_name, type_name, severity_name, id,
+         message);
+}
+
+// Static trampoline registered with GL: recovers the GLContext that was
+// passed as the callback's user parameter and forwards the message to it.
+void GLAPIENTRY
+GLContext::DebugMessageThunk(GLenum source, GLenum type, GLuint id,
+                             GLenum severity, GLsizei length,
+                             const GLchar* message, GLvoid* user_param) {
+  auto context = static_cast<GLContext*>(user_param);
+  context->DebugMessage(source, type, id, severity, length, message);
+}
+
+// Turns on GL debug output and routes messages to DebugMessage(), but only
+// when the gl_debug_output flag is set. Issues GL calls, so a context must be
+// current when this runs.
+void GLContext::SetupDebugging() {
+  if (FLAGS_gl_debug_output) {
+    glEnable(GL_DEBUG_OUTPUT);
+    if (FLAGS_gl_debug_output_synchronous) {
+      glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
+    }
+    // Enable every message: no source/type/severity filtering, no id list.
+    glDebugMessageControl(GL_DONT_CARE, GL_DONT_CARE, GL_DONT_CARE, 0, nullptr,
+                          GL_TRUE);
+    // |this| comes back as the callback's user parameter.
+    auto callback = reinterpret_cast<GLDEBUGPROC>(&DebugMessageThunk);
+    glDebugMessageCallback(callback, this);
+  }
+}
+
 bool GLContext::MakeCurrent() {
  if (FLAGS_thread_safe_gl) {
    global_gl_mutex_.lock();
--- a/src/xenia/gpu/gl4/gl_context.h
+++ b/src/xenia/gpu/gl4/gl_context.h
@ -35,6 +35,13 @@ class GLContext {
  void ClearCurrent();

 private:
+  // Enables GL debug output and registers the message callback when the
+  // gl_debug_output flag is set; otherwise does nothing.
+  void SetupDebugging();
+  // Logs a single GL debug message with readable source/type/severity names.
+  void DebugMessage(GLenum source, GLenum type, GLuint id, GLenum severity,
+                    GLsizei length, const GLchar* message);
+  // Static callback handed to GL; user_param carries the GLContext* that
+  // receives the message.
+  static void GLAPIENTRY
+  DebugMessageThunk(GLenum source, GLenum type, GLuint id, GLenum severity,
+                    GLsizei length, const GLchar* message, GLvoid* user_param);
+
  HWND hwnd_;
  HDC dc_;
  HGLRC glrc_;
--- a/src/xenia/gpu/gl4/sources.gypi
+++ b/src/xenia/gpu/gl4/sources.gypi
@ -16,6 +16,8 @@
    'gl4_shader_translator.h',
    'gl_context.cc',
    'gl_context.h',
+    'texture_cache.cc',
+    'texture_cache.h',
  ],

  'conditions': [
--- a/src/xenia/gpu/gl4/texture_cache.cc
+++ b/src/xenia/gpu/gl4/texture_cache.cc
@ -0,0 +1,497 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include <xenia/gpu/gl4/texture_cache.h>
+
+#include <poly/assert.h>
+#include <poly/math.h>
+#include <xenia/gpu/gpu-private.h>
+
+namespace xe {
+namespace gpu {
+namespace gl4 {
+
+using namespace xe::gpu::xenos;
+
+extern "C" GLEWContext* glewGetContext();
+extern "C" WGLEWContext* wglewGetContext();
+
+// Constructs an empty cache. No GL work happens here; call Initialize()
+// before use. The scratch buffer pointer is cleared so it never holds an
+// indeterminate value before Initialize() assigns it.
+TextureCache::TextureCache() : scratch_buffer_(nullptr) {}
+
+// Releases everything the cache still holds by calling Shutdown().
+TextureCache::~TextureCache() { Shutdown(); }
+
+// Remembers the scratch buffer pointer for later use. Performs no GL work and
+// always returns true.
+bool TextureCache::Initialize(CircularBuffer* scratch_buffer) {
+  scratch_buffer_ = scratch_buffer;
+  return true;
+}
+
+// Drops every cached texture and sampler by delegating to Clear().
+void TextureCache::Shutdown() {
+  Clear();
+}
+
+// Destroys all cached entries: every view's handle is made non-resident, its
+// sampler deleted, and then the entry's base texture deleted.
+void TextureCache::Clear() {
+  for (auto& entry : entries_) {
+    for (auto& view : entry.views) {
+      // A view whose setup failed part-way may never have received a handle;
+      // passing 0 to glMakeTextureHandleNonResidentARB is a GL error, so only
+      // release handles that were actually obtained.
+      if (view.texture_sampler_handle) {
+        glMakeTextureHandleNonResidentARB(view.texture_sampler_handle);
+      }
+      glDeleteSamplers(1, &view.sampler);
+    }
+    glDeleteTextures(1, &entry.base_texture);
+  }
+  entries_.clear();
+}
+
+// Creates a GL texture plus sampler for the given guest texture and returns
+// the view holding the resident texture/sampler handle.
+//   host_base/length: host pointer to, and size of, the source texture data.
+//   texture_info:     decoded texture description (dimension, format, ...).
+//   sampler_info:     decoded sampling state.
+// Returns nullptr on any failure; in that case all GL objects created for the
+// attempt are deleted and no entry is left in the cache.
+// Note that no lookup is performed yet: every call appends a new entry.
+// NOTE(review): the returned pointer refers to storage owned by entries_; if
+// that container reallocates on growth, a later Demand() call may invalidate
+// it - confirm callers consume it immediately.
+TextureCache::EntryView* TextureCache::Demand(void* host_base, size_t length,
+                                              const TextureInfo& texture_info,
+                                              const SamplerInfo& sampler_info) {
+  // Resolve the GL target first so an unknown dimension never reaches GL with
+  // an uninitialized target and never creates an entry.
+  GLenum target;
+  switch (texture_info.dimension) {
+    case Dimension::k1D:
+      target = GL_TEXTURE_1D;
+      break;
+    case Dimension::k2D:
+      target = GL_TEXTURE_2D;
+      break;
+    case Dimension::k3D:
+      target = GL_TEXTURE_3D;
+      break;
+    case Dimension::kCube:
+      target = GL_TEXTURE_CUBE_MAP;
+      break;
+    default:
+      assert_unhandled_case(texture_info.dimension);
+      return nullptr;
+  }
+
+  entries_.emplace_back(Entry());
+  auto& entry = entries_.back();
+  entry.texture_info = texture_info;
+
+  // Tears down the entry being built so failures neither leak GL objects nor
+  // leave a half-initialized entry behind. Deleting a zero name is ignored by
+  // GL, so this is safe at any stage after the entry was appended.
+  auto fail = [this]() -> EntryView* {
+    auto& failed_entry = entries_.back();
+    for (auto& view : failed_entry.views) {
+      glDeleteSamplers(1, &view.sampler);
+    }
+    glDeleteTextures(1, &failed_entry.base_texture);
+    entries_.pop_back();
+    return nullptr;
+  };
+
+  // Setup the base texture.
+  glCreateTextures(target, 1, &entry.base_texture);
+  if (!SetupTexture(entry.base_texture, texture_info)) {
+    PLOGE("Unable to setup texture parameters");
+    return fail();
+  }
+
+  // Upload/convert. Only 2D textures are implemented so far.
+  bool uploaded = false;
+  switch (texture_info.dimension) {
+    case Dimension::k2D:
+      uploaded = UploadTexture2D(entry.base_texture, host_base, length,
+                                 texture_info, sampler_info);
+      break;
+    case Dimension::k1D:
+    case Dimension::k3D:
+    case Dimension::kCube:
+      assert_unhandled_case(texture_info.dimension);
+      return fail();
+  }
+  if (!uploaded) {
+    PLOGE("Failed to convert/upload texture");
+    return fail();
+  }
+
+  entry.views.emplace_back(EntryView());
+  auto& entry_view = entry.views.back();
+  entry_view.sampler_info = sampler_info;
+
+  // Setup the sampler.
+  glCreateSamplers(1, &entry_view.sampler);
+  if (!SetupSampler(entry_view.sampler, texture_info, sampler_info)) {
+    PLOGE("Unable to setup texture sampler parameters");
+    return fail();
+  }
+
+  // Get the uvec2 handle to the texture/sampler pair and make it resident.
+  // The handle can be passed directly to the shader.
+  entry_view.texture_sampler_handle =
+      glGetTextureSamplerHandleARB(entry.base_texture, entry_view.sampler);
+  if (!entry_view.texture_sampler_handle) {
+    PLOGE("Unable to get texture sampler handle");
+    return fail();
+  }
+  glMakeTextureHandleResidentARB(entry_view.texture_sampler_handle);
+
+  return &entry_view;
+}
+
+// Applies per-texture parameters (mip range and component swizzle) to an
+// already created texture object.
+// Returns false if the texture's swizzle uses a component selector that has
+// no GL equivalent in the table below.
+bool TextureCache::SetupTexture(GLuint texture,
+                                const TextureInfo& texture_info) {
+  // Pre-shader swizzle: four 3-bit selectors packed as R, G, B, A from the
+  // low bits upwards.
+  // TODO(benvanik): can this be dynamic? Maybe per view?
+  // We may have to emulate this in the shader.
+  static const GLenum swizzle_map[] = {
+      GL_RED, GL_GREEN, GL_BLUE, GL_ALPHA, GL_ZERO, GL_ONE,
+  };
+  const size_t swizzle_map_count =
+      sizeof(swizzle_map) / sizeof(swizzle_map[0]);
+  GLenum swizzle[4];
+  for (int i = 0; i < 4; ++i) {
+    const uint32_t selector = (texture_info.swizzle >> (i * 3)) & 0x7;
+    // A 3-bit selector can be 0..7 but only 0..5 are mapped; reject the rest
+    // instead of reading past the end of the table.
+    if (selector >= swizzle_map_count) {
+      assert_unhandled_case(selector);
+      return false;
+    }
+    swizzle[i] = swizzle_map[selector];
+  }
+
+  // TODO(benvanik): texture mip levels.
+  glTextureParameteri(texture, GL_TEXTURE_BASE_LEVEL, 0);
+  glTextureParameteri(texture, GL_TEXTURE_MAX_LEVEL, 1);
+
+  glTextureParameteri(texture, GL_TEXTURE_SWIZZLE_R, swizzle[0]);
+  glTextureParameteri(texture, GL_TEXTURE_SWIZZLE_G, swizzle[1]);
+  glTextureParameteri(texture, GL_TEXTURE_SWIZZLE_B, swizzle[2]);
+  glTextureParameteri(texture, GL_TEXTURE_SWIZZLE_A, swizzle[3]);
+
+  return true;
+}
+
+// Configures a sampler object (border color, LOD range, wrap modes and
+// min/mag filters) from the decoded sampler state.
+// Returns false if a filter value is not handled; the sampler may then be
+// only partially configured.
+bool TextureCache::SetupSampler(GLuint sampler, const TextureInfo& texture_info,
+                                const SamplerInfo& sampler_info) {
+  // TODO(benvanik): border color from texture fetch.
+  GLfloat border_color[4] = {0.0f};
+  glSamplerParameterfv(sampler, GL_TEXTURE_BORDER_COLOR, border_color);
+
+  // TODO(benvanik): setup LODs for mipmapping.
+  glSamplerParameterf(sampler, GL_TEXTURE_LOD_BIAS, 0.0f);
+  glSamplerParameterf(sampler, GL_TEXTURE_MIN_LOD, 0.0f);
+  glSamplerParameterf(sampler, GL_TEXTURE_MAX_LOD, 0.0f);
+
+  // Texture wrapping modes.
+  // TODO(benvanik): not sure if the middle ones are correct.
+  // NOTE(review): assumes clamp_u/v/w are always in [0, 7] - confirm against
+  // SamplerInfo.
+  static const GLenum wrap_map[] = {
+      GL_REPEAT,                      //
+      GL_MIRRORED_REPEAT,             //
+      GL_CLAMP_TO_EDGE,               //
+      GL_MIRROR_CLAMP_TO_EDGE,        //
+      GL_CLAMP_TO_BORDER,             // ?
+      GL_MIRROR_CLAMP_TO_BORDER_EXT,  // ?
+      GL_CLAMP_TO_BORDER,             //
+      GL_MIRROR_CLAMP_TO_BORDER_EXT,  //
+  };
+  glSamplerParameteri(sampler, GL_TEXTURE_WRAP_S,
+                      wrap_map[sampler_info.clamp_u]);
+  glSamplerParameteri(sampler, GL_TEXTURE_WRAP_T,
+                      wrap_map[sampler_info.clamp_v]);
+  glSamplerParameteri(sampler, GL_TEXTURE_WRAP_R,
+                      wrap_map[sampler_info.clamp_w]);
+
+  // Texture level filtering.
+  // Mipmapping is not set up yet, so every mip filter currently collapses to
+  // the non-mipmapped filter; the intended mipmapped values are kept in the
+  // comments.
+  GLenum min_filter;
+  switch (sampler_info.min_filter) {
+    case ucode::TEX_FILTER_POINT:
+      switch (sampler_info.mip_filter) {
+        case ucode::TEX_FILTER_BASEMAP:
+          min_filter = GL_NEAREST;
+          break;
+        case ucode::TEX_FILTER_POINT:
+          // min_filter = GL_NEAREST_MIPMAP_NEAREST;
+          min_filter = GL_NEAREST;
+          break;
+        case ucode::TEX_FILTER_LINEAR:
+          // min_filter = GL_NEAREST_MIPMAP_LINEAR;
+          min_filter = GL_NEAREST;
+          break;
+        default:
+          assert_unhandled_case(sampler_info.mip_filter);
+          return false;
+      }
+      break;
+    case ucode::TEX_FILTER_LINEAR:
+      switch (sampler_info.mip_filter) {
+        case ucode::TEX_FILTER_BASEMAP:
+          min_filter = GL_LINEAR;
+          break;
+        case ucode::TEX_FILTER_POINT:
+          // min_filter = GL_LINEAR_MIPMAP_NEAREST;
+          min_filter = GL_LINEAR;
+          break;
+        case ucode::TEX_FILTER_LINEAR:
+          // min_filter = GL_LINEAR_MIPMAP_LINEAR;
+          min_filter = GL_LINEAR;
+          break;
+        default:
+          assert_unhandled_case(sampler_info.mip_filter);
+          return false;
+      }
+      break;
+    default:
+      assert_unhandled_case(sampler_info.min_filter);
+      return false;
+  }
+  GLenum mag_filter;
+  switch (sampler_info.mag_filter) {
+    case ucode::TEX_FILTER_POINT:
+      mag_filter = GL_NEAREST;
+      break;
+    case ucode::TEX_FILTER_LINEAR:
+      mag_filter = GL_LINEAR;
+      break;
+    default:
+      // Report the switched-on value, not the still-unassigned local.
+      assert_unhandled_case(sampler_info.mag_filter);
+      return false;
+  }
+  glSamplerParameteri(sampler, GL_TEXTURE_MIN_FILTER, min_filter);
+  glSamplerParameteri(sampler, GL_TEXTURE_MAG_FILTER, mag_filter);
+
+  // TODO(benvanik): anisotropic filtering.
+  // GL_TEXTURE_MAX_ANISOTROPY_EXT
+
+  return true;
+}
+
+// Copies |length| bytes of texture data from |src| to |dest|, applying the
+// byte/halfword swap selected by |endianness|:
+//   k8in16:  swap the bytes within each 16-bit value.
+//   k8in32:  swap the bytes within each 32-bit value.
+//   k16in32: swap the two 16-bit halves of each 32-bit value.
+//   anything else: plain copy.
+// The swapping paths process whole 16-/32-bit units only; trailing bytes that
+// do not fill a unit are not copied.
+void TextureSwap(Endian endianness, void* dest, const void* src,
+                 size_t length) {
+  switch (endianness) {
+    case Endian::k8in16:
+      poly::copy_and_swap_16_aligned(reinterpret_cast<uint16_t*>(dest),
+                                     reinterpret_cast<const uint16_t*>(src),
+                                     length / 2);
+      break;
+    case Endian::k8in32:
+      poly::copy_and_swap_32_aligned(reinterpret_cast<uint32_t*>(dest),
+                                     reinterpret_cast<const uint32_t*>(src),
+                                     length / 4);
+      break;
+    case Endian::k16in32: {
+      // TODO(benvanik): make more efficient.
+      // memcpy is used for the loads/stores so this path has no alignment or
+      // aliasing requirements on the buffers.
+      auto dest_bytes = reinterpret_cast<uint8_t*>(dest);
+      auto src_bytes = reinterpret_cast<const uint8_t*>(src);
+      const size_t word_count = length / 4;
+      for (size_t i = 0; i < word_count; ++i) {
+        uint32_t value;
+        std::memcpy(&value, src_bytes + i * 4, sizeof(value));
+        value = (value >> 16) | (value << 16);
+        std::memcpy(dest_bytes + i * 4, &value, sizeof(value));
+      }
+      break;
+    }
+    default:
+    case Endian::kUnspecified:
+      std::memcpy(dest, src, length);
+      break;
+  }
+}
+
+// Uploads guest texture data into an existing GL texture object.
+//
+// Picks the GL internal format/format/type for texture_info.format, allocates
+// storage for the texture, stages the texels (endian-swapped, and untiled
+// when texture_info.is_tiled is set) in scratch_buffer_, and then uploads
+// from the scratch buffer bound as GL_PIXEL_UNPACK_BUFFER.
+//
+// length is asserted to equal texture_info.input_length. sampler_info is not
+// read by this function. Returns false if the format has no mapping in the
+// switch below, true otherwise.
+bool TextureCache::UploadTexture2D(GLuint texture, void* host_base,
+                                   size_t length,
+                                   const TextureInfo& texture_info,
+                                   const SamplerInfo& sampler_info) {
+  assert_true(length == texture_info.input_length);
+
+  GLenum internal_format = GL_RGBA8;
+  GLenum format = GL_RGBA;
+  GLenum type = GL_UNSIGNED_BYTE;
+  // https://code.google.com/p/glsnewton/source/browse/trunk/Source/uDDSLoader.pas?r=62
+  // http://dench.flatlib.jp/opengl/textures
+  // http://fossies.org/linux/WebKit/Source/ThirdParty/ANGLE/src/libGLESv2/formatutils.cpp
+  switch (texture_info.format) {
+    case TextureFormat::k_8:
+      internal_format = GL_R8;
+      // GL_RED is the single-channel pixel transfer format; GL_R is not a
+      // valid value for the format argument of the upload calls.
+      format = GL_RED;
+      type = GL_UNSIGNED_BYTE;
+      break;
+    case TextureFormat::k_1_5_5_5:
+      internal_format = GL_RGB5_A1;
+      format = GL_BGRA;
+      type = GL_UNSIGNED_SHORT_1_5_5_5_REV;
+      break;
+    case TextureFormat::k_5_6_5:
+      internal_format = GL_RGB565;
+      format = GL_RGB;
+      type = GL_UNSIGNED_SHORT_5_6_5;
+      break;
+    case TextureFormat::k_2_10_10_10:
+    case TextureFormat::k_2_10_10_10_AS_16_16_16_16:
+      internal_format = GL_RGB10_A2;
+      format = GL_RGBA;
+      type = GL_UNSIGNED_INT_2_10_10_10_REV;
+      break;
+    case TextureFormat::k_10_11_11:
+    case TextureFormat::k_10_11_11_AS_16_16_16_16:
+      // ?
+      internal_format = GL_R11F_G11F_B10F;
+      format = GL_RGB;
+      type = GL_UNSIGNED_INT_10F_11F_11F_REV;
+      break;
+    case TextureFormat::k_11_11_10:
+    case TextureFormat::k_11_11_10_AS_16_16_16_16:
+      internal_format = GL_R11F_G11F_B10F;
+      format = GL_RGB;
+      type = GL_UNSIGNED_INT_10F_11F_11F_REV;
+      break;
+    case TextureFormat::k_8_8_8_8:
+    case TextureFormat::k_8_8_8_8_AS_16_16_16_16:
+      internal_format = GL_RGBA8;
+      format = GL_RGBA;
+      type = GL_UNSIGNED_BYTE;
+      break;
+    case TextureFormat::k_4_4_4_4:
+      internal_format = GL_RGBA4;
+      format = GL_RGBA;
+      type = GL_UNSIGNED_SHORT_4_4_4_4;
+      break;
+    case TextureFormat::k_16_FLOAT:
+      internal_format = GL_R16F;
+      format = GL_RED;
+      type = GL_HALF_FLOAT;
+      break;
+    case TextureFormat::k_16_16_FLOAT:
+      internal_format = GL_RG16F;
+      format = GL_RG;
+      type = GL_HALF_FLOAT;
+      break;
+    case TextureFormat::k_16_16_16_16_FLOAT:
+      internal_format = GL_RGBA16F;
+      format = GL_RGBA;
+      type = GL_HALF_FLOAT;
+      break;
+    case TextureFormat::k_32_FLOAT:
+      internal_format = GL_R32F;
+      // See k_8: the transfer format must be GL_RED, not GL_R.
+      format = GL_RED;
+      type = GL_FLOAT;
+      break;
+    case TextureFormat::k_32_32_FLOAT:
+      internal_format = GL_RG32F;
+      format = GL_RG;
+      type = GL_FLOAT;
+      break;
+    case TextureFormat::k_32_32_32_FLOAT:
+      internal_format = GL_RGB32F;
+      format = GL_RGB;
+      type = GL_FLOAT;
+      break;
+    case TextureFormat::k_32_32_32_32_FLOAT:
+      internal_format = GL_RGBA32F;
+      format = GL_RGBA;
+      type = GL_FLOAT;
+      break;
+    // For the compressed formats `format` carries the compressed format enum
+    // that is handed to glCompressedTextureSubImage2D; `type` is unused.
+    case TextureFormat::k_DXT1:
+    case TextureFormat::k_DXT1_AS_16_16_16_16:
+      // or GL_COMPRESSED_RGB_S3TC_DXT1_EXT?
+      internal_format = format = GL_COMPRESSED_RGBA_S3TC_DXT1_EXT;
+      break;
+    case TextureFormat::k_DXT2_3:
+    case TextureFormat::k_DXT2_3_AS_16_16_16_16:
+      internal_format = format = GL_COMPRESSED_RGBA_S3TC_DXT3_EXT;
+      break;
+    case TextureFormat::k_DXT4_5:
+    case TextureFormat::k_DXT4_5_AS_16_16_16_16:
+      internal_format = format = GL_COMPRESSED_RGBA_S3TC_DXT5_EXT;
+      break;
+    case TextureFormat::k_24_8:
+      internal_format = GL_DEPTH24_STENCIL8;
+      format = GL_DEPTH_STENCIL;
+      type = GL_UNSIGNED_INT_24_8;
+      break;
+    case TextureFormat::k_24_8_FLOAT:
+      internal_format = GL_DEPTH24_STENCIL8;
+      format = GL_DEPTH_STENCIL;
+      type = GL_FLOAT_32_UNSIGNED_INT_24_8_REV;
+      break;
+    // Everything below has no GL mapping yet and fails the upload.
+    default:
+    case TextureFormat::k_1_REVERSE:
+    case TextureFormat::k_1:
+    case TextureFormat::k_6_5_5:
+    case TextureFormat::k_8_A:
+    case TextureFormat::k_8_B:
+    case TextureFormat::k_8_8:
+    case TextureFormat::k_Cr_Y1_Cb_Y0:
+    case TextureFormat::k_Y1_Cr_Y0_Cb:
+    case TextureFormat::k_8_8_8_8_A:
+    case TextureFormat::k_16:
+    case TextureFormat::k_16_16:
+    case TextureFormat::k_16_16_16_16:
+    case TextureFormat::k_16_EXPAND:
+    case TextureFormat::k_16_16_EXPAND:
+    case TextureFormat::k_16_16_16_16_EXPAND:
+    case TextureFormat::k_32_32:
+    case TextureFormat::k_32_32_32_32:
+    case TextureFormat::k_32_AS_8:
+    case TextureFormat::k_32_AS_8_8:
+    case TextureFormat::k_16_MPEG:
+    case TextureFormat::k_16_16_MPEG:
+    case TextureFormat::k_8_INTERLACED:
+    case TextureFormat::k_32_AS_8_INTERLACED:
+    case TextureFormat::k_32_AS_8_8_INTERLACED:
+    case TextureFormat::k_16_INTERLACED:
+    case TextureFormat::k_16_MPEG_INTERLACED:
+    case TextureFormat::k_16_16_MPEG_INTERLACED:
+    case TextureFormat::k_DXN:
+    case TextureFormat::k_DXT3A:
+    case TextureFormat::k_DXT5A:
+    case TextureFormat::k_CTX1:
+    case TextureFormat::k_DXT3A_AS_1_1_1_1:
+      assert_unhandled_case(texture_info.format);
+      return false;
+  }
+
+  size_t unpack_length = texture_info.input_length;
+  glTextureStorage2D(texture, 1, internal_format,
+                     texture_info.size_2d.output_width,
+                     texture_info.size_2d.output_height);
+  assert_true(unpack_length % 4 == 0);
+
+  // Stage the converted texels in the shared scratch buffer.
+  auto allocation = scratch_buffer_->Acquire(unpack_length);
+
+  if (!texture_info.is_tiled) {
+    // Linear source: swap the whole range in one pass.
+    TextureSwap(texture_info.endianness, allocation.host_ptr, host_base,
+                unpack_length);
+    /*const uint8_t* src = reinterpret_cast<const uint8_t*>(host_base);
+    uint8_t* dest = reinterpret_cast<uint8_t*>(allocation.host_ptr);
+    for (uint32_t y = 0; y < texture_info.size_2d.block_height; y++) {
+      for (uint32_t x = 0; x < texture_info.size_2d.logical_pitch;
+           x += texture_info.texel_pitch) {
+        TextureSwap(texture_info.endianness, dest + x, src + x,
+                    texture_info.texel_pitch);
+      }
+      src += texture_info.size_2d.input_pitch;
+      dest += texture_info.size_2d.input_pitch;
+    }*/
+    // std::memcpy(dest, src, unpack_length);
+  } else {
+    // Tiled source: copy block by block, looking up each block's position in
+    // the tiled layout and writing it to its linear position in the output.
+    uint8_t* src = reinterpret_cast<uint8_t*>(host_base);
+    uint8_t* dest = reinterpret_cast<uint8_t*>(allocation.host_ptr);
+    uint32_t output_pitch =
+        (texture_info.size_2d.output_width / texture_info.block_size) *
+        texture_info.texel_pitch;
+    // Evaluates to log2(texel_pitch) for pitches of 1, 2, 4, 8 and 16 bytes.
+    auto bpp =
+        (texture_info.texel_pitch >> 2) +
+        ((texture_info.texel_pitch >> 1) >> (texture_info.texel_pitch >> 2));
+    for (uint32_t y = 0, output_base_offset = 0;
+         y < texture_info.size_2d.block_height;
+         y++, output_base_offset += output_pitch) {
+      auto input_base_offset = TextureInfo::TiledOffset2DOuter(
+          y, (texture_info.size_2d.input_width / texture_info.block_size), bpp);
+      for (uint32_t x = 0, output_offset = output_base_offset;
+           x < texture_info.size_2d.block_width;
+           x++, output_offset += texture_info.texel_pitch) {
+        // The tiled offset is shifted down by bpp to get a block index, which
+        // is then scaled back to bytes with texel_pitch below.
+        auto input_offset =
+            TextureInfo::TiledOffset2DInner(x, y, bpp, input_base_offset) >>
+            bpp;
+        TextureSwap(texture_info.endianness, dest + output_offset,
+                    src + input_offset * texture_info.texel_pitch,
+                    texture_info.texel_pitch);
+      }
+    }
+  }
+  // Remember where the data lives in the buffer before giving the allocation
+  // back; the offset is used as the "pointer" for the unpack-buffer upload.
+  size_t unpack_offset = allocation.offset;
+  scratch_buffer_->Commit(std::move(allocation));
+
+  // glPixelStorei(GL_UNPACK_SWAP_BYTES, GL_TRUE);
+  // glPixelStorei(GL_UNPACK_ALIGNMENT, texture_info.texel_pitch);
+  // NOTE(review): these unpack settings are not restored afterwards; confirm
+  // that no other upload path relies on the GL defaults.
+  glPixelStorei(GL_UNPACK_ROW_LENGTH, texture_info.size_2d.input_width);
+  glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, scratch_buffer_->handle());
+  if (texture_info.is_compressed) {
+    glCompressedTextureSubImage2D(texture, 0, 0, 0,
+                                  texture_info.size_2d.output_width,
+                                  texture_info.size_2d.output_height, format,
+                                  static_cast<GLsizei>(unpack_length),
+                                  reinterpret_cast<void*>(unpack_offset));
+  } else {
+    glTextureSubImage2D(texture, 0, 0, 0, texture_info.size_2d.output_width,
+                        texture_info.size_2d.output_height, format, type,
+                        reinterpret_cast<void*>(unpack_offset));
+  }
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+  return true;
+}
+
+}  // namespace gl4
+}  // namespace gpu
+}  // namespace xe
--- a/src/xenia/gpu/gl4/texture_cache.h
+++ b/src/xenia/gpu/gl4/texture_cache.h
@ -0,0 +1,65 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_GPU_GL4_TEXTURE_CACHE_H_
+#define XENIA_GPU_GL4_TEXTURE_CACHE_H_
+
+#include <vector>
+
+#include <xenia/gpu/gl4/circular_buffer.h>
+#include <xenia/gpu/gl4/gl_context.h>
+#include <xenia/gpu/sampler_info.h>
+#include <xenia/gpu/texture_info.h>
+
+namespace xe {
+namespace gpu {
+namespace gl4 {
+
+// GL4 texture cache. Each Entry pairs a TextureInfo with a GL texture and a
+// list of per-sampler views onto it.
+class TextureCache {
+ public:
+  // A texture as seen through one sampler configuration.
+  struct EntryView {
+    SamplerInfo sampler_info;
+    GLuint sampler;
+    // NOTE(review): presumably the bindless texture+sampler handle - confirm
+    // against the implementation.
+    GLuint64 texture_sampler_handle;
+  };
+  // One texture together with all views created for it.
+  struct Entry {
+    TextureInfo texture_info;
+    GLuint base_texture;
+    std::vector<EntryView> views;
+  };
+
+  TextureCache();
+  ~TextureCache();
+
+  // scratch_buffer is stored as a raw pointer (see scratch_buffer_ below).
+  // NOTE(review): presumably it must outlive the cache - confirm with caller.
+  bool Initialize(CircularBuffer* scratch_buffer);
+  void Shutdown();
+  void Clear();
+
+  // NOTE(review): presumably returns the view matching texture_info and
+  // sampler_info, uploading the data at host_base when it is not cached yet,
+  // and nullptr on failure - confirm against the implementation.
+  EntryView* Demand(void* host_base, size_t length,
+                    const TextureInfo& texture_info,
+                    const SamplerInfo& sampler_info);
+
+ private:
+  bool SetupTexture(GLuint texture, const TextureInfo& texture_info);
+  bool SetupSampler(GLuint sampler, const TextureInfo& texture_info,
+                    const SamplerInfo& sampler_info);
+
+  // Converts the guest data at host_base and uploads it into texture.
+  bool UploadTexture2D(GLuint texture, void* host_base, size_t length,
+                       const TextureInfo& texture_info,
+                       const SamplerInfo& sampler_info);
+
+  // Staging buffer handed in through Initialize.
+  CircularBuffer* scratch_buffer_;
+  std::vector<Entry> entries_;
+};
+
+}  // namespace gl4
+}  // namespace gpu
+}  // namespace xe
+
+#endif  // XENIA_GPU_GL4_TEXTURE_CACHE_H_
--- a/src/xenia/gpu/gl4/wgl_control.cc
+++ b/src/xenia/gpu/gl4/wgl_control.cc
@ -74,17 +74,32 @@ LRESULT WGLControl::WndProc(HWND hWnd, UINT message, WPARAM wParam,
                            LPARAM lParam) {
  switch (message) {
    case WM_PAINT: {
-      GLContextLock context_lock(&context_);
-      // TODO(benvanik): is viewport needed?
-      glViewport(0, 0, width_, height_);
-      float clear_color[] = {rand() / (float)RAND_MAX, 1.0f, 0, 1.0f};
-      glClearNamedFramebufferfv(0, GL_COLOR, 0, clear_color);
-      if (current_paint_callback_) {
-        current_paint_callback_();
-        current_paint_callback_ = nullptr;
+      {
+        GLContextLock context_lock(&context_);
+        wglSwapIntervalEXT(0);
+
+        // TODO(benvanik): is viewport needed?
+        glViewport(0, 0, width_, height_);
+        float clear_color[] = {rand() / (float)RAND_MAX, 1.0f, 0, 1.0f};
+        glClearNamedFramebufferfv(0, GL_COLOR, 0, clear_color);
+
+        if (current_paint_callback_) {
+          current_paint_callback_();
+          current_paint_callback_ = nullptr;
+        }
+
+        // TODO(benvanik): profiler present.
+        // Profiler::Present();
+
+        // Hacky swap timer.
+        static int swap_count = 0;
+        glEnable(GL_SCISSOR_TEST);
+        glScissor(0, 0, 20, 20);
+        float red[] = {swap_count / 60.0f, 0, 0, 1.0f};
+        swap_count = (swap_count + 1) % 60;
+        glClearNamedFramebufferfv(0, GL_COLOR, 0, red);
+        glDisable(GL_SCISSOR_TEST);
      }
-      // TODO(benvanik): profiler present.
-      // Profiler::Present();
      SwapBuffers(context_.dc());
    } break;
  }
--- a/src/xenia/gpu/sampler_info.cc
+++ b/src/xenia/gpu/sampler_info.cc
@ -0,0 +1,31 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include <xenia/gpu/sampler_info.h>
+
+namespace xe {
+namespace gpu {
+
+// Builds the sampler description for one texture fetch instruction.
+//
+// Each filter is taken from the instruction, except that an instruction value
+// of 3 selects the corresponding filter of the fetch constant instead. The
+// clamp modes always come from the fetch constant. Always returns true.
+bool SamplerInfo::Prepare(const xenos::xe_gpu_texture_fetch_t& fetch,
+                          const ucode::instr_fetch_tex_t& fetch_instr,
+                          SamplerInfo* out_info) {
+  // Picks the fetch constant's value when the instruction holds 3.
+  const auto pick_filter = [](uint32_t instr_value, uint32_t fetch_value) {
+    const uint32_t chosen = (instr_value == 3) ? fetch_value : instr_value;
+    return static_cast<ucode::instr_tex_filter_t>(chosen);
+  };
+
+  SamplerInfo& info = *out_info;
+  info.min_filter = pick_filter(fetch_instr.min_filter, fetch.min_filter);
+  info.mag_filter = pick_filter(fetch_instr.mag_filter, fetch.mag_filter);
+  info.mip_filter = pick_filter(fetch_instr.mip_filter, fetch.mip_filter);
+
+  info.clamp_u = fetch.clamp_x;
+  info.clamp_v = fetch.clamp_y;
+  info.clamp_w = fetch.clamp_z;
+  return true;
+}
+
+}  //  namespace gpu
+}  //  namespace xe
--- a/src/xenia/gpu/sampler_info.h
+++ b/src/xenia/gpu/sampler_info.h
@ -0,0 +1,41 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_GPU_SAMPLER_INFO_H_
+#define XENIA_GPU_SAMPLER_INFO_H_
+
+#include <xenia/gpu/ucode.h>
+#include <xenia/gpu/xenos.h>
+
+namespace xe {
+namespace gpu {
+
+// Sampler state for a texture fetch: the three filters plus the clamp mode
+// for each texture coordinate.
+struct SamplerInfo {
+  ucode::instr_tex_filter_t min_filter;
+  ucode::instr_tex_filter_t mag_filter;
+  ucode::instr_tex_filter_t mip_filter;
+  uint32_t clamp_u;
+  uint32_t clamp_v;
+  uint32_t clamp_w;
+
+  // Fills out_info from the fetch constant and the fetch instruction.
+  static bool Prepare(const xenos::xe_gpu_texture_fetch_t& fetch,
+                      const ucode::instr_fetch_tex_t& fetch_instr,
+                      SamplerInfo* out_info);
+
+  // Two sampler infos are equal when every filter and clamp field matches.
+  bool operator==(const SamplerInfo& other) const {
+    const bool same_filters = min_filter == other.min_filter &&
+                              mag_filter == other.mag_filter &&
+                              mip_filter == other.mip_filter;
+    const bool same_clamps = clamp_u == other.clamp_u &&
+                             clamp_v == other.clamp_v &&
+                             clamp_w == other.clamp_w;
+    return same_filters && same_clamps;
+  }
+};
+
+}  // namespace gpu
+}  // namespace xe
+
+#endif  // XENIA_GPU_SAMPLER_INFO_H_
--- a/src/xenia/gpu/shader.cc
+++ b/src/xenia/gpu/shader.cc
@ -172,6 +172,8 @@ void Shader::GatherVertexFetch(const instr_fetch_vtx_t* vtx) {
    return;
  }

+  assert_true(vtx->const_index <= 0x1F);
+
  uint32_t fetch_slot = vtx->const_index * 3 + vtx->const_index_sel;
  auto& inputs = buffer_inputs_;
  BufferDescElement* el = nullptr;
@ -240,10 +242,12 @@ void Shader::GatherVertexFetch(const instr_fetch_vtx_t* vtx) {
 void Shader::GatherTextureFetch(const instr_fetch_tex_t* tex) {
  // TODO(benvanik): check dest_swiz to see if we are writing anything.

+  assert_true(tex->const_idx < 0x1F);
+
  assert_true(sampler_inputs_.count + 1 < poly::countof(sampler_inputs_.descs));
  auto& input = sampler_inputs_.descs[sampler_inputs_.count++];
  input.input_index = sampler_inputs_.count - 1;
-  input.fetch_slot = tex->const_idx & 0xF;  // ?
+  input.fetch_slot = tex->const_idx & 0xF;  // ??????????????????????????????
  input.tex_fetch = *tex;

  // Format mangling, size estimation, etc.
--- a/src/xenia/gpu/sources.gypi
+++ b/src/xenia/gpu/sources.gypi
@ -9,8 +9,12 @@
    'register_file.cc',
    'register_file.h',
    'register_table.inc',
+    'sampler_info.cc',
+    'sampler_info.h',
    'shader.cc',
    'shader.h',
+    'texture_info.cc',
+    'texture_info.h',
    'ucode.h',
    'ucode_disassembler.cc',
    'ucode_disassembler.h',
--- a/src/xenia/gpu/texture_info.cc
+++ b/src/xenia/gpu/texture_info.cc
@ -0,0 +1,239 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include <xenia/gpu/texture_info.h>
+
+#include <poly/math.h>
+
+namespace xe {
+namespace gpu {
+
+using namespace xe::gpu::ucode;
+using namespace xe::gpu::xenos;
+
+// Fills out_info from a texture fetch constant.
+//
+// Copies swizzle, dimension, the raw size fields, endianness and tiling from
+// fetch, derives block_size/texel_pitch/is_compressed from the format, and
+// finally computes the per-dimension size information.
+//
+// Returns false (leaving out_info only partially filled) when the format is
+// not handled yet, or when the texture is 3D or a cube map, for which sizes
+// are not calculated yet. Returns true otherwise.
+bool TextureInfo::Prepare(const xe_gpu_texture_fetch_t& fetch,
+                          TextureInfo* out_info) {
+  // http://msdn.microsoft.com/en-us/library/windows/desktop/cc308051(v=vs.85).aspx
+  // a2xx_sq_surfaceformat
+
+  auto& info = *out_info;
+  info.swizzle = fetch.swizzle;
+
+  // Zero the raw extents first so that the axes a dimension does not have
+  // (height/depth for 1D, depth for 2D) are never left indeterminate.
+  info.width = 0;
+  info.height = 0;
+  info.depth = 0;
+
+  info.dimension = static_cast<Dimension>(fetch.dimension);
+  switch (info.dimension) {
+    case Dimension::k1D:
+      info.width = fetch.size_1d.width;
+      break;
+    case Dimension::k2D:
+      info.width = fetch.size_2d.width;
+      info.height = fetch.size_2d.height;
+      break;
+    case Dimension::k3D:
+    case Dimension::kCube:
+      info.width = fetch.size_3d.width;
+      info.height = fetch.size_3d.height;
+      info.depth = fetch.size_3d.depth;
+      break;
+  }
+  info.endianness = static_cast<Endian>(fetch.endianness);
+
+  info.block_size = 0;
+  info.texel_pitch = 0;
+  info.is_tiled = fetch.tiled;
+  info.is_compressed = false;
+  info.input_length = 0;
+  info.format = static_cast<TextureFormat>(fetch.format);
+  // block_size is the edge length of one block in texels (4 for the DXT
+  // formats, 1 otherwise); texel_pitch is the size of one block in bytes.
+  switch (fetch.format) {
+    case FMT_8:
+      info.block_size = 1;
+      info.texel_pitch = 1;
+      break;
+    case FMT_1_5_5_5:
+      info.block_size = 1;
+      info.texel_pitch = 2;
+      break;
+    case FMT_8_8_8_8:
+    case FMT_8_8_8_8_AS_16_16_16_16:
+      info.block_size = 1;
+      info.texel_pitch = 4;
+      break;
+    case FMT_4_4_4_4:
+      info.block_size = 1;
+      info.texel_pitch = 2;
+      break;
+    case FMT_16_16_16_16_FLOAT:
+      info.block_size = 1;
+      info.texel_pitch = 8;
+      break;
+    case FMT_32_FLOAT:
+      info.block_size = 1;
+      info.texel_pitch = 4;
+      break;
+    case FMT_DXT1:
+      info.block_size = 4;
+      info.texel_pitch = 8;
+      info.is_compressed = true;
+      break;
+    case FMT_DXT2_3:
+    case FMT_DXT4_5:
+      info.block_size = 4;
+      info.texel_pitch = 16;
+      info.is_compressed = true;
+      break;
+    case FMT_DXT1_AS_16_16_16_16:
+      // TODO(benvanik): conversion?
+      info.block_size = 4;
+      info.texel_pitch = 8;
+      info.is_compressed = true;
+      break;
+    case FMT_DXT2_3_AS_16_16_16_16:
+    case FMT_DXT4_5_AS_16_16_16_16:
+      // TODO(benvanik): conversion?
+      info.block_size = 4;
+      info.texel_pitch = 16;
+      info.is_compressed = true;
+      break;
+    // Known formats that are not supported yet: log and fail.
+    case FMT_1_REVERSE:
+    case FMT_1:
+    case FMT_5_6_5:
+    case FMT_6_5_5:
+    case FMT_2_10_10_10:
+    case FMT_8_A:
+    case FMT_8_B:
+    case FMT_8_8:
+    case FMT_Cr_Y1_Cb_Y0:
+    case FMT_Y1_Cr_Y0_Cb:
+    case FMT_5_5_5_1:
+    case FMT_8_8_8_8_A:
+    case FMT_10_11_11:
+    case FMT_11_11_10:
+    case FMT_24_8:
+    case FMT_24_8_FLOAT:
+    case FMT_16:
+    case FMT_16_16:
+    case FMT_16_16_16_16:
+    case FMT_16_EXPAND:
+    case FMT_16_16_EXPAND:
+    case FMT_16_16_16_16_EXPAND:
+    case FMT_16_FLOAT:
+    case FMT_16_16_FLOAT:
+    case FMT_32:
+    case FMT_32_32:
+    case FMT_32_32_32_32:
+    case FMT_32_32_FLOAT:
+    case FMT_32_32_32_32_FLOAT:
+    case FMT_32_AS_8:
+    case FMT_32_AS_8_8:
+    case FMT_16_MPEG:
+    case FMT_16_16_MPEG:
+    case FMT_8_INTERLACED:
+    case FMT_32_AS_8_INTERLACED:
+    case FMT_32_AS_8_8_INTERLACED:
+    case FMT_16_INTERLACED:
+    case FMT_16_MPEG_INTERLACED:
+    case FMT_16_16_MPEG_INTERLACED:
+    case FMT_DXN:
+    case FMT_2_10_10_10_AS_16_16_16_16:
+    case FMT_10_11_11_AS_16_16_16_16:
+    case FMT_11_11_10_AS_16_16_16_16:
+    case FMT_32_32_32_FLOAT:
+    case FMT_DXT3A:
+    case FMT_DXT5A:
+    case FMT_CTX1:
+    case FMT_DXT3A_AS_1_1_1_1:
+      PLOGE("Unhandled texture format");
+      return false;
+    // Values that are not a known format at all.
+    default:
+      assert_unhandled_case(fetch.format);
+      return false;
+  }
+
+  // Must be called here when we know the format.
+  switch (info.dimension) {
+    case Dimension::k1D:
+      info.CalculateTextureSizes1D(fetch);
+      break;
+    case Dimension::k2D:
+      info.CalculateTextureSizes2D(fetch);
+      break;
+    case Dimension::k3D:
+      // TODO(benvanik): calculate size.
+      return false;
+    case Dimension::kCube:
+      // TODO(benvanik): calculate size.
+      return false;
+  }
+
+  return true;
+}
+
+// Records the 1D size: the fetch constant's width field is stored unchanged.
+void TextureInfo::CalculateTextureSizes1D(const xe_gpu_texture_fetch_t& fetch) {
+  // ?
+  const uint32_t fetch_width = fetch.size_1d.width;
+  size_1d.width = fetch_width;
+}
+
+// Derives every size_2d field, and input_length, from the fetch constant.
+// Relies on block_size, texel_pitch and is_compressed having been set for the
+// format already.
+void TextureInfo::CalculateTextureSizes2D(const xe_gpu_texture_fetch_t& fetch) {
+  auto& sizes = size_2d;
+
+  // The fetch constant stores each dimension minus one.
+  sizes.logical_width = fetch.size_2d.width + 1;
+  sizes.logical_height = fetch.size_2d.height + 1;
+
+  // Logical size measured in blocks.
+  sizes.block_width = sizes.logical_width / block_size;
+  sizes.block_height = sizes.logical_height / block_size;
+
+  if (is_compressed) {
+    // must be 128x128
+    sizes.input_width = poly::round_up(sizes.logical_width, 128);
+    sizes.input_height = poly::round_up(sizes.logical_height, 128);
+    sizes.output_width = poly::next_pow2(sizes.logical_width);
+    sizes.output_height = poly::next_pow2(sizes.logical_height);
+  } else {
+    // must be 32x32 but also must have a pitch that is a multiple of 256 bytes
+    const uint32_t bytes_per_block = block_size * block_size * texel_pitch;
+    uint32_t width_multiple = 32;
+    if (bytes_per_block != 0) {
+      const uint32_t pitch_multiple = 256 / bytes_per_block;
+      width_multiple =
+          pitch_multiple > width_multiple ? pitch_multiple : width_multiple;
+    }
+    sizes.input_width = poly::round_up(sizes.logical_width, width_multiple);
+    sizes.input_height = poly::round_up(sizes.logical_height, 32);
+    sizes.output_width = sizes.logical_width;
+    sizes.output_height = sizes.logical_height;
+  }
+
+  // Row pitches in bytes for the logical and the padded input widths.
+  sizes.logical_pitch = (sizes.logical_width / block_size) * texel_pitch;
+  sizes.input_pitch = (sizes.input_width / block_size) * texel_pitch;
+
+  // Tiled and linear textures currently use the same length formula (?).
+  input_length = sizes.block_height * sizes.logical_pitch;
+}
+
+// https://code.google.com/p/crunch/source/browse/trunk/inc/crn_decomp.h#4104
+// Computes the part of a tiled 2D offset that depends only on the row y.
+// The result is meant to be passed as base_offset to TiledOffset2DInner for
+// every x in that row. width is the row width and log_bpp the shift applied
+// for the element size.
+uint32_t TextureInfo::TiledOffset2DOuter(uint32_t y, uint32_t width,
+                                         uint32_t log_bpp) {
+  const uint32_t macro_part = ((y >> 5) * (width >> 5)) << (log_bpp + 7);
+  const uint32_t micro_part = ((y & 6) << 2) << log_bpp;
+
+  uint32_t offset = macro_part;
+  offset += (micro_part & ~15) << 1;
+  offset += micro_part & 15;
+  offset += (y & 8) << (3 + log_bpp);
+  offset += (y & 1) << 4;
+  return offset;
+}
+
+// Completes a tiled 2D offset for column x of row y. base_offset is the
+// per-row value produced by TiledOffset2DOuter and bpp the same element-size
+// shift that was passed to it.
+uint32_t TextureInfo::TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp,
+                                         uint32_t base_offset) {
+  const uint32_t macro_part = (x >> 5) << (bpp + 7);
+  const uint32_t micro_part = (x & 7) << bpp;
+  const uint32_t linear =
+      base_offset +
+      (macro_part + ((micro_part & ~15) << 1) + (micro_part & 15));
+
+  // Spread the linear offset's bit groups and mix in the remaining x/y bits.
+  uint32_t tiled = (linear & ~511) << 3;
+  tiled += (linear & 448) << 2;
+  tiled += linear & 63;
+  tiled += (y & 16) << 7;
+  tiled += ((((y & 8) >> 2) + (x >> 3)) & 3) << 6;
+  return tiled;
+}
+
+}  //  namespace gpu
+}  //  namespace xe
--- a/src/xenia/gpu/texture_info.h
+++ b/src/xenia/gpu/texture_info.h
@ -0,0 +1,140 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_GPU_TEXTURE_INFO_H_
+#define XENIA_GPU_TEXTURE_INFO_H_
+
+#include <xenia/gpu/ucode.h>
+#include <xenia/gpu/xenos.h>
+
+namespace xe {
+namespace gpu {
+
+// a2xx_sq_surfaceformat
+// Texture formats as they appear in a texture fetch constant.
+// TextureInfo::Prepare casts the fetch constant's format field directly to
+// this enum, so the numeric values must not be changed.
+enum class TextureFormat : uint32_t {
+  k_1_REVERSE = 0,
+  k_1 = 1,
+  k_8 = 2,
+  k_1_5_5_5 = 3,
+  k_5_6_5 = 4,
+  k_6_5_5 = 5,
+  k_8_8_8_8 = 6,
+  k_2_10_10_10 = 7,
+  k_8_A = 8,
+  k_8_B = 9,
+  k_8_8 = 10,
+  k_Cr_Y1_Cb_Y0 = 11,
+  k_Y1_Cr_Y0_Cb = 12,
+  // ? hole (no enumerator for value 13)
+  k_8_8_8_8_A = 14,
+  k_4_4_4_4 = 15,
+  k_10_11_11 = 16,
+  k_11_11_10 = 17,
+  k_DXT1 = 18,
+  k_DXT2_3 = 19,
+  k_DXT4_5 = 20,
+  // ? hole (no enumerator for value 21)
+  k_24_8 = 22,
+  k_24_8_FLOAT = 23,
+  k_16 = 24,
+  k_16_16 = 25,
+  k_16_16_16_16 = 26,
+  k_16_EXPAND = 27,
+  k_16_16_EXPAND = 28,
+  k_16_16_16_16_EXPAND = 29,
+  k_16_FLOAT = 30,
+  k_16_16_FLOAT = 31,
+  k_16_16_16_16_FLOAT = 32,
+  k_32 = 33,
+  k_32_32 = 34,
+  k_32_32_32_32 = 35,
+  k_32_FLOAT = 36,
+  k_32_32_FLOAT = 37,
+  k_32_32_32_32_FLOAT = 38,
+  k_32_AS_8 = 39,
+  k_32_AS_8_8 = 40,
+  k_16_MPEG = 41,
+  k_16_16_MPEG = 42,
+  k_8_INTERLACED = 43,
+  k_32_AS_8_INTERLACED = 44,
+  k_32_AS_8_8_INTERLACED = 45,
+  k_16_INTERLACED = 46,
+  k_16_MPEG_INTERLACED = 47,
+  k_16_16_MPEG_INTERLACED = 48,
+  k_DXN = 49,
+  k_8_8_8_8_AS_16_16_16_16 = 50,
+  k_DXT1_AS_16_16_16_16 = 51,
+  k_DXT2_3_AS_16_16_16_16 = 52,
+  k_DXT4_5_AS_16_16_16_16 = 53,
+  k_2_10_10_10_AS_16_16_16_16 = 54,
+  k_10_11_11_AS_16_16_16_16 = 55,
+  k_11_11_10_AS_16_16_16_16 = 56,
+  k_32_32_32_FLOAT = 57,
+  k_DXT3A = 58,
+  k_DXT5A = 59,
+  k_CTX1 = 60,
+  k_DXT3A_AS_1_1_1_1 = 61,
+
+  // Sentinel outside the 0-61 range used by the formats above.
+  kUnknown = 0xFFFFFFFFu,
+};
+
+// Description of a guest texture, decoded from a texture fetch constant by
+// Prepare().
+struct TextureInfo {
+  uint32_t swizzle;
+  Dimension dimension;
+  // Raw size fields copied from the fetch constant (the 2D size calculation
+  // adds one to them to get the logical size).
+  uint32_t width;
+  uint32_t height;
+  uint32_t depth;
+  // Edge length of one block in texels: 4 for the DXT formats, 1 otherwise.
+  uint32_t block_size;
+  // Size of one block in bytes.
+  uint32_t texel_pitch;
+  xenos::Endian endianness;
+  bool is_tiled;
+  bool is_compressed;
+  // Length of the source data in bytes; for 2D this is
+  // block_height * logical_pitch.
+  uint32_t input_length;
+
+  TextureFormat format;
+
+  // Per-dimension size information; only the member matching `dimension` is
+  // filled in.
+  union {
+    struct {
+      uint32_t width;
+    } size_1d;
+    struct {
+      // Size in texels (fetch constant value plus one).
+      uint32_t logical_width;
+      uint32_t logical_height;
+      // Logical size divided by block_size.
+      uint32_t block_width;
+      uint32_t block_height;
+      // Logical size rounded up to the padding the source data uses
+      // (multiples of 128 when compressed, otherwise 32 or more).
+      uint32_t input_width;
+      uint32_t input_height;
+      // Size of the host texture: the logical size, or the next power of two
+      // when compressed.
+      uint32_t output_width;
+      uint32_t output_height;
+      // Row pitches in bytes for logical_width and input_width.
+      uint32_t logical_pitch;
+      uint32_t input_pitch;
+    } size_2d;
+    // Placeholders: Prepare() does not calculate 3D or cube sizes yet.
+    struct {
+    } size_3d;
+    struct {
+    } size_cube;
+  };
+
+  // Fills out_info from fetch; returns false for formats or dimensions that
+  // are not handled.
+  static bool Prepare(const xenos::xe_gpu_texture_fetch_t& fetch,
+                      TextureInfo* out_info);
+
+  // Tiled addressing helpers: Outer yields the per-row base offset that Inner
+  // takes as base_offset for each x in that row.
+  static uint32_t TiledOffset2DOuter(uint32_t y, uint32_t width,
+                                     uint32_t log_bpp);
+  static uint32_t TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp,
+                                     uint32_t base_offset);
+
+ private:
+  void CalculateTextureSizes1D(const xenos::xe_gpu_texture_fetch_t& fetch);
+  void CalculateTextureSizes2D(const xenos::xe_gpu_texture_fetch_t& fetch);
+};
+
+}  // namespace gpu
+}  // namespace xe
+
+#endif  // XENIA_GPU_TEXTURE_INFO_H_
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@ -35,6 +35,13 @@ enum class PrimitiveType : uint32_t {
  kQuadList = 0x0D,
 };

+// Texture dimensionality. The numeric values are significant:
+// TextureInfo::Prepare casts a fetch constant's dimension field to this type.
+enum class Dimension : uint32_t {
+  k1D = 0,
+  k2D = 1,
+  k3D = 2,
+  kCube = 3,
+};
+
 namespace xenos {

 typedef enum {