diff --git a/src/poly/memory.cc b/src/poly/memory.cc
index 585e38a82..7514663d4 100644
--- a/src/poly/memory.cc
+++ b/src/poly/memory.cc
@@ -36,69 +36,39 @@ size_t page_size() {
 // http://gnuradio.org/redmine/projects/gnuradio/repository/revisions/f2bc76cc65ffba51a141950f98e75364e49df874/entry/volk/kernels/volk/volk_32u_byteswap.h
 // http://gnuradio.org/redmine/projects/gnuradio/repository/revisions/2c4c371885c31222362f70a1cd714415d1398021/entry/volk/kernels/volk/volk_64u_byteswap.h

-void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src, size_t count,
-                              uint16_t* out_max_value) {
-  return copy_and_swap_16_unaligned(dest, src, count, out_max_value);
+void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src,
+                              size_t count) {
+  return copy_and_swap_16_unaligned(dest, src, count);
 }

 void copy_and_swap_16_unaligned(uint16_t* dest, const uint16_t* src,
-                                size_t count, uint16_t* out_max_value) {
-  if (out_max_value) {
-    uint16_t max_value = 0;
-    for (size_t i = 0; i < count; ++i) {
-      uint16_t value = byte_swap(src[i]);
-      max_value = std::max(max_value, value);
-      dest[i] = value;
-    }
-    *out_max_value = max_value;
-  } else {
-    for (size_t i = 0; i < count; ++i) {
-      dest[i] = byte_swap(src[i]);
-    }
+                                size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    dest[i] = byte_swap(src[i]);
   }
 }

-void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src, size_t count,
-                              uint32_t* out_max_value) {
-  return copy_and_swap_32_unaligned(dest, src, count, out_max_value);
+void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src,
+                              size_t count) {
+  return copy_and_swap_32_unaligned(dest, src, count);
 }

 void copy_and_swap_32_unaligned(uint32_t* dest, const uint32_t* src,
-                                size_t count, uint32_t* out_max_value) {
-  if (out_max_value) {
-    uint32_t max_value = 0;
-    for (size_t i = 0; i < count; ++i) {
-      uint32_t value = byte_swap(src[i]);
-      max_value = std::max(max_value, value);
-      dest[i] = value;
-    }
-    *out_max_value = max_value;
-  } else {
-    for (size_t i = 0; i < count; ++i) {
-      dest[i] = byte_swap(src[i]);
-    }
+                                size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    dest[i] = byte_swap(src[i]);
   }
 }

-void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src, size_t count,
-                              uint64_t* out_max_value) {
-  return copy_and_swap_64_unaligned(dest, src, count, out_max_value);
+void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src,
+                              size_t count) {
+  return copy_and_swap_64_unaligned(dest, src, count);
 }

 void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
-                                size_t count, uint64_t* out_max_value) {
-  if (out_max_value) {
-    uint64_t max_value = 0;
-    for (size_t i = 0; i < count; ++i) {
-      uint64_t value = byte_swap(src[i]);
-      max_value = std::max(max_value, value);
-      dest[i] = value;
-    }
-    *out_max_value = max_value;
-  } else {
-    for (size_t i = 0; i < count; ++i) {
-      dest[i] = byte_swap(src[i]);
-    }
+                                size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    dest[i] = byte_swap(src[i]);
   }
 }

diff --git a/src/poly/memory.h b/src/poly/memory.h
index 8214b6c5c..be3e643ea 100644
--- a/src/poly/memory.h
+++ b/src/poly/memory.h
@@ -29,21 +29,18 @@ size_t hash_combine(size_t seed, const T& v, const Ts&... vs) {
 size_t page_size();

-void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src, size_t count,
-                              uint16_t* out_max_value = nullptr);
+void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src,
+                              size_t count);
 void copy_and_swap_16_unaligned(uint16_t* dest, const uint16_t* src,
-                                size_t count,
-                                uint16_t* out_max_value = nullptr);
-void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src, size_t count,
-                              uint32_t* out_max_value = nullptr);
+                                size_t count);
+void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src,
+                              size_t count);
 void copy_and_swap_32_unaligned(uint32_t* dest, const uint32_t* src,
-                                size_t count,
-                                uint32_t* out_max_value = nullptr);
-void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src, size_t count,
-                              uint64_t* out_max_value = nullptr);
+                                size_t count);
+void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src,
+                              size_t count);
 void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
-                                size_t count,
-                                uint64_t* out_max_value = nullptr);
+                                size_t count);

 template <typename T>
 void copy_and_swap(T* dest, const T* src, size_t count) {
diff --git a/src/xenia/gpu/gl4/circular_buffer.cc b/src/xenia/gpu/gl4/circular_buffer.cc
index bfd7cbf2d..538b71e82 100644
--- a/src/xenia/gpu/gl4/circular_buffer.cc
+++ b/src/xenia/gpu/gl4/circular_buffer.cc
@@ -86,10 +86,32 @@ CircularBuffer::Allocation CircularBuffer::Acquire(size_t length) {
   allocation.offset = write_head_;
   allocation.length = length;
   allocation.aligned_length = aligned_length;
+  allocation.cache_key = 0;
   write_head_ += aligned_length;
   return allocation;
 }

+bool CircularBuffer::AcquireCached(uint32_t key, size_t length,
+                                   Allocation* out_allocation) {
+  uint64_t full_key = key | (length << 32);
+  auto it = allocation_cache_.find(full_key);
+  if (it != allocation_cache_.end()) {
+    uintptr_t write_head = it->second;
+    size_t aligned_length = poly::round_up(length, alignment_);
+    out_allocation->host_ptr = host_base_ + write_head;
+    out_allocation->gpu_ptr = gpu_base_ + write_head;
+    out_allocation->offset = write_head;
+    out_allocation->length = length;
+    out_allocation->aligned_length = aligned_length;
+    out_allocation->cache_key = full_key;
+    return true;
+  } else {
+    *out_allocation = Acquire(length);
+    out_allocation->cache_key = full_key;
+    return false;
+  }
+}
+
 void CircularBuffer::Discard(Allocation allocation) {
   write_head_ -= allocation.aligned_length;
 }
@@ -100,6 +122,9 @@ void CircularBuffer::Commit(Allocation allocation) {
   dirty_start_ = std::min(dirty_start_, start);
   dirty_end_ = std::max(dirty_end_, end);
   assert_true(dirty_end_ <= capacity_);
+  if (allocation.cache_key) {
+    allocation_cache_.insert({allocation.cache_key, allocation.offset});
+  }
 }

 void CircularBuffer::Flush() {
@@ -112,10 +137,13 @@ void CircularBuffer::Flush() {
   dirty_end_ = 0;
 }

+void CircularBuffer::ClearCache() { allocation_cache_.clear(); }
+
 void CircularBuffer::WaitUntilClean() {
   Flush();
   glFinish();
   write_head_ = 0;
+  ClearCache();
 }

 }  // namespace gl4
diff --git a/src/xenia/gpu/gl4/circular_buffer.h b/src/xenia/gpu/gl4/circular_buffer.h
index 7a0232693..da1ebd788 100644
--- a/src/xenia/gpu/gl4/circular_buffer.h
+++ b/src/xenia/gpu/gl4/circular_buffer.h
@@ -10,6 +10,8 @@
 #ifndef XENIA_GPU_GL4_CIRCULAR_BUFFER_H_
 #define XENIA_GPU_GL4_CIRCULAR_BUFFER_H_

+#include <unordered_map>
+
 #include "xenia/gpu/gl4/gl_context.h"

 namespace xe {
@@ -29,6 +31,7 @@ class CircularBuffer {
     size_t offset;
     size_t length;
     size_t aligned_length;
+    uint64_t cache_key;  // 0 if caching disabled.
   };

   bool Initialize();
@@ -40,9 +43,11 @@ class CircularBuffer {

   bool CanAcquire(size_t length);
   Allocation Acquire(size_t length);
+  bool AcquireCached(uint32_t key, size_t length, Allocation* out_allocation);
   void Discard(Allocation allocation);
   void Commit(Allocation allocation);
   void Flush();
+  void ClearCache();

   void WaitUntilClean();

@@ -55,6 +60,8 @@ class CircularBuffer {
   GLuint buffer_;
   GLuint64 gpu_base_;
   uint8_t* host_base_;
+
+  std::unordered_map<uint64_t, uintptr_t> allocation_cache_;
 };

 }  // namespace gl4
diff --git a/src/xenia/gpu/gl4/command_processor.cc b/src/xenia/gpu/gl4/command_processor.cc
index 8e3a5eed1..fd50eee70 100644
--- a/src/xenia/gpu/gl4/command_processor.cc
+++ b/src/xenia/gpu/gl4/command_processor.cc
@@ -524,6 +524,8 @@ void CommandProcessor::MakeCoherent() {
   // Mark coherent.
   status_host &= ~0x80000000ul;
   regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32 = status_host;
+
+  scratch_buffer_.ClearCache();
 }

 void CommandProcessor::PrepareForWait() {
@@ -1431,8 +1433,6 @@ bool CommandProcessor::ExecutePacketType3_INVALIDATE_STATE(
 bool CommandProcessor::LoadShader(ShaderType shader_type,
                                   const uint32_t* address,
                                   uint32_t dword_count) {
-  SCOPE_profile_cpu_f("gpu");
-
   // Hash the input memory and lookup the shader.
   GL4Shader* shader_ptr = nullptr;
   uint64_t hash = XXH64(address, dword_count * sizeof(uint32_t), 0);
@@ -2288,30 +2288,29 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateIndexBuffer() {
   assert_true(info.endianness == Endian::k8in16 ||
               info.endianness == Endian::k8in32);

+  trace_writer_.WriteMemoryRead(info.guest_base, info.length);
+
   size_t total_size =
       info.count * (info.format == IndexFormat::kInt32 ? sizeof(uint32_t)
                                                        : sizeof(uint16_t));
-  auto allocation = scratch_buffer_.Acquire(total_size);
-
-  trace_writer_.WriteMemoryRead(info.guest_base, info.length);
-
-  if (info.format == IndexFormat::kInt32) {
-    auto dest = reinterpret_cast<uint32_t*>(allocation.host_ptr);
-    auto src = reinterpret_cast<const uint32_t*>(membase_ + info.guest_base);
-    uint32_t max_index_found;
-    poly::copy_and_swap_32_aligned(dest, src, info.count, &max_index_found);
-    index_buffer_info_.max_index_found = max_index_found;
+  CircularBuffer::Allocation allocation;
+  if (!scratch_buffer_.AcquireCached(info.guest_base, total_size,
+                                     &allocation)) {
+    if (info.format == IndexFormat::kInt32) {
+      auto dest = reinterpret_cast<uint32_t*>(allocation.host_ptr);
+      auto src = reinterpret_cast<const uint32_t*>(membase_ + info.guest_base);
+      poly::copy_and_swap_32_aligned(dest, src, info.count);
+    } else {
+      auto dest = reinterpret_cast<uint16_t*>(allocation.host_ptr);
+      auto src = reinterpret_cast<const uint16_t*>(membase_ + info.guest_base);
+      poly::copy_and_swap_16_aligned(dest, src, info.count);
+    }
+    draw_batcher_.set_index_buffer(allocation);
+    scratch_buffer_.Commit(std::move(allocation));
   } else {
-    auto dest = reinterpret_cast<uint16_t*>(allocation.host_ptr);
-    auto src = reinterpret_cast<const uint16_t*>(membase_ + info.guest_base);
-    uint16_t max_index_found;
-    poly::copy_and_swap_16_aligned(dest, src, info.count, &max_index_found);
-    index_buffer_info_.max_index_found = max_index_found;
+    draw_batcher_.set_index_buffer(allocation);
   }
-  draw_batcher_.set_index_buffer(allocation);
-
-  scratch_buffer_.Commit(std::move(allocation));
-
   return UpdateStatus::kCompatible;
 }

@@ -2344,44 +2343,56 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateVertexBuffers() {
     }
     assert_true(fetch->endian == 2);

-    // Constrain the vertex upload to just what we are interested in.
-    const size_t kRangeKludge = 5;  // could pick index count based on prim.
-    uint32_t max_index = index_buffer_info_.guest_base
-                             ? index_buffer_info_.max_index_found
-                             : draw_index_count_;
-    size_t valid_range = (max_index + kRangeKludge) * desc.stride_words * 4;
-    valid_range = std::min(valid_range, size_t(fetch->size * 4));
-
-    auto allocation = scratch_buffer_.Acquire(valid_range);
+    size_t valid_range = size_t(fetch->size * 4);

     trace_writer_.WriteMemoryRead(fetch->address << 2, valid_range);

-    // Copy and byte swap the entire buffer.
-    // We could be smart about this to save GPU bandwidth by building a CRC
-    // as we copy and only if it differs from the previous value committing
-    // it (and if it matches just discard and reuse).
-    poly::copy_and_swap_32_aligned(
-        reinterpret_cast<uint32_t*>(allocation.host_ptr),
-        reinterpret_cast<const uint32_t*>(membase_ + (fetch->address << 2)),
-        valid_range / 4);
+    CircularBuffer::Allocation allocation;
+    if (!scratch_buffer_.AcquireCached(fetch->address << 2, valid_range,
+                                       &allocation)) {
+      // Copy and byte swap the entire buffer.
+      // We could be smart about this to save GPU bandwidth by building a CRC
+      // as we copy and only if it differs from the previous value committing
+      // it (and if it matches just discard and reuse).
+      poly::copy_and_swap_32_aligned(
+          reinterpret_cast<uint32_t*>(allocation.host_ptr),
+          reinterpret_cast<const uint32_t*>(membase_ + (fetch->address << 2)),
+          valid_range / 4);

-    if (!has_bindless_vbos_) {
-      // TODO(benvanik): if we could find a way to avoid this, we could use
-      // multidraw without flushing.
-      glVertexArrayVertexBuffer(active_vertex_shader_->vao(), buffer_index,
-                                scratch_buffer_.handle(), allocation.offset,
-                                desc.stride_words * 4);
-    }
+      if (!has_bindless_vbos_) {
+        // TODO(benvanik): if we could find a way to avoid this, we could use
+        // multidraw without flushing.
+        glVertexArrayVertexBuffer(active_vertex_shader_->vao(), buffer_index,
+                                  scratch_buffer_.handle(), allocation.offset,
+                                  desc.stride_words * 4);
+      }

-    if (has_bindless_vbos_) {
-      for (uint32_t i = 0; i < desc.element_count; ++i, ++el_index) {
-        const auto& el = desc.elements[i];
-        draw_batcher_.set_vertex_buffer(el_index, 0, desc.stride_words * 4,
-                                        allocation);
+      if (has_bindless_vbos_) {
+        for (uint32_t i = 0; i < desc.element_count; ++i, ++el_index) {
+          const auto& el = desc.elements[i];
+          draw_batcher_.set_vertex_buffer(el_index, 0, desc.stride_words * 4,
+                                          allocation);
+        }
+      }
+
+      scratch_buffer_.Commit(std::move(allocation));
+    } else {
+      if (!has_bindless_vbos_) {
+        // TODO(benvanik): if we could find a way to avoid this, we could use
+        // multidraw without flushing.
+        glVertexArrayVertexBuffer(active_vertex_shader_->vao(), buffer_index,
+                                  scratch_buffer_.handle(), allocation.offset,
+                                  desc.stride_words * 4);
+      }
+
+      if (has_bindless_vbos_) {
+        for (uint32_t i = 0; i < desc.element_count; ++i, ++el_index) {
+          const auto& el = desc.elements[i];
+          draw_batcher_.set_vertex_buffer(el_index, 0, desc.stride_words * 4,
+                                          allocation);
+        }
       }
     }
-
-    scratch_buffer_.Commit(std::move(allocation));
   }

   return UpdateStatus::kCompatible;
diff --git a/src/xenia/gpu/gl4/command_processor.h b/src/xenia/gpu/gl4/command_processor.h
index ffe7bd3cd..1d8aa4142 100644
--- a/src/xenia/gpu/gl4/command_processor.h
+++ b/src/xenia/gpu/gl4/command_processor.h
@@ -277,7 +277,6 @@ class CommandProcessor {
     uint32_t count;
     uint32_t guest_base;
     size_t length;
-    uint32_t max_index_found;
   } index_buffer_info_;
   uint32_t draw_index_count_;
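
Note on the caching scheme: AcquireCached() packs the 32-bit guest address into the low half of a 64-bit key and the upload length into the high half, so a range re-uploaded unchanged between cache flushes (MakeCoherent() and WaitUntilClean() both clear the cache) reuses the scratch-buffer region committed for it earlier instead of copying and byte-swapping again, while the same address with a different length produces a distinct key and falls back to a fresh Acquire(). A minimal standalone sketch of just the key packing and hit/miss behavior (hypothetical make_cache_key helper, assuming a 64-bit size_t; not the CircularBuffer implementation itself):

#include <cstdint>
#include <unordered_map>

// Mirrors the full_key computation in CircularBuffer::AcquireCached():
// guest address in the low 32 bits, upload length in the high 32 bits.
uint64_t make_cache_key(uint32_t guest_address, size_t length) {
  return uint64_t(guest_address) | (uint64_t(length) << 32);
}

int main() {
  std::unordered_map<uint64_t, uintptr_t> allocation_cache;

  // First upload of a 16KB buffer: miss; the caller copies/swaps into the
  // scratch buffer and Commit() records the offset under the key.
  uint64_t key = make_cache_key(0x8012F000u, 0x4000);
  allocation_cache.insert({key, uintptr_t(0)});

  // Same address and length again: hit; the swapped bytes are reused as-is.
  bool hit = allocation_cache.count(key) != 0;

  // Same address, different length: distinct key, forcing a fresh copy.
  bool miss = allocation_cache.count(make_cache_key(0x8012F000u, 0x2000)) == 0;

  return (hit && miss) ? 0 : 1;
}

Dropping max_index_found in favor of uploading the full fetch range is what makes the key stable: valid_range is now fetch->size * 4 alone, so it no longer varies with the indices seen in the previous draw.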