From 130c11a2ca8e34f0766768c30a20ce4d8c0a77ba Mon Sep 17 00:00:00 2001
From: Ben Vanik <ben.vanik@gmail.com>
Date: Fri, 20 Feb 2015 07:47:06 -0800
Subject: [PATCH] GPU recording (--trace_gpu=file) and playback
 (gpu-trace-viewer file).

---
 src/poly/mapped_memory.h                      |   4 +-
 src/poly/mapped_memory_win.cc                 |   4 +-
 src/xenia/emulator.cc                         |   5 +-
 src/xenia/gpu/gl4/command_processor.cc        | 315 ++++++++----------
 src/xenia/gpu/gl4/command_processor.h         |  82 +++--
 src/xenia/gpu/gl4/gl4_gpu.cc                  |   4 +-
 src/xenia/gpu/gl4/gl4_gpu.h                   |   2 +-
 src/xenia/gpu/gl4/gl4_graphics_system.cc      | 127 ++++++-
 src/xenia/gpu/gl4/gl4_graphics_system.h       |   8 +-
 src/xenia/gpu/gpu-private.h                   |   3 +-
 src/xenia/gpu/gpu.cc                          |   9 +-
 src/xenia/gpu/gpu.h                           |   4 +-
 src/xenia/gpu/graphics_system.cc              |  20 +-
 src/xenia/gpu/graphics_system.h               |  28 +-
 src/xenia/gpu/sources.gypi                    |   1 +
 src/xenia/gpu/trace_viewer_main.cc            |  73 ++++
 src/xenia/gpu/tracing.h                       | 211 ++++++++++++
 src/xenia/gpu/xenos.h                         |   9 -
 .../kernel/fs/devices/disc_image_device.cc    |   3 +-
 .../kernel/fs/devices/host_path_entry.cc      |   4 +-
 .../fs/devices/stfs_container_device.cc       |   3 +-
 src/xenia/xenia_main.cc                       |  10 +-
 xenia.gyp                                     |  23 ++
 23 files changed, 667 insertions(+), 285 deletions(-)
 create mode 100644 src/xenia/gpu/trace_viewer_main.cc
 create mode 100644 src/xenia/gpu/tracing.h

diff --git a/src/poly/mapped_memory.h b/src/poly/mapped_memory.h
index 0d1ffffd2..9db211e4f 100644
--- a/src/poly/mapped_memory.h
+++ b/src/poly/mapped_memory.h
@@ -18,8 +18,8 @@ namespace poly {
 class MappedMemory {
  public:
   enum class Mode {
-    READ,
-    READ_WRITE,
+    kRead,
+    kReadWrite,
   };
 
   virtual ~MappedMemory() = default;
diff --git a/src/poly/mapped_memory_win.cc b/src/poly/mapped_memory_win.cc
index 98162567c..83320fd12 100644
--- a/src/poly/mapped_memory_win.cc
+++ b/src/poly/mapped_memory_win.cc
@@ -45,14 +45,14 @@ std::unique_ptr<MappedMemory> MappedMemory::Open(const std::wstring& path,
   DWORD mapping_protect = 0;
   DWORD view_access = 0;
   switch (mode) {
-    case Mode::READ:
+    case Mode::kRead:
       file_access |= GENERIC_READ;
       file_share |= FILE_SHARE_READ;
       create_mode |= OPEN_EXISTING;
       mapping_protect |= PAGE_READONLY;
       view_access |= FILE_MAP_READ;
       break;
-    case Mode::READ_WRITE:
+    case Mode::kReadWrite:
       file_access |= GENERIC_READ | GENERIC_WRITE;
       file_share |= 0;
       create_mode |= OPEN_EXISTING;
diff --git a/src/xenia/emulator.cc b/src/xenia/emulator.cc
index df4960e39..f7011c15b 100644
--- a/src/xenia/emulator.cc
+++ b/src/xenia/emulator.cc
@@ -102,7 +102,7 @@ X_STATUS Emulator::Setup() {
   }
 
   // Initialize the GPU.
-  graphics_system_ = std::move(xe::gpu::Create(this));
+  graphics_system_ = xe::gpu::Create();
   if (!graphics_system_) {
     return X_STATUS_NOT_IMPLEMENTED;
   }
@@ -122,7 +122,8 @@ X_STATUS Emulator::Setup() {
   if (result) {
     return result;
   }
-  result = graphics_system_->Setup();
+  result = graphics_system_->Setup(processor_.get(), main_window_->loop(),
+                                   main_window_.get());
   if (result) {
     return result;
   }
diff --git a/src/xenia/gpu/gl4/command_processor.cc b/src/xenia/gpu/gl4/command_processor.cc
index 4eebfc138..47b9e83e6 100644
--- a/src/xenia/gpu/gl4/command_processor.cc
+++ b/src/xenia/gpu/gl4/command_processor.cc
@@ -22,9 +22,6 @@
 
 #include "third_party/xxhash/xxhash.h"
 
-#define XETRACECP(fmt, ...) \
-  if (FLAGS_trace_ring_buffer) XELOGGPU(fmt, ##__VA_ARGS__)
-
 #define FINE_GRAINED_DRAW_SCOPES 1
 
 namespace xe {
@@ -56,6 +53,7 @@ CommandProcessor::CommandProcessor(GL4GraphicsSystem* graphics_system)
       membase_(graphics_system->memory()->membase()),
       graphics_system_(graphics_system),
       register_file_(graphics_system_->register_file()),
+      trace_writer_(graphics_system->memory()->membase()),
       worker_running_(true),
       time_base_(0),
       counter_(0),
@@ -94,6 +92,8 @@ uint64_t CommandProcessor::QueryTime() {
 bool CommandProcessor::Initialize(std::unique_ptr<GLContext> context) {
   context_ = std::move(context);
 
+  pending_fn_event_ = CreateEvent(nullptr, TRUE, FALSE, nullptr);
+
   worker_running_ = true;
   worker_thread_ = std::thread([this]() {
     poly::threading::set_name("GL4 Worker");
@@ -106,6 +106,8 @@ bool CommandProcessor::Initialize(std::unique_ptr<GLContext> context) {
 }
 
 void CommandProcessor::Shutdown() {
+  EndTracing();
+
   worker_running_ = false;
   SetEvent(write_ptr_index_event_);
   worker_thread_.join();
@@ -115,6 +117,22 @@ void CommandProcessor::Shutdown() {
   shader_cache_.clear();
 
   context_.reset();
+
+  CloseHandle(pending_fn_event_);
+}
+
+void CommandProcessor::BeginTracing(const std::wstring& root_path) {
+  std::wstring path = poly::join_paths(root_path, L"gpu_trace");
+  trace_writer_.Open(path);
+}
+
+void CommandProcessor::EndTracing() { trace_writer_.Close(); }
+
+void CommandProcessor::CallInThread(std::function<void()> fn) {
+  assert_null(pending_fn_);
+  pending_fn_ = std::move(fn);
+  WaitForSingleObject(pending_fn_event_, INFINITE);
+  ResetEvent(pending_fn_event_);
 }
 
 void CommandProcessor::WorkerMain() {
@@ -125,6 +143,13 @@ void CommandProcessor::WorkerMain() {
   }
 
   while (worker_running_) {
+    if (pending_fn_) {
+      auto fn = std::move(pending_fn_);
+      pending_fn_ = nullptr;
+      fn();
+      SetEvent(pending_fn_event_);
+    }
+
     uint32_t write_ptr_index = write_ptr_index_.load();
     if (write_ptr_index == 0xBAADF00D || read_ptr_index_ == write_ptr_index) {
       SCOPE_profile_cpu_i("gpu", "xe::gpu::gl4::CommandProcessor::Stall");
@@ -140,15 +165,15 @@ void CommandProcessor::WorkerMain() {
         SwitchToThread();
         MemoryBarrier();
         write_ptr_index = write_ptr_index_.load();
-      } while (write_ptr_index == 0xBAADF00D ||
-               read_ptr_index_ == write_ptr_index);
+      } while (!pending_fn_ && (write_ptr_index == 0xBAADF00D ||
+                                read_ptr_index_ == write_ptr_index));
       // ReturnFromWait();
+      if (pending_fn_) {
+        continue;
+      }
     }
     assert_true(read_ptr_index_ != write_ptr_index);
 
-    // Process the new commands.
-    XETRACECP("Command processor thread work");
-
     // Execute. Note that we handle wraparound transparently.
     ExecutePrimaryBuffer(read_ptr_index_, write_ptr_index);
     read_ptr_index_ = write_ptr_index;
@@ -378,8 +403,7 @@ void CommandProcessor::UpdateWritePointer(uint32_t value) {
   SetEvent(write_ptr_index_event_);
 }
 
-void CommandProcessor::WriteRegister(uint32_t packet_ptr, uint32_t index,
-                                     uint32_t value) {
+void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
   RegisterFile* regs = register_file_;
   assert_true(index < RegisterFile::kRegisterCount);
   regs->values[index].u32 = value;
@@ -398,8 +422,8 @@ void CommandProcessor::WriteRegister(uint32_t packet_ptr, uint32_t index,
       // Enabled - write to address.
       uint32_t scratch_addr = regs->values[XE_GPU_REG_SCRATCH_ADDR].u32;
       uint32_t mem_addr = scratch_addr + (scratch_reg * 4);
-      poly::store_and_swap<uint32_t>(
-          membase_ + xenos::GpuToCpu(primary_buffer_ptr_, mem_addr), value);
+      poly::store_and_swap<uint32_t>(membase_ + xenos::GpuToCpu(mem_addr),
+                                     value);
     }
   }
 }
@@ -426,8 +450,8 @@ void CommandProcessor::MakeCoherent() {
   }
 
   // TODO(benvanik): notify resource cache of base->size and type.
-  XETRACECP("Make %.8X -> %.8X (%db) coherent", base_host,
-            base_host + size_host, size_host);
+  // XELOGD("Make %.8X -> %.8X (%db) coherent", base_host, base_host +
+  // size_host, size_host);
 
   // Mark coherent.
   status_host &= ~0x80000000ul;
@@ -437,6 +461,8 @@ void CommandProcessor::MakeCoherent() {
 void CommandProcessor::PrepareForWait() {
   SCOPE_profile_cpu_f("gpu");
 
+  trace_writer_.Flush();
+
   // TODO(benvanik): fences and fancy stuff. We should figure out a way to
   // make interrupt callbacks from the GPU so that we don't have to do a full
   // synchronize here.
@@ -494,14 +520,6 @@ class CommandProcessor::RingbufferReader {
 
   void Skip(uint32_t words) { Advance(words); }
 
-  void TraceData(uint32_t words) {
-    for (uint32_t i = 0; i < words; ++i) {
-      uint32_t i_ptr = ptr_ + i * sizeof(uint32_t);
-      XETRACECP("[%.8X]   %.8X", i_ptr,
-                poly::load_and_swap<uint32_t>(membase_ + i_ptr));
-    }
-  }
-
  private:
   uint8_t* membase_;
 
@@ -523,8 +541,7 @@ void CommandProcessor::ExecutePrimaryBuffer(uint32_t start_index,
   uint32_t end_ptr = primary_buffer_ptr_ + end_index * sizeof(uint32_t);
   end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF);
 
-  XETRACECP("[%.8X] ExecutePrimaryBuffer(%dw -> %dw)", start_ptr, start_index,
-            end_index);
+  trace_writer_.WritePrimaryBufferStart(start_ptr, end_index - start_index);
 
   // Execute commands!
   uint32_t ptr_mask = (primary_buffer_size_ / sizeof(uint32_t)) - 1;
@@ -537,13 +554,13 @@ void CommandProcessor::ExecutePrimaryBuffer(uint32_t start_index,
     assert_true(reader.offset() == (end_index - start_index));
   }
 
-  XETRACECP("           ExecutePrimaryBuffer End");
+  trace_writer_.WritePrimaryBufferEnd();
 }
 
 void CommandProcessor::ExecuteIndirectBuffer(uint32_t ptr, uint32_t length) {
   SCOPE_profile_cpu_f("gpu");
 
-  XETRACECP("[%.8X] ExecuteIndirectBuffer(%dw)", ptr, length);
+  trace_writer_.WriteIndirectBufferStart(ptr, length);  // length is in words.
 
   // Execute commands!
   uint32_t ptr_mask = 0;
@@ -553,29 +570,38 @@ void CommandProcessor::ExecuteIndirectBuffer(uint32_t ptr, uint32_t length) {
     ExecutePacket(&reader);
   }
 
-  XETRACECP("           ExecuteIndirectBuffer End");
+  trace_writer_.WriteIndirectBufferEnd();
+}
+
+void CommandProcessor::ExecutePacket(uint32_t ptr, uint32_t count) {
+  uint32_t ptr_mask = 0;
+  RingbufferReader reader(membase_, primary_buffer_ptr_, ptr_mask, ptr,
+                          ptr + count * sizeof(uint32_t));
+  while (reader.can_read()) {
+    ExecutePacket(&reader);
+  }
 }
 
 bool CommandProcessor::ExecutePacket(RingbufferReader* reader) {
   RegisterFile* regs = register_file_;
 
-  uint32_t packet_ptr = reader->ptr();
   const uint32_t packet = reader->Read();
   const uint32_t packet_type = packet >> 30;
   if (packet == 0) {
-    XETRACECP("[%.8X] Packet(%.8X): 0?", packet_ptr, packet);
+    trace_writer_.WritePacketStart(reader->ptr() - 4, 1);
+    trace_writer_.WritePacketEnd();
     return true;
   }
 
   switch (packet_type) {
     case 0x00:
-      return ExecutePacketType0(reader, packet_ptr, packet);
+      return ExecutePacketType0(reader, packet);
     case 0x01:
-      return ExecutePacketType1(reader, packet_ptr, packet);
+      return ExecutePacketType1(reader, packet);
     case 0x02:
-      return ExecutePacketType2(reader, packet_ptr, packet);
+      return ExecutePacketType2(reader, packet);
     case 0x03:
-      return ExecutePacketType3(reader, packet_ptr, packet);
+      return ExecutePacketType3(reader, packet);
     default:
       assert_unhandled_case(packet_type);
       return false;
@@ -583,75 +609,66 @@ bool CommandProcessor::ExecutePacket(RingbufferReader* reader) {
 }
 
 bool CommandProcessor::ExecutePacketType0(RingbufferReader* reader,
-                                          uint32_t packet_ptr,
                                           uint32_t packet) {
   // Type-0 packet.
   // Write count registers in sequence to the registers starting at
   // (base_index << 2).
-  XETRACECP("[%.8X] Packet(%.8X): set registers:", packet_ptr, packet);
+
   uint32_t count = ((packet >> 16) & 0x3FFF) + 1;
+  trace_writer_.WritePacketStart(reader->ptr() - 4, 1 + count);
+
   uint32_t base_index = (packet & 0x7FFF);
   uint32_t write_one_reg = (packet >> 15) & 0x1;
   for (uint32_t m = 0; m < count; m++) {
-    uint32_t reg_data = reader->Peek();
+    uint32_t reg_data = reader->Read();
     uint32_t target_index = write_one_reg ? base_index : base_index + m;
-    const char* reg_name = register_file_->GetRegisterName(target_index);
-    XETRACECP("[%.8X]   %.8X -> %.4X %s", reader->ptr(), reg_data, target_index,
-              reg_name ? reg_name : "");
-    reader->Advance(1);
-    WriteRegister(packet_ptr, target_index, reg_data);
+    WriteRegister(target_index, reg_data);
   }
+
+  trace_writer_.WritePacketEnd();
   return true;
 }
 
 bool CommandProcessor::ExecutePacketType1(RingbufferReader* reader,
-                                          uint32_t packet_ptr,
                                           uint32_t packet) {
   // Type-1 packet.
   // Contains two registers of data. Type-0 should be more common.
-  XETRACECP("[%.8X] Packet(%.8X): set registers:", packet_ptr, packet);
+  trace_writer_.WritePacketStart(reader->ptr() - 4, 3);
   uint32_t reg_index_1 = packet & 0x7FF;
   uint32_t reg_index_2 = (packet >> 11) & 0x7FF;
-  uint32_t reg_ptr_1 = reader->ptr();
   uint32_t reg_data_1 = reader->Read();
-  uint32_t reg_ptr_2 = reader->ptr();
   uint32_t reg_data_2 = reader->Read();
-  const char* reg_name_1 = register_file_->GetRegisterName(reg_index_1);
-  const char* reg_name_2 = register_file_->GetRegisterName(reg_index_2);
-  XETRACECP("[%.8X]   %.8X -> %.4X %s", reg_ptr_1, reg_data_1, reg_index_1,
-            reg_name_1 ? reg_name_1 : "");
-  XETRACECP("[%.8X]   %.8X -> %.4X %s", reg_ptr_2, reg_data_2, reg_index_2,
-            reg_name_2 ? reg_name_2 : "");
-  WriteRegister(packet_ptr, reg_index_1, reg_data_1);
-  WriteRegister(packet_ptr, reg_index_2, reg_data_2);
+  WriteRegister(reg_index_1, reg_data_1);
+  WriteRegister(reg_index_2, reg_data_2);
+  trace_writer_.WritePacketEnd();
   return true;
 }
 
 bool CommandProcessor::ExecutePacketType2(RingbufferReader* reader,
-                                          uint32_t packet_ptr,
                                           uint32_t packet) {
   // Type-2 packet.
   // No-op. Do nothing.
-  XETRACECP("[%.8X] Packet(%.8X): padding", packet_ptr, packet);
+  trace_writer_.WritePacketStart(reader->ptr() - 4, 1);
+  trace_writer_.WritePacketEnd();
   return true;
 }
 
 bool CommandProcessor::ExecutePacketType3(RingbufferReader* reader,
-                                          uint32_t packet_ptr,
                                           uint32_t packet) {
   // Type-3 packet.
   uint32_t opcode = (packet >> 8) & 0x7F;
   uint32_t count = ((packet >> 16) & 0x3FFF) + 1;
   auto data_start_offset = reader->offset();
 
+  trace_writer_.WritePacketStart(reader->ptr() - 4, 1 + count);
+
   // & 1 == predicate - when set, we do bin check to see if we should execute
   // the packet. Only type 3 packets are affected.
   if (packet & 1) {
     bool any_pass = (bin_select_ & bin_mask_) != 0;
     if (!any_pass) {
-      XETRACECP("[%.8X] Packet(%.8X): SKIPPED (predicate fail)", packet_ptr,
-                packet);
       reader->Skip(count);
+      trace_writer_.WritePacketEnd();
       return true;
     }
   }
@@ -659,96 +676,78 @@ bool CommandProcessor::ExecutePacketType3(RingbufferReader* reader,
   bool result = false;
   switch (opcode) {
     case PM4_ME_INIT:
-      result = ExecutePacketType3_ME_INIT(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_ME_INIT(reader, packet, count);
       break;
     case PM4_NOP:
-      result = ExecutePacketType3_NOP(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_NOP(reader, packet, count);
       break;
     case PM4_INTERRUPT:
-      result = ExecutePacketType3_INTERRUPT(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_INTERRUPT(reader, packet, count);
       break;
     case PM4_XE_SWAP:
-      result = ExecutePacketType3_XE_SWAP(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_XE_SWAP(reader, packet, count);
       break;
     case PM4_INDIRECT_BUFFER:
-      result =
-          ExecutePacketType3_INDIRECT_BUFFER(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_INDIRECT_BUFFER(reader, packet, count);
       break;
     case PM4_WAIT_REG_MEM:
-      result =
-          ExecutePacketType3_WAIT_REG_MEM(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_WAIT_REG_MEM(reader, packet, count);
       break;
     case PM4_REG_RMW:
-      result = ExecutePacketType3_REG_RMW(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_REG_RMW(reader, packet, count);
       break;
     case PM4_COND_WRITE:
-      result = ExecutePacketType3_COND_WRITE(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_COND_WRITE(reader, packet, count);
       break;
     case PM4_EVENT_WRITE:
-      result =
-          ExecutePacketType3_EVENT_WRITE(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_EVENT_WRITE(reader, packet, count);
       break;
     case PM4_EVENT_WRITE_SHD:
-      result =
-          ExecutePacketType3_EVENT_WRITE_SHD(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_EVENT_WRITE_SHD(reader, packet, count);
       break;
     case PM4_EVENT_WRITE_EXT:
-      result =
-          ExecutePacketType3_EVENT_WRITE_EXT(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_EVENT_WRITE_EXT(reader, packet, count);
       break;
     case PM4_DRAW_INDX:
-      result = ExecutePacketType3_DRAW_INDX(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_DRAW_INDX(reader, packet, count);
       break;
     case PM4_DRAW_INDX_2:
-      result =
-          ExecutePacketType3_DRAW_INDX_2(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_DRAW_INDX_2(reader, packet, count);
       break;
     case PM4_SET_CONSTANT:
-      result =
-          ExecutePacketType3_SET_CONSTANT(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_SET_CONSTANT(reader, packet, count);
       break;
     case PM4_LOAD_ALU_CONSTANT:
-      result = ExecutePacketType3_LOAD_ALU_CONSTANT(reader, packet_ptr, packet,
-                                                    count);
+      result = ExecutePacketType3_LOAD_ALU_CONSTANT(reader, packet, count);
       break;
     case PM4_IM_LOAD:
-      result = ExecutePacketType3_IM_LOAD(reader, packet_ptr, packet, count);
+      result = ExecutePacketType3_IM_LOAD(reader, packet, count);
       break;
     case PM4_IM_LOAD_IMMEDIATE:
-      result = ExecutePacketType3_IM_LOAD_IMMEDIATE(reader, packet_ptr, packet,
-                                                    count);
+      result = ExecutePacketType3_IM_LOAD_IMMEDIATE(reader, packet, count);
       break;
     case PM4_INVALIDATE_STATE:
-      result = ExecutePacketType3_INVALIDATE_STATE(reader, packet_ptr, packet,
-                                                   count);
+      result = ExecutePacketType3_INVALIDATE_STATE(reader, packet, count);
       break;
 
     case PM4_SET_BIN_MASK_LO: {
       uint32_t value = reader->Read();
-      XETRACECP("[%.8X] Packet(%.8X): PM4_SET_BIN_MASK_LO = %.8X", packet_ptr,
-                packet, value);
       bin_mask_ = (bin_mask_ & 0xFFFFFFFF00000000ull) | value;
       result = true;
     } break;
     case PM4_SET_BIN_MASK_HI: {
       uint32_t value = reader->Read();
-      XETRACECP("[%.8X] Packet(%.8X): PM4_SET_BIN_MASK_HI = %.8X", packet_ptr,
-                packet, value);
       bin_mask_ =
           (bin_mask_ & 0xFFFFFFFFull) | (static_cast<uint64_t>(value) << 32);
       result = true;
     } break;
     case PM4_SET_BIN_SELECT_LO: {
       uint32_t value = reader->Read();
-      XETRACECP("[%.8X] Packet(%.8X): PM4_SET_BIN_SELECT_LO = %.8X", packet_ptr,
-                packet, value);
       bin_select_ = (bin_select_ & 0xFFFFFFFF00000000ull) | value;
       result = true;
     } break;
     case PM4_SET_BIN_SELECT_HI: {
       uint32_t value = reader->Read();
-      XETRACECP("[%.8X] Packet(%.8X): PM4_SET_BIN_SELECT_HI = %.8X", packet_ptr,
-                packet, value);
       bin_select_ =
           (bin_select_ & 0xFFFFFFFFull) | (static_cast<uint64_t>(value) << 32);
       result = true;
@@ -757,53 +756,44 @@ bool CommandProcessor::ExecutePacketType3(RingbufferReader* reader,
     // Ignored packets - useful if breaking on the default handler below.
     case 0x50:  // 0xC0015000 usually 2 words, 0xFFFFFFFF / 0x00000000
     case 0x51:  // 0xC0015100 usually 2 words, 0xFFFFFFFF / 0xFFFFFFFF
-      XETRACECP("[%.8X] Packet(%.8X): unknown!", packet_ptr, packet);
-      reader->TraceData(count);
       reader->Skip(count);
       break;
 
     default:
-      XETRACECP("[%.8X] Packet(%.8X): unknown!", packet_ptr, packet);
-      reader->TraceData(count);
       reader->Skip(count);
       break;
   }
 
+  trace_writer_.WritePacketEnd();
   assert_true(reader->offset() == data_start_offset + count);
   return result;
 }
 
 bool CommandProcessor::ExecutePacketType3_ME_INIT(RingbufferReader* reader,
-                                                  uint32_t packet_ptr,
+
                                                   uint32_t packet,
                                                   uint32_t count) {
   // initialize CP's micro-engine
-  XETRACECP("[%.8X] Packet(%.8X): PM4_ME_INIT", packet_ptr, packet);
-  reader->TraceData(count);
   reader->Advance(count);
   return true;
 }
 
 bool CommandProcessor::ExecutePacketType3_NOP(RingbufferReader* reader,
-                                              uint32_t packet_ptr,
+
                                               uint32_t packet, uint32_t count) {
   // skip N 32-bit words to get to the next packet
   // No-op, ignore some data.
-  XETRACECP("[%.8X] Packet(%.8X): PM4_NOP", packet_ptr, packet);
-  reader->TraceData(count);
   reader->Advance(count);
   return true;
 }
 
 bool CommandProcessor::ExecutePacketType3_INTERRUPT(RingbufferReader* reader,
-                                                    uint32_t packet_ptr,
+
                                                     uint32_t packet,
                                                     uint32_t count) {
   SCOPE_profile_cpu_f("gpu");
 
   // generate interrupt from the command stream
-  XETRACECP("[%.8X] Packet(%.8X): PM4_INTERRUPT", packet_ptr, packet);
-  reader->TraceData(count);
   uint32_t cpu_mask = reader->Read();
   for (int n = 0; n < 6; n++) {
     if (cpu_mask & (1 << n)) {
@@ -814,7 +804,7 @@ bool CommandProcessor::ExecutePacketType3_INTERRUPT(RingbufferReader* reader,
 }
 
 bool CommandProcessor::ExecutePacketType3_XE_SWAP(RingbufferReader* reader,
-                                                  uint32_t packet_ptr,
+
                                                   uint32_t packet,
                                                   uint32_t count) {
   SCOPE_profile_cpu_f("gpu");
@@ -826,9 +816,7 @@ bool CommandProcessor::ExecutePacketType3_XE_SWAP(RingbufferReader* reader,
   // Xenia-specific VdSwap hook.
   // VdSwap will post this to tell us we need to swap the screen/fire an
   // interrupt.
-  XETRACECP("[%.8X] Packet(%.8X): PM4_XE_SWAP", packet_ptr, packet);
   // 63 words here, but only the first has any data.
-  reader->TraceData(1);
   uint32_t frontbuffer_ptr = reader->Read();
   reader->Advance(count - 1);
 
@@ -868,30 +856,28 @@ bool CommandProcessor::ExecutePacketType3_XE_SWAP(RingbufferReader* reader,
     // Remove any dead textures, etc.
     texture_cache_.Scavenge();
   }
+
+  trace_writer_.WriteEvent(EventType::kSwap);
+  trace_writer_.Flush();
   return true;
 }
 
 bool CommandProcessor::ExecutePacketType3_INDIRECT_BUFFER(
-    RingbufferReader* reader, uint32_t packet_ptr, uint32_t packet,
-    uint32_t count) {
+    RingbufferReader* reader, uint32_t packet, uint32_t count) {
   // indirect buffer dispatch
   uint32_t list_ptr = reader->Read();
   uint32_t list_length = reader->Read();
-  XETRACECP("[%.8X] Packet(%.8X): PM4_INDIRECT_BUFFER %.8X (%dw)", packet_ptr,
-            packet, list_ptr, list_length);
   ExecuteIndirectBuffer(GpuToCpu(list_ptr), list_length);
   return true;
 }
 
 bool CommandProcessor::ExecutePacketType3_WAIT_REG_MEM(RingbufferReader* reader,
-                                                       uint32_t packet_ptr,
+
                                                        uint32_t packet,
                                                        uint32_t count) {
   SCOPE_profile_cpu_f("gpu");
 
   // wait until a register or memory location is a specific value
-  XETRACECP("[%.8X] Packet(%.8X): PM4_WAIT_REG_MEM", packet_ptr, packet);
-  reader->TraceData(count);
   uint32_t wait_info = reader->Read();
   uint32_t poll_reg_addr = reader->Read();
   uint32_t ref = reader->Read();
@@ -904,9 +890,9 @@ bool CommandProcessor::ExecutePacketType3_WAIT_REG_MEM(RingbufferReader* reader,
       // Memory.
       auto endianness = static_cast<Endian>(poll_reg_addr & 0x3);
       poll_reg_addr &= ~0x3;
-      value =
-          poly::load<uint32_t>(membase_ + GpuToCpu(packet_ptr, poll_reg_addr));
+      value = poly::load<uint32_t>(membase_ + GpuToCpu(poll_reg_addr));
       value = GpuSwap(value, endianness);
+      trace_writer_.WriteMemoryRead(poll_reg_addr, 4);
     } else {
       // Register.
       assert_true(poll_reg_addr < RegisterFile::kRegisterCount);
@@ -963,13 +949,11 @@ bool CommandProcessor::ExecutePacketType3_WAIT_REG_MEM(RingbufferReader* reader,
 }
 
 bool CommandProcessor::ExecutePacketType3_REG_RMW(RingbufferReader* reader,
-                                                  uint32_t packet_ptr,
+
                                                   uint32_t packet,
                                                   uint32_t count) {
   // register read/modify/write
   // ? (used during shader upload and edram setup)
-  XETRACECP("[%.8X] Packet(%.8X): PM4_REG_RMW", packet_ptr, packet);
-  reader->TraceData(count);
   uint32_t rmw_info = reader->Read();
   uint32_t and_mask = reader->Read();
   uint32_t or_mask = reader->Read();
@@ -988,17 +972,15 @@ bool CommandProcessor::ExecutePacketType3_REG_RMW(RingbufferReader* reader,
     // & imm
     value &= and_mask;
   }
-  WriteRegister(packet_ptr, rmw_info & 0x1FFF, value);
+  WriteRegister(rmw_info & 0x1FFF, value);
   return true;
 }
 
 bool CommandProcessor::ExecutePacketType3_COND_WRITE(RingbufferReader* reader,
-                                                     uint32_t packet_ptr,
+
                                                      uint32_t packet,
                                                      uint32_t count) {
   // conditional write to memory or register
-  XETRACECP("[%.8X] Packet(%.8X): PM4_COND_WRITE", packet_ptr, packet);
-  reader->TraceData(count);
   uint32_t wait_info = reader->Read();
   uint32_t poll_reg_addr = reader->Read();
   uint32_t ref = reader->Read();
@@ -1010,8 +992,8 @@ bool CommandProcessor::ExecutePacketType3_COND_WRITE(RingbufferReader* reader,
     // Memory.
     auto endianness = static_cast<Endian>(poll_reg_addr & 0x3);
     poll_reg_addr &= ~0x3;
-    value =
-        poly::load<uint32_t>(membase_ + GpuToCpu(packet_ptr, poll_reg_addr));
+    trace_writer_.WriteMemoryRead(poll_reg_addr, 4);
+    value = poly::load<uint32_t>(membase_ + GpuToCpu(poll_reg_addr));
     value = GpuSwap(value, endianness);
   } else {
     // Register.
@@ -1052,23 +1034,21 @@ bool CommandProcessor::ExecutePacketType3_COND_WRITE(RingbufferReader* reader,
       auto endianness = static_cast<Endian>(write_reg_addr & 0x3);
       write_reg_addr &= ~0x3;
       write_data = GpuSwap(write_data, endianness);
-      poly::store(membase_ + GpuToCpu(packet_ptr, write_reg_addr), write_data);
+      poly::store(membase_ + GpuToCpu(write_reg_addr), write_data);
+      trace_writer_.WriteMemoryWrite(write_reg_addr, 4);
     } else {
       // Register.
-      WriteRegister(packet_ptr, write_reg_addr, write_data);
+      WriteRegister(write_reg_addr, write_data);
     }
   }
   return true;
 }
 
 bool CommandProcessor::ExecutePacketType3_EVENT_WRITE(RingbufferReader* reader,
-                                                      uint32_t packet_ptr,
+
                                                       uint32_t packet,
                                                       uint32_t count) {
   // generate an event that creates a write to memory when completed
-  XETRACECP("[%.8X] Packet(%.8X): PM4_EVENT_WRITE (unimplemented!)", packet_ptr,
-            packet);
-  reader->TraceData(count);
   uint32_t initiator = reader->Read();
   if (count == 1) {
     // Just an event flag? Where does this write?
@@ -1081,16 +1061,13 @@ bool CommandProcessor::ExecutePacketType3_EVENT_WRITE(RingbufferReader* reader,
 }
 
 bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_SHD(
-    RingbufferReader* reader, uint32_t packet_ptr, uint32_t packet,
-    uint32_t count) {
+    RingbufferReader* reader, uint32_t packet, uint32_t count) {
   // generate a VS|PS_done event
-  XETRACECP("[%.8X] Packet(%.8X): PM4_EVENT_WRITE_SHD", packet_ptr, packet);
-  reader->TraceData(count);
   uint32_t initiator = reader->Read();
   uint32_t address = reader->Read();
   uint32_t value = reader->Read();
   // Writeback initiator.
-  WriteRegister(packet_ptr, XE_GPU_REG_VGT_EVENT_INITIATOR, initiator & 0x3F);
+  WriteRegister(XE_GPU_REG_VGT_EVENT_INITIATOR, initiator & 0x3F);
   uint32_t data_value;
   if ((initiator >> 31) & 0x1) {
     // Write counter (GPU vblank counter?).
@@ -1103,27 +1080,23 @@ bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_SHD(
   address &= ~0x3;
   data_value = GpuSwap(data_value, endianness);
   poly::store(membase_ + GpuToCpu(address), data_value);
+  trace_writer_.WriteMemoryWrite(address, 4);
   return true;
 }
 
 bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_EXT(
-    RingbufferReader* reader, uint32_t packet_ptr, uint32_t packet,
-    uint32_t count) {
+    RingbufferReader* reader, uint32_t packet, uint32_t count) {
   // generate a screen extent event
-  XETRACECP("[%.8X] Packet(%.8X): PM4_EVENT_WRITE_EXT", packet_ptr, packet);
-  reader->TraceData(count);
   uint32_t unk0 = reader->Read();
   uint32_t unk1 = reader->Read();
   return true;
 }
 
 bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingbufferReader* reader,
-                                                    uint32_t packet_ptr,
+
                                                     uint32_t packet,
                                                     uint32_t count) {
   // initiate fetch of index buffer and draw
-  XETRACECP("[%.8X] Packet(%.8X): PM4_DRAW_INDX", packet_ptr, packet);
-  reader->TraceData(count);
   // dword0 = viz query info
   uint32_t dword0 = reader->Read();
   uint32_t dword1 = reader->Read();
@@ -1172,12 +1145,10 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingbufferReader* reader,
 }
 
 bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingbufferReader* reader,
-                                                      uint32_t packet_ptr,
+
                                                       uint32_t packet,
                                                       uint32_t count) {
   // draw using supplied indices in packet
-  XETRACECP("[%.8X] Packet(%.8X): PM4_DRAW_INDX_2", packet_ptr, packet);
-  reader->TraceData(count);
   uint32_t dword0 = reader->Read();
   uint32_t index_count = dword0 >> 16;
   auto prim_type = static_cast<PrimitiveType>(dword0 & 0x3F);
@@ -1198,11 +1169,10 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingbufferReader* reader,
 }
 
 bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingbufferReader* reader,
-                                                       uint32_t packet_ptr,
+
                                                        uint32_t packet,
                                                        uint32_t count) {
   // load constant into chip and to memory
-  XETRACECP("[%.8X] Packet(%.8X): PM4_SET_CONSTANT", packet_ptr, packet);
   // PM4_REG(reg) ((0x4 << 16) | (GSL_HAL_SUBBLOCK_OFFSET(reg)))
   //                                     reg - 0x2000
   uint32_t offset_type = reader->Read();
@@ -1213,10 +1183,7 @@ bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingbufferReader* reader,
       index += 0x2000;  // registers
       for (uint32_t n = 0; n < count - 1; n++, index++) {
         uint32_t data = reader->Read();
-        const char* reg_name = register_file_->GetRegisterName(index);
-        XETRACECP("[%.8X]   %.8X -> %.4X %s", packet_ptr + (1 + n) * 4, data,
-                  index, reg_name ? reg_name : "");
-        WriteRegister(packet_ptr, index, data);
+        WriteRegister(index, data);
       }
       break;
     default:
@@ -1227,10 +1194,8 @@ bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingbufferReader* reader,
 }
 
 bool CommandProcessor::ExecutePacketType3_LOAD_ALU_CONSTANT(
-    RingbufferReader* reader, uint32_t packet_ptr, uint32_t packet,
-    uint32_t count) {
+    RingbufferReader* reader, uint32_t packet, uint32_t count) {
   // load constants from memory
-  XETRACECP("[%.8X] Packet(%.8X): PM4_LOAD_ALU_CONSTANT", packet_ptr, packet);
   uint32_t address = reader->Read();
   address &= 0x3FFFFFFF;
   uint32_t offset_type = reader->Read();
@@ -1238,24 +1203,20 @@ bool CommandProcessor::ExecutePacketType3_LOAD_ALU_CONSTANT(
   uint32_t size = reader->Read();
   size &= 0xFFF;
   index += 0x4000;  // alu constants
+  trace_writer_.WriteMemoryRead(address, size * 4);
   for (uint32_t n = 0; n < size; n++, index++) {
-    uint32_t data = poly::load_and_swap<uint32_t>(
-        membase_ + GpuToCpu(packet_ptr, address + n * 4));
-    const char* reg_name = register_file_->GetRegisterName(index);
-    XETRACECP("[%.8X]   %.8X -> %.4X %s", packet_ptr, data, index,
-              reg_name ? reg_name : "");
-    WriteRegister(packet_ptr, index, data);
+    uint32_t data =
+        poly::load_and_swap<uint32_t>(membase_ + GpuToCpu(address + n * 4));
+    WriteRegister(index, data);
   }
   return true;
 }
 
 bool CommandProcessor::ExecutePacketType3_IM_LOAD(RingbufferReader* reader,
-                                                  uint32_t packet_ptr,
+
                                                   uint32_t packet,
                                                   uint32_t count) {
   // load sequencer instruction memory (pointer-based)
-  XETRACECP("[%.8X] Packet(%.8X): PM4_IM_LOAD", packet_ptr, packet);
-  reader->TraceData(count);
   uint32_t addr_type = reader->Read();
   auto shader_type = static_cast<ShaderType>(addr_type & 0x3);
   uint32_t addr = addr_type & ~0x3;
@@ -1263,18 +1224,16 @@ bool CommandProcessor::ExecutePacketType3_IM_LOAD(RingbufferReader* reader,
   uint32_t start = start_size >> 16;
   uint32_t size_dwords = start_size & 0xFFFF;  // dwords
   assert_true(start == 0);
+  trace_writer_.WriteMemoryRead(addr, size_dwords * 4);
   LoadShader(shader_type,
-             reinterpret_cast<uint32_t*>(membase_ + GpuToCpu(packet_ptr, addr)),
+             reinterpret_cast<uint32_t*>(membase_ + GpuToCpu(addr)),
              size_dwords);
   return true;
 }
 
 bool CommandProcessor::ExecutePacketType3_IM_LOAD_IMMEDIATE(
-    RingbufferReader* reader, uint32_t packet_ptr, uint32_t packet,
-    uint32_t count) {
+    RingbufferReader* reader, uint32_t packet, uint32_t count) {
   // load sequencer instruction memory (code embedded in packet)
-  XETRACECP("[%.8X] Packet(%.8X): PM4_IM_LOAD_IMMEDIATE", packet_ptr, packet);
-  reader->TraceData(count);
   uint32_t dword0 = reader->Read();
   uint32_t dword1 = reader->Read();
   auto shader_type = static_cast<ShaderType>(dword0);
@@ -1290,11 +1249,8 @@ bool CommandProcessor::ExecutePacketType3_IM_LOAD_IMMEDIATE(
 }
 
 bool CommandProcessor::ExecutePacketType3_INVALIDATE_STATE(
-    RingbufferReader* reader, uint32_t packet_ptr, uint32_t packet,
-    uint32_t count) {
+    RingbufferReader* reader, uint32_t packet, uint32_t count) {
   // selective invalidation of state pointers
-  XETRACECP("[%.8X] Packet(%.8X): PM4_INVALIDATE_STATE", packet_ptr, packet);
-  reader->TraceData(count);
   uint32_t mask = reader->Read();
   // driver_->InvalidateState(mask);
   return true;
@@ -1382,7 +1338,6 @@ bool CommandProcessor::IssueDraw() {
     // No framebuffer, so nothing we do will actually have an effect.
     // Treat it as a no-op.
     // TODO(benvanik): if we have a vs export, still allow it to go.
-    XETRACECP("No-op draw (no framebuffer set)");
     draw_batcher_.DiscardDraw();
     return true;
   }
@@ -2066,6 +2021,7 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateIndexBuffer() {
                                                        : sizeof(uint16_t));
   auto allocation = scratch_buffer_.Acquire(total_size);
 
+  trace_writer_.WriteMemoryRead(info.guest_base, info.length);
   if (info.format == IndexFormat::kInt32) {
     auto dest = reinterpret_cast<uint32_t*>(allocation.host_ptr);
     auto src = reinterpret_cast<const uint32_t*>(membase_ + info.guest_base);
@@ -2125,6 +2081,8 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateVertexBuffers() {
 
     auto allocation = scratch_buffer_.Acquire(valid_range);
 
+    trace_writer_.WriteMemoryRead(fetch->address << 2, valid_range);
+
     // Copy and byte swap the entire buffer.
     // We could be smart about this to save GPU bandwidth by building a CRC
     // as we copy and only if it differs from the previous value committing
@@ -2236,6 +2194,9 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateSampler(
     return UpdateStatus::kCompatible;  // invalid texture used
   }
 
+  trace_writer_.WriteMemoryRead(texture_info.guest_address,
+                                texture_info.input_length);
+
   auto entry_view = texture_cache_.Demand(texture_info, sampler_info);
   if (!entry_view) {
     // Unable to create/fetch/etc.
diff --git a/src/xenia/gpu/gl4/command_processor.h b/src/xenia/gpu/gl4/command_processor.h
index 769b581a3..3d47971be 100644
--- a/src/xenia/gpu/gl4/command_processor.h
+++ b/src/xenia/gpu/gl4/command_processor.h
@@ -23,6 +23,7 @@
 #include "xenia/gpu/gl4/gl4_shader.h"
 #include "xenia/gpu/gl4/texture_cache.h"
 #include "xenia/gpu/register_file.h"
+#include "xenia/gpu/tracing.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/memory.h"
 
@@ -56,12 +57,18 @@ class CommandProcessor {
 
   bool Initialize(std::unique_ptr<GLContext> context);
   void Shutdown();
+  void CallInThread(std::function<void()> fn);
+
+  void BeginTracing(const std::wstring& root_path);
+  void EndTracing();
 
   void InitializeRingBuffer(uint32_t ptr, uint32_t page_count);
   void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size);
 
   void UpdateWritePointer(uint32_t value);
 
+  void ExecutePacket(uint32_t ptr, uint32_t count);
+
  private:
   class RingbufferReader;
 
@@ -109,7 +116,7 @@ class CommandProcessor {
   void ShutdownGL();
   GLuint CreateGeometryProgram(const std::string& source);
 
-  void WriteRegister(uint32_t packet_ptr, uint32_t index, uint32_t value);
+  void WriteRegister(uint32_t index, uint32_t value);
   void MakeCoherent();
   void PrepareForWait();
   void ReturnFromWait();
@@ -117,63 +124,48 @@ class CommandProcessor {
   void ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index);
   void ExecuteIndirectBuffer(uint32_t ptr, uint32_t length);
   bool ExecutePacket(RingbufferReader* reader);
-  bool ExecutePacketType0(RingbufferReader* reader, uint32_t packet_ptr,
-                          uint32_t packet);
-  bool ExecutePacketType1(RingbufferReader* reader, uint32_t packet_ptr,
-                          uint32_t packet);
-  bool ExecutePacketType2(RingbufferReader* reader, uint32_t packet_ptr,
-                          uint32_t packet);
-  bool ExecutePacketType3(RingbufferReader* reader, uint32_t packet_ptr,
-                          uint32_t packet);
-  bool ExecutePacketType3_ME_INIT(RingbufferReader* reader, uint32_t packet_ptr,
-                                  uint32_t packet, uint32_t count);
-  bool ExecutePacketType3_NOP(RingbufferReader* reader, uint32_t packet_ptr,
-                              uint32_t packet, uint32_t count);
-  bool ExecutePacketType3_INTERRUPT(RingbufferReader* reader,
-                                    uint32_t packet_ptr, uint32_t packet,
+  bool ExecutePacketType0(RingbufferReader* reader, uint32_t packet);
+  bool ExecutePacketType1(RingbufferReader* reader, uint32_t packet);
+  bool ExecutePacketType2(RingbufferReader* reader, uint32_t packet);
+  bool ExecutePacketType3(RingbufferReader* reader, uint32_t packet);
+  bool ExecutePacketType3_ME_INIT(RingbufferReader* reader, uint32_t packet,
+                                  uint32_t count);
+  bool ExecutePacketType3_NOP(RingbufferReader* reader, uint32_t packet,
+                              uint32_t count);
+  bool ExecutePacketType3_INTERRUPT(RingbufferReader* reader, uint32_t packet,
                                     uint32_t count);
-  bool ExecutePacketType3_XE_SWAP(RingbufferReader* reader, uint32_t packet_ptr,
-                                  uint32_t packet, uint32_t count);
+  bool ExecutePacketType3_XE_SWAP(RingbufferReader* reader, uint32_t packet,
+                                  uint32_t count);
   bool ExecutePacketType3_INDIRECT_BUFFER(RingbufferReader* reader,
-                                          uint32_t packet_ptr, uint32_t packet,
-                                          uint32_t count);
+                                          uint32_t packet, uint32_t count);
   bool ExecutePacketType3_WAIT_REG_MEM(RingbufferReader* reader,
-                                       uint32_t packet_ptr, uint32_t packet,
-                                       uint32_t count);
-  bool ExecutePacketType3_REG_RMW(RingbufferReader* reader, uint32_t packet_ptr,
-                                  uint32_t packet, uint32_t count);
-  bool ExecutePacketType3_COND_WRITE(RingbufferReader* reader,
-                                     uint32_t packet_ptr, uint32_t packet,
+                                       uint32_t packet, uint32_t count);
+  bool ExecutePacketType3_REG_RMW(RingbufferReader* reader, uint32_t packet,
+                                  uint32_t count);
+  bool ExecutePacketType3_COND_WRITE(RingbufferReader* reader, uint32_t packet,
                                      uint32_t count);
-  bool ExecutePacketType3_EVENT_WRITE(RingbufferReader* reader,
-                                      uint32_t packet_ptr, uint32_t packet,
+  bool ExecutePacketType3_EVENT_WRITE(RingbufferReader* reader, uint32_t packet,
                                       uint32_t count);
   bool ExecutePacketType3_EVENT_WRITE_SHD(RingbufferReader* reader,
-                                          uint32_t packet_ptr, uint32_t packet,
-                                          uint32_t count);
+                                          uint32_t packet, uint32_t count);
   bool ExecutePacketType3_EVENT_WRITE_EXT(RingbufferReader* reader,
-                                          uint32_t packet_ptr, uint32_t packet,
-                                          uint32_t count);
-  bool ExecutePacketType3_DRAW_INDX(RingbufferReader* reader,
-                                    uint32_t packet_ptr, uint32_t packet,
+                                          uint32_t packet, uint32_t count);
+  bool ExecutePacketType3_DRAW_INDX(RingbufferReader* reader, uint32_t packet,
                                     uint32_t count);
-  bool ExecutePacketType3_DRAW_INDX_2(RingbufferReader* reader,
-                                      uint32_t packet_ptr, uint32_t packet,
+  bool ExecutePacketType3_DRAW_INDX_2(RingbufferReader* reader, uint32_t packet,
                                       uint32_t count);
   bool ExecutePacketType3_SET_CONSTANT(RingbufferReader* reader,
-                                       uint32_t packet_ptr, uint32_t packet,
-                                       uint32_t count);
+                                       uint32_t packet, uint32_t count);
   bool ExecutePacketType3_LOAD_ALU_CONSTANT(RingbufferReader* reader,
-                                            uint32_t packet_ptr,
+
                                             uint32_t packet, uint32_t count);
-  bool ExecutePacketType3_IM_LOAD(RingbufferReader* reader, uint32_t packet_ptr,
-                                  uint32_t packet, uint32_t count);
+  bool ExecutePacketType3_IM_LOAD(RingbufferReader* reader, uint32_t packet,
+                                  uint32_t count);
   bool ExecutePacketType3_IM_LOAD_IMMEDIATE(RingbufferReader* reader,
-                                            uint32_t packet_ptr,
+
                                             uint32_t packet, uint32_t count);
   bool ExecutePacketType3_INVALIDATE_STATE(RingbufferReader* reader,
-                                           uint32_t packet_ptr, uint32_t packet,
-                                           uint32_t count);
+                                           uint32_t packet, uint32_t count);
 
   bool LoadShader(ShaderType shader_type, const uint32_t* address,
                   uint32_t dword_count);
@@ -206,10 +198,14 @@ class CommandProcessor {
   GL4GraphicsSystem* graphics_system_;
   RegisterFile* register_file_;
 
+  TraceWriter trace_writer_;
+
   std::thread worker_thread_;
   std::atomic<bool> worker_running_;
   std::unique_ptr<GLContext> context_;
   SwapHandler swap_handler_;
+  std::function<void()> pending_fn_;
+  HANDLE pending_fn_event_;
 
   uint64_t time_base_;
   uint32_t counter_;
diff --git a/src/xenia/gpu/gl4/gl4_gpu.cc b/src/xenia/gpu/gl4/gl4_gpu.cc
index bcbba56ea..a4c2e1bf1 100644
--- a/src/xenia/gpu/gl4/gl4_gpu.cc
+++ b/src/xenia/gpu/gl4/gl4_gpu.cc
@@ -47,9 +47,9 @@ void InitializeIfNeeded() {
 
 void CleanupOnShutdown() {}
 
-std::unique_ptr<GraphicsSystem> Create(Emulator* emulator) {
+std::unique_ptr<GraphicsSystem> Create() {
   InitializeIfNeeded();
-  return std::make_unique<GL4GraphicsSystem>(emulator);
+  return std::make_unique<GL4GraphicsSystem>();
 }
 
 }  // namespace gl4
diff --git a/src/xenia/gpu/gl4/gl4_gpu.h b/src/xenia/gpu/gl4/gl4_gpu.h
index c6144cec4..67512bdda 100644
--- a/src/xenia/gpu/gl4/gl4_gpu.h
+++ b/src/xenia/gpu/gl4/gl4_gpu.h
@@ -19,7 +19,7 @@ namespace xe {
 namespace gpu {
 namespace gl4 {
 
-std::unique_ptr<GraphicsSystem> Create(Emulator* emulator);
+std::unique_ptr<GraphicsSystem> Create();
 
 }  // namespace gl4
 }  // namespace gpu
diff --git a/src/xenia/gpu/gl4/gl4_graphics_system.cc b/src/xenia/gpu/gl4/gl4_graphics_system.cc
index 24bd0527a..f5913d1fe 100644
--- a/src/xenia/gpu/gl4/gl4_graphics_system.cc
+++ b/src/xenia/gpu/gl4/gl4_graphics_system.cc
@@ -14,6 +14,7 @@
 #include "xenia/gpu/gl4/gl4_gpu-private.h"
 #include "xenia/gpu/gl4/gl4_profiler_display.h"
 #include "xenia/gpu/gpu-private.h"
+#include "xenia/gpu/tracing.h"
 
 namespace xe {
 namespace gpu {
@@ -21,13 +22,15 @@ namespace gl4 {
 
 extern "C" GLEWContext* glewGetContext();
 
-GL4GraphicsSystem::GL4GraphicsSystem(Emulator* emulator)
-    : GraphicsSystem(emulator), timer_queue_(nullptr), vsync_timer_(nullptr) {}
+GL4GraphicsSystem::GL4GraphicsSystem()
+    : GraphicsSystem(), timer_queue_(nullptr), vsync_timer_(nullptr) {}
 
 GL4GraphicsSystem::~GL4GraphicsSystem() = default;
 
-X_STATUS GL4GraphicsSystem::Setup() {
-  auto result = GraphicsSystem::Setup();
+X_STATUS GL4GraphicsSystem::Setup(cpu::Processor* processor,
+                                  ui::PlatformLoop* target_loop,
+                                  ui::PlatformWindow* target_window) {
+  auto result = GraphicsSystem::Setup(processor, target_loop, target_window);
   if (result) {
     return result;
   }
@@ -35,14 +38,13 @@ X_STATUS GL4GraphicsSystem::Setup() {
   // Create rendering control.
   // This must happen on the UI thread.
   poly::threading::Fence control_ready_fence;
-  auto loop = emulator_->main_window()->loop();
   std::unique_ptr<GLContext> processor_context;
-  loop->Post([&]() {
+  target_loop_->Post([&]() {
     // Setup the GL control that actually does the drawing.
     // We run here in the loop and only touch it (and its context) on this
     // thread. That means some sync-fu when we want to swap.
-    control_ = std::make_unique<WGLControl>(loop);
-    emulator_->main_window()->AddChild(control_.get());
+    control_ = std::make_unique<WGLControl>(target_loop_);
+    target_window_->AddChild(control_.get());
 
     // Setup the GL context the command processor will do all its drawing in.
     // It's shared with the control context so that we can resolve framebuffers
@@ -70,8 +72,12 @@ X_STATUS GL4GraphicsSystem::Setup() {
   command_processor_->set_swap_handler(
       [this](const SwapParameters& swap_params) { SwapHandler(swap_params); });
 
+  if (!FLAGS_trace_gpu.empty()) {
+    command_processor_->BeginTracing(poly::to_wstring(FLAGS_trace_gpu));
+  }
+
   // Let the processor know we want register access callbacks.
-  emulator_->memory()->AddMappedRange(
+  memory_->AddMappedRange(
       0x7FC80000, 0xFFFF0000, 0x0000FFFF, this,
       reinterpret_cast<cpu::MMIOReadCallback>(MMIOReadRegisterThunk),
       reinterpret_cast<cpu::MMIOWriteCallback>(MMIOWriteRegisterThunk));
@@ -91,6 +97,8 @@ X_STATUS GL4GraphicsSystem::Setup() {
 }
 
 void GL4GraphicsSystem::Shutdown() {
+  command_processor_->EndTracing();
+
   DeleteTimerQueueTimer(timer_queue_, vsync_timer_, nullptr);
   DeleteTimerQueue(timer_queue_);
 
@@ -114,6 +122,128 @@ void GL4GraphicsSystem::EnableReadPointerWriteBack(uint32_t ptr,
   command_processor_->EnableReadPointerWriteBack(ptr, block_size);
 }
 
+// Replays a recorded GPU trace through the command processor.
+//
+// The trace is a packed sequence of records. Each record starts with a
+// uint32_t TraceCommandType; some are followed by a payload whose size comes
+// from the record header. Packet payloads and recorded memory reads are
+// copied into guest memory, and a packet is executed once its kPacketEnd
+// record is reached.
+//
+// With TracePlaybackMode::kBreakOnSwap, playback stops at the first
+// kPacketEnd that follows a swap event; otherwise it runs to the end.
+//
+// Returns a pointer to the first byte that was not played. This equals
+// trace_data + trace_size once the data is exhausted or an unrecognized
+// record is hit, so callers that loop on the result always terminate.
+const uint8_t* GL4GraphicsSystem::PlayTrace(const uint8_t* trace_data,
+                                            size_t trace_size,
+                                            TracePlaybackMode playback_mode) {
+  const uint8_t* const trace_end = trace_data + trace_size;
+  auto trace_ptr = trace_data;
+  // NOTE(review): the lambda captures locals by reference and trace_ptr is
+  // read right after this call, so CallInThread is assumed to block until the
+  // function has run - confirm against its implementation.
+  command_processor_->CallInThread([&]() {
+    bool pending_break = false;
+    const PacketStartCommand* pending_packet = nullptr;
+    // A whole type tag must be available before it is loaded.
+    while (trace_ptr < trace_end &&
+           static_cast<size_t>(trace_end - trace_ptr) >= sizeof(uint32_t)) {
+      auto type =
+          static_cast<TraceCommandType>(poly::load<uint32_t>(trace_ptr));
+      switch (type) {
+        case TraceCommandType::kPrimaryBufferStart: {
+          // Not replayed: skip the header and the count dwords after it.
+          auto cmd =
+              reinterpret_cast<const PrimaryBufferStartCommand*>(trace_ptr);
+          trace_ptr += sizeof(*cmd) + cmd->count * 4;
+          break;
+        }
+        case TraceCommandType::kPrimaryBufferEnd: {
+          trace_ptr += sizeof(PrimaryBufferEndCommand);
+          break;
+        }
+        case TraceCommandType::kIndirectBufferStart: {
+          // Not replayed: skip the header and the count dwords after it.
+          auto cmd =
+              reinterpret_cast<const IndirectBufferStartCommand*>(trace_ptr);
+          trace_ptr += sizeof(*cmd) + cmd->count * 4;
+          break;
+        }
+        case TraceCommandType::kIndirectBufferEnd: {
+          trace_ptr += sizeof(IndirectBufferEndCommand);
+          break;
+        }
+        case TraceCommandType::kPacketStart: {
+          // Stage the packet dwords in guest memory; execution is deferred
+          // until the matching kPacketEnd.
+          auto cmd = reinterpret_cast<const PacketStartCommand*>(trace_ptr);
+          trace_ptr += sizeof(*cmd);
+          std::memcpy(memory()->Translate(cmd->base_ptr), trace_ptr,
+                      cmd->count * 4);
+          trace_ptr += cmd->count * 4;
+          pending_packet = cmd;
+          break;
+        }
+        case TraceCommandType::kPacketEnd: {
+          trace_ptr += sizeof(PacketEndCommand);
+          if (pending_packet) {
+            command_processor_->ExecutePacket(pending_packet->base_ptr,
+                                              pending_packet->count);
+            pending_packet = nullptr;
+          }
+          if (pending_break) {
+            // Leave trace_ptr just past this record so the caller can resume.
+            return;
+          }
+          break;
+        }
+        case TraceCommandType::kMemoryRead: {
+          // Copy the recorded bytes back into guest memory at base_ptr.
+          auto cmd = reinterpret_cast<const MemoryReadCommand*>(trace_ptr);
+          trace_ptr += sizeof(*cmd);
+          std::memcpy(memory()->Translate(cmd->base_ptr), trace_ptr,
+                      cmd->length);
+          trace_ptr += cmd->length;
+          break;
+        }
+        case TraceCommandType::kMemoryWrite: {
+          // Recorded writes are not applied; skip the header and payload.
+          auto cmd = reinterpret_cast<const MemoryWriteCommand*>(trace_ptr);
+          trace_ptr += sizeof(*cmd) + cmd->length;
+          break;
+        }
+        case TraceCommandType::kEvent: {
+          auto cmd = reinterpret_cast<const EventCommand*>(trace_ptr);
+          trace_ptr += sizeof(*cmd);
+          switch (cmd->event_type) {
+            case EventType::kSwap: {
+              if (playback_mode == TracePlaybackMode::kBreakOnSwap) {
+                pending_break = true;
+              }
+              break;
+            }
+          }
+          break;
+        }
+        default: {
+          // Unknown or corrupt record: its size is unknowable, so the stream
+          // cannot be resynchronized. Abandon the rest of the trace instead
+          // of spinning forever on the same bytes.
+          XELOGGPU("PlayTrace: unknown trace command %.8X, stopping playback",
+                   static_cast<uint32_t>(type));
+          trace_ptr = trace_end;
+          return;
+        }
+      }
+    }
+    // Nothing decodable remains (end of data or a truncated tail).
+    trace_ptr = trace_end;
+  });
+  return trace_ptr;
+}
+
 void GL4GraphicsSystem::MarkVblank() {
   static bool thread_name_set = false;
   if (!thread_name_set) {
@@ -147,9 +250,6 @@ void GL4GraphicsSystem::SwapHandler(const SwapParameters& swap_params) {
 
 uint64_t GL4GraphicsSystem::ReadRegister(uint64_t addr) {
   uint32_t r = addr & 0xFFFF;
-  if (FLAGS_trace_ring_buffer) {
-    XELOGGPU("ReadRegister(%.4X)", r);
-  }
 
   switch (r) {
     case 0x3C00:  // ?
@@ -170,9 +270,6 @@ uint64_t GL4GraphicsSystem::ReadRegister(uint64_t addr) {
 
 void GL4GraphicsSystem::WriteRegister(uint64_t addr, uint64_t value) {
   uint32_t r = addr & 0xFFFF;
-  if (FLAGS_trace_ring_buffer) {
-    XELOGGPU("WriteRegister(%.4X, %.8X)", r, value);
-  }
 
   switch (r) {
     case 0x0714:  // CP_RB_WPTR
diff --git a/src/xenia/gpu/gl4/gl4_graphics_system.h b/src/xenia/gpu/gl4/gl4_graphics_system.h
index 6a6d028d8..77b144da0 100644
--- a/src/xenia/gpu/gl4/gl4_graphics_system.h
+++ b/src/xenia/gpu/gl4/gl4_graphics_system.h
@@ -24,10 +24,11 @@ namespace gl4 {
 
 class GL4GraphicsSystem : public GraphicsSystem {
  public:
-  GL4GraphicsSystem(Emulator* emulator);
+  GL4GraphicsSystem();
   ~GL4GraphicsSystem() override;
 
-  X_STATUS Setup() override;
+  X_STATUS Setup(cpu::Processor* processor, ui::PlatformLoop* target_loop,
+                 ui::PlatformWindow* target_window) override;
   void Shutdown() override;
 
   RegisterFile* register_file() { return &register_file_; }
@@ -35,6 +36,9 @@ class GL4GraphicsSystem : public GraphicsSystem {
   void InitializeRingBuffer(uint32_t ptr, uint32_t page_count) override;
   void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size) override;
 
+  const uint8_t* PlayTrace(const uint8_t* trace_data, size_t trace_size,
+                           TracePlaybackMode playback_mode) override;
+
  private:
   void MarkVblank();
   void SwapHandler(const SwapParameters& swap_params);
diff --git a/src/xenia/gpu/gpu-private.h b/src/xenia/gpu/gpu-private.h
index d51698c4a..58f6c81e7 100644
--- a/src/xenia/gpu/gpu-private.h
+++ b/src/xenia/gpu/gpu-private.h
@@ -14,7 +14,8 @@
 
 DECLARE_string(gpu);
 
-DECLARE_bool(trace_ring_buffer);
+DECLARE_string(trace_gpu);
+
 DECLARE_string(dump_shaders);
 
 DECLARE_bool(vsync);
diff --git a/src/xenia/gpu/gpu.cc b/src/xenia/gpu/gpu.cc
index ec0037f64..41c545fcf 100644
--- a/src/xenia/gpu/gpu.cc
+++ b/src/xenia/gpu/gpu.cc
@@ -15,7 +15,8 @@
 
 DEFINE_string(gpu, "any", "Graphics system. Use: [any, gl4]");
 
-DEFINE_bool(trace_ring_buffer, false, "Trace GPU ring buffer packets.");
+DEFINE_string(trace_gpu, "", "Trace GPU data to the given root path.");
+
 DEFINE_string(dump_shaders, "",
               "Path to write GPU shaders to as they are compiled.");
 
@@ -24,14 +25,14 @@ DEFINE_bool(vsync, true, "Enable VSYNC.");
 namespace xe {
 namespace gpu {
 
-std::unique_ptr<GraphicsSystem> Create(Emulator* emulator) {
+std::unique_ptr<GraphicsSystem> Create() {
   if (FLAGS_gpu.compare("gl4") == 0) {
-    return xe::gpu::gl4::Create(emulator);
+    return xe::gpu::gl4::Create();
   } else {
     // Create best available.
     std::unique_ptr<GraphicsSystem> best;
 
-    best = xe::gpu::gl4::Create(emulator);
+    best = xe::gpu::gl4::Create();
     if (best) {
       return best;
     }
diff --git a/src/xenia/gpu/gpu.h b/src/xenia/gpu/gpu.h
index 88541f2ca..912e05406 100644
--- a/src/xenia/gpu/gpu.h
+++ b/src/xenia/gpu/gpu.h
@@ -21,9 +21,9 @@ class Emulator;
 namespace xe {
 namespace gpu {
 
-std::unique_ptr<GraphicsSystem> Create(Emulator* emulator);
+std::unique_ptr<GraphicsSystem> Create();
 
-std::unique_ptr<GraphicsSystem> CreateGL4(Emulator* emulator);
+std::unique_ptr<GraphicsSystem> CreateGL4();
 
 }  // namespace gpu
 }  // namespace xe
diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc
index b8528b7e8..e6803eb1c 100644
--- a/src/xenia/gpu/graphics_system.cc
+++ b/src/xenia/gpu/graphics_system.cc
@@ -10,23 +10,29 @@
 #include "xenia/gpu/graphics_system.h"
 
 #include "poly/poly.h"
-#include "xenia/emulator.h"
 #include "xenia/cpu/processor.h"
 #include "xenia/gpu/gpu-private.h"
 
 namespace xe {
 namespace gpu {
 
-GraphicsSystem::GraphicsSystem(Emulator* emulator)
-    : emulator_(emulator),
-      memory_(emulator->memory()),
+GraphicsSystem::GraphicsSystem()
+    : memory_(nullptr),
+      processor_(nullptr),
+      target_loop_(nullptr),
+      target_window_(nullptr),
       interrupt_callback_(0),
       interrupt_callback_data_(0) {}
 
-GraphicsSystem::~GraphicsSystem() {}
+GraphicsSystem::~GraphicsSystem() = default;
 
-X_STATUS GraphicsSystem::Setup() {
-  processor_ = emulator_->processor();
+X_STATUS GraphicsSystem::Setup(cpu::Processor* processor,
+                               ui::PlatformLoop* target_loop,
+                               ui::PlatformWindow* target_window) {
+  processor_ = processor;
+  memory_ = processor->memory();
+  target_loop_ = target_loop;
+  target_window_ = target_window;
 
   return X_STATUS_SUCCESS;
 }
diff --git a/src/xenia/gpu/graphics_system.h b/src/xenia/gpu/graphics_system.h
index 7c430b412..d5b588a70 100644
--- a/src/xenia/gpu/graphics_system.h
+++ b/src/xenia/gpu/graphics_system.h
@@ -14,7 +14,9 @@
 #include <thread>
 
 #include "xenia/common.h"
-#include "xenia/emulator.h"
+#include "xenia/cpu/processor.h"
+#include "xenia/memory.h"
+#include "xenia/ui/main_window.h"
 #include "xenia/xbox.h"
 
 namespace xe {
@@ -24,25 +26,37 @@ class GraphicsSystem {
  public:
   virtual ~GraphicsSystem();
 
-  Emulator* emulator() const { return emulator_; }
   Memory* memory() const { return memory_; }
   cpu::Processor* processor() const { return processor_; }
 
-  virtual X_STATUS Setup();
+  virtual X_STATUS Setup(cpu::Processor* processor,
+                         ui::PlatformLoop* target_loop,
+                         ui::PlatformWindow* target_window);
   virtual void Shutdown();
 
   void SetInterruptCallback(uint32_t callback, uint32_t user_data);
   virtual void InitializeRingBuffer(uint32_t ptr, uint32_t page_count) = 0;
-  virtual void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size) = 0;
+  virtual void EnableReadPointerWriteBack(uint32_t ptr,
+                                          uint32_t block_size) = 0;
 
   void DispatchInterruptCallback(uint32_t source, uint32_t cpu);
 
- protected:
-  GraphicsSystem(Emulator* emulator);
+  enum class TracePlaybackMode {
+    kUntilEnd,
+    kBreakOnSwap,
+  };
+  virtual const uint8_t* PlayTrace(const uint8_t* trace_data, size_t trace_size,
+                                   TracePlaybackMode playback_mode) {
+    return nullptr;
+  }
+
+ protected:
+  GraphicsSystem();
 
-  Emulator* emulator_;
   Memory* memory_;
   cpu::Processor* processor_;
+  ui::PlatformLoop* target_loop_;
+  ui::PlatformWindow* target_window_;
 
   uint32_t interrupt_callback_;
   uint32_t interrupt_callback_data_;
diff --git a/src/xenia/gpu/sources.gypi b/src/xenia/gpu/sources.gypi
index ec144c8af..02d227a31 100644
--- a/src/xenia/gpu/sources.gypi
+++ b/src/xenia/gpu/sources.gypi
@@ -15,6 +15,7 @@
     'shader.h',
     'texture_info.cc',
     'texture_info.h',
+    'tracing.h',
     'ucode.h',
     'ucode_disassembler.cc',
     'ucode_disassembler.h',
diff --git a/src/xenia/gpu/trace_viewer_main.cc b/src/xenia/gpu/trace_viewer_main.cc
new file mode 100644
index 000000000..63c3d476c
--- /dev/null
+++ b/src/xenia/gpu/trace_viewer_main.cc
@@ -0,0 +1,99 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2015 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include "poly/main.h"
+#include "poly/mapped_memory.h"
+#include "xenia/gpu/graphics_system.h"
+#include "xenia/gpu/tracing.h"
+#include "xenia/emulator.h"
+#include "xenia/ui/main_window.h"
+
+DEFINE_string(target_trace_file, "", "Specifies the trace file to load.");
+
+namespace xe {
+namespace gpu {
+
+// Entry point of the GPU trace viewer.
+//
+// Sets up an emulator, maps the trace file named by --target_trace_file (or,
+// failing that, by the first unnamed argument) and replays it through the
+// graphics system, then waits for the main window loop to quit.
+//
+// Returns 0 on success or when no trace was specified, and 1 if the emulator
+// could not be set up or the trace file could not be opened.
+int trace_viewer_main(std::vector<std::wstring>& args) {
+  // Create the emulator.
+  auto emulator = std::make_unique<Emulator>(L"");
+  X_STATUS result = emulator->Setup();
+  if (XFAILED(result)) {
+    XELOGE("Failed to setup emulator: %.8X", result);
+    return 1;
+  }
+
+  // Grab path from the flag or unnamed argument.
+  if (!FLAGS_target_trace_file.empty() || args.size() >= 2) {
+    std::wstring path;
+    if (!FLAGS_target_trace_file.empty()) {
+      // Passed as a named argument.
+      // TODO(benvanik): find something better than gflags that supports
+      // unicode.
+      path = poly::to_wstring(FLAGS_target_trace_file);
+    } else {
+      // Passed as an unnamed argument.
+      path = args[1];
+    }
+    // Normalize the path and make absolute.
+    std::wstring abs_path = poly::to_absolute_path(path);
+
+    // TODO(benvanik): UI? replay control on graphics system?
+    auto graphics_system = emulator->graphics_system();
+    auto mmap =
+        poly::MappedMemory::Open(abs_path, poly::MappedMemory::Mode::kRead);
+    if (!mmap) {
+      // A missing or unreadable file must not be dereferenced below.
+      XELOGE("Failed to open trace file: %ls", abs_path.c_str());
+      return 1;
+    }
+    auto trace_data = reinterpret_cast<const uint8_t*>(mmap->data());
+    const uint8_t* const trace_end = trace_data + mmap->size();
+
+    // Replay in steps: with kBreakOnSwap each call may return early, and the
+    // returned pointer is where the next call resumes.
+    auto trace_ptr = trace_data;
+    while (trace_ptr < trace_end) {
+      auto next_ptr = graphics_system->PlayTrace(
+          trace_ptr, static_cast<size_t>(trace_end - trace_ptr),
+          GraphicsSystem::TracePlaybackMode::kBreakOnSwap);
+      if (!next_ptr || next_ptr <= trace_ptr) {
+        // The GraphicsSystem base implementation returns nullptr, and a
+        // backend that made no progress would otherwise spin here forever.
+        XELOGE("Trace playback stopped before the end of the trace");
+        break;
+      }
+      trace_ptr = next_ptr;
+    }
+
+    // Wait until we are exited.
+    emulator->main_window()->loop()->AwaitQuit();
+  }
+
+  emulator.reset();
+  return 0;
+}
+
+}  // namespace gpu
+}  // namespace xe
+
+DEFINE_ENTRY_POINT(L"gpu_trace_viewer", L"gpu_trace_viewer some.trace",
+                   xe::gpu::trace_viewer_main);
diff --git a/src/xenia/gpu/tracing.h b/src/xenia/gpu/tracing.h
new file mode 100644
index 000000000..e1919e4e3
--- /dev/null
+++ b/src/xenia/gpu/tracing.h
@@ -0,0 +1,260 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2015 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_GPU_TRACING_H_
+#define XENIA_GPU_TRACING_H_
+
+#include <cstdint>
+#include <cstdio>
+#include <string>
+
+#include "xenia/memory.h"
+
+namespace xe {
+namespace gpu {
+
+// Tag stored first in every trace record; it identifies which of the
+// *Command structs below describes the record.
+enum class TraceCommandType : uint32_t {
+  kPrimaryBufferStart,
+  kPrimaryBufferEnd,
+  kIndirectBufferStart,
+  kIndirectBufferEnd,
+  kPacketStart,
+  kPacketEnd,
+  kMemoryRead,
+  kMemoryWrite,
+  kEvent,
+};
+
+// Record header for the start of a primary buffer. TraceWriter follows it
+// with |count| 4-byte words read from offset |base_ptr| of its memory base.
+struct PrimaryBufferStartCommand {
+  TraceCommandType type;
+  uint32_t base_ptr;
+  uint32_t count;
+};
+
+// Record marking the end of a primary buffer; carries no payload.
+struct PrimaryBufferEndCommand {
+  TraceCommandType type;
+};
+
+// Record header for the start of an indirect buffer. TraceWriter follows it
+// with |count| 4-byte words read from offset |base_ptr| of its memory base.
+struct IndirectBufferStartCommand {
+  TraceCommandType type;
+  uint32_t base_ptr;
+  uint32_t count;
+};
+
+// Record marking the end of an indirect buffer; carries no payload.
+struct IndirectBufferEndCommand {
+  TraceCommandType type;
+};
+
+// Record header for the start of a packet. TraceWriter follows it with
+// |count| 4-byte words read from offset |base_ptr| of its memory base.
+struct PacketStartCommand {
+  TraceCommandType type;
+  uint32_t base_ptr;
+  uint32_t count;
+};
+
+// Record marking the end of a packet; carries no payload.
+struct PacketEndCommand {
+  TraceCommandType type;
+};
+
+// Record header for a memory read. TraceWriter follows it with |length|
+// bytes read from offset |base_ptr| of its memory base.
+struct MemoryReadCommand {
+  TraceCommandType type;
+  uint32_t base_ptr;
+  uint32_t length;
+};
+
+// Record header for a memory write. TraceWriter follows it with |length|
+// bytes read from offset |base_ptr| of its memory base.
+struct MemoryWriteCommand {
+  TraceCommandType type;
+  uint32_t base_ptr;
+  uint32_t length;
+};
+
+// Kinds of event that an EventCommand can carry. The underlying type is
+// spelled out, like TraceCommandType, because values are written to disk.
+enum class EventType : uint32_t {
+  kSwap,
+};
+
+// Record for a standalone event; carries no payload beyond |event_type|.
+struct EventCommand {
+  TraceCommandType type;
+  EventType event_type;
+};
+
+// Appends trace records to a file as raw in-memory structs. Every Write*
+// method does nothing until Open() has succeeded, and the results of the
+// underlying fwrite calls are not checked.
+class TraceWriter {
+ public:
+  // |membase| is the base address that record payloads are copied from; the
+  // base_ptr arguments of the Write* methods are offsets from it.
+  TraceWriter(uint8_t* membase) : membase_(membase), file_(nullptr) {}
+
+  // Closes the file so a writer destroyed while still open neither leaks the
+  // handle nor drops buffered records.
+  ~TraceWriter() { Close(); }
+
+  // Not copyable: a copy would share the FILE handle and close it twice.
+  TraceWriter(const TraceWriter&) = delete;
+  TraceWriter& operator=(const TraceWriter&) = delete;
+
+  // Closes any file that is already open, then opens |path| with mode "wb".
+  // Returns true if the file was opened.
+  bool Open(const std::wstring& path) {
+    Close();
+    file_ = _wfopen(path.c_str(), L"wb");
+    return file_ != nullptr;
+  }
+
+  // Flushes buffered output if a file is open.
+  void Flush() {
+    if (file_) {
+      fflush(file_);
+    }
+  }
+
+  // Flushes and closes the file if one is open; calling it again is a no-op.
+  void Close() {
+    if (file_) {
+      fflush(file_);
+      fclose(file_);
+      file_ = nullptr;
+    }
+  }
+
+  // Records the start of a primary buffer plus its |count| 4-byte words.
+  void WritePrimaryBufferStart(uint32_t base_ptr, uint32_t count) {
+    if (!file_) {
+      return;
+    }
+    auto cmd = PrimaryBufferStartCommand({
+        TraceCommandType::kPrimaryBufferStart, base_ptr, count,
+    });
+    fwrite(&cmd, 1, sizeof(cmd), file_);
+    fwrite(membase_ + base_ptr, 4, count, file_);
+  }
+
+  // Records the end of the current primary buffer.
+  void WritePrimaryBufferEnd() {
+    if (!file_) {
+      return;
+    }
+    auto cmd = PrimaryBufferEndCommand({
+        TraceCommandType::kPrimaryBufferEnd,
+    });
+    fwrite(&cmd, 1, sizeof(cmd), file_);
+  }
+
+  // Records the start of an indirect buffer plus its |count| 4-byte words.
+  void WriteIndirectBufferStart(uint32_t base_ptr, uint32_t count) {
+    if (!file_) {
+      return;
+    }
+    auto cmd = IndirectBufferStartCommand({
+        TraceCommandType::kIndirectBufferStart, base_ptr, count,
+    });
+    fwrite(&cmd, 1, sizeof(cmd), file_);
+    fwrite(membase_ + base_ptr, 4, count, file_);
+  }
+
+  // Records the end of the current indirect buffer.
+  void WriteIndirectBufferEnd() {
+    if (!file_) {
+      return;
+    }
+    auto cmd = IndirectBufferEndCommand({
+        TraceCommandType::kIndirectBufferEnd,
+    });
+    fwrite(&cmd, 1, sizeof(cmd), file_);
+  }
+
+  // Records the start of a packet plus its |count| 4-byte words.
+  void WritePacketStart(uint32_t base_ptr, uint32_t count) {
+    if (!file_) {
+      return;
+    }
+    auto cmd = PacketStartCommand({
+        TraceCommandType::kPacketStart, base_ptr, count,
+    });
+    fwrite(&cmd, 1, sizeof(cmd), file_);
+    fwrite(membase_ + base_ptr, 4, count, file_);
+  }
+
+  // Records the end of the current packet.
+  void WritePacketEnd() {
+    if (!file_) {
+      return;
+    }
+    auto cmd = PacketEndCommand({
+        TraceCommandType::kPacketEnd,
+    });
+    fwrite(&cmd, 1, sizeof(cmd), file_);
+  }
+
+  // Records a memory read along with the bytes at |base_ptr|. The payload
+  // size is taken from the header so the two always agree, even if |length|
+  // does not fit in 32 bits.
+  void WriteMemoryRead(uint32_t base_ptr, size_t length) {
+    if (!file_) {
+      return;
+    }
+    auto cmd = MemoryReadCommand({
+        TraceCommandType::kMemoryRead, base_ptr, uint32_t(length),
+    });
+    fwrite(&cmd, 1, sizeof(cmd), file_);
+    fwrite(membase_ + base_ptr, 1, cmd.length, file_);
+  }
+
+  // Records a memory write along with the bytes at |base_ptr|. The payload
+  // size is taken from the header so the two always agree, even if |length|
+  // does not fit in 32 bits.
+  void WriteMemoryWrite(uint32_t base_ptr, size_t length) {
+    if (!file_) {
+      return;
+    }
+    auto cmd = MemoryWriteCommand({
+        TraceCommandType::kMemoryWrite, base_ptr, uint32_t(length),
+    });
+    fwrite(&cmd, 1, sizeof(cmd), file_);
+    fwrite(membase_ + base_ptr, 1, cmd.length, file_);
+  }
+
+  // Records a payload-free event of the given type.
+  void WriteEvent(EventType event_type) {
+    if (!file_) {
+      return;
+    }
+    auto cmd = EventCommand({
+        TraceCommandType::kEvent, event_type,
+    });
+    fwrite(&cmd, 1, sizeof(cmd), file_);
+  }
+
+ private:
+  uint8_t* membase_;
+  FILE* file_;
+};
+
+}  // namespace gpu
+}  // namespace xe
+
+#endif  // XENIA_GPU_TRACING_H_
diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h
index 91c031dd7..c7f7c5e79 100644
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@@ -234,15 +234,6 @@ inline uint32_t GpuToCpu(uint32_t p) {
   return p;
 }
 
-inline uint32_t GpuToCpu(uint32_t base, uint32_t p) {
-  // Some AMD docs say relative to base ptr, some say just this.
-  // Some games use some crazy shift magic, but it seems to nop.
-  uint32_t upper = 0;//base & 0xFF000000;
-  //uint32_t lower = p & 0x01FFFFFF;
-  uint32_t lower = p;
-  return upper + lower;// -(((base >> 20) + 0x200) & 0x1000);
-}
-
 // XE_GPU_REG_SQ_PROGRAM_CNTL
 typedef union {
   XEPACKEDSTRUCTANONYMOUS({
diff --git a/src/xenia/kernel/fs/devices/disc_image_device.cc b/src/xenia/kernel/fs/devices/disc_image_device.cc
index fa6510dd3..11ab1dfe7 100644
--- a/src/xenia/kernel/fs/devices/disc_image_device.cc
+++ b/src/xenia/kernel/fs/devices/disc_image_device.cc
@@ -24,7 +24,8 @@ DiscImageDevice::DiscImageDevice(const std::string& path,
 DiscImageDevice::~DiscImageDevice() { delete gdfx_; }
 
 int DiscImageDevice::Init() {
-  mmap_ = poly::MappedMemory::Open(local_path_, poly::MappedMemory::Mode::READ);
+  mmap_ =
+      poly::MappedMemory::Open(local_path_, poly::MappedMemory::Mode::kRead);
   if (!mmap_) {
     XELOGE("Disc image could not be mapped");
     return 1;
diff --git a/src/xenia/kernel/fs/devices/host_path_entry.cc b/src/xenia/kernel/fs/devices/host_path_entry.cc
index 32a0de4d2..f8257fd63 100644
--- a/src/xenia/kernel/fs/devices/host_path_entry.cc
+++ b/src/xenia/kernel/fs/devices/host_path_entry.cc
@@ -125,8 +125,8 @@ std::unique_ptr<MemoryMapping> HostPathEntry::CreateMemoryMapping(
     Mode map_mode, const size_t offset, const size_t length) {
   auto mmap = poly::MappedMemory::Open(
       local_path_,
-      map_mode == Mode::READ ? poly::MappedMemory::Mode::READ
-                             : poly::MappedMemory::Mode::READ_WRITE,
+      map_mode == Mode::READ ? poly::MappedMemory::Mode::kRead
+                             : poly::MappedMemory::Mode::kReadWrite,
       offset, length);
   if (!mmap) {
     return nullptr;
diff --git a/src/xenia/kernel/fs/devices/stfs_container_device.cc b/src/xenia/kernel/fs/devices/stfs_container_device.cc
index 0951fd683..57ff17d7e 100644
--- a/src/xenia/kernel/fs/devices/stfs_container_device.cc
+++ b/src/xenia/kernel/fs/devices/stfs_container_device.cc
@@ -25,7 +25,8 @@ STFSContainerDevice::STFSContainerDevice(const std::string& path,
 STFSContainerDevice::~STFSContainerDevice() { delete stfs_; }
 
 int STFSContainerDevice::Init() {
-  mmap_ = poly::MappedMemory::Open(local_path_, poly::MappedMemory::Mode::READ);
+  mmap_ =
+      poly::MappedMemory::Open(local_path_, poly::MappedMemory::Mode::kRead);
   if (!mmap_) {
     XELOGE("STFS container could not be mapped");
     return 1;
diff --git a/src/xenia/xenia_main.cc b/src/xenia/xenia_main.cc
index eb2e07b99..2962f1733 100644
--- a/src/xenia/xenia_main.cc
+++ b/src/xenia/xenia_main.cc
@@ -30,9 +30,9 @@ int xenia_main(std::vector<std::wstring>& args) {
   }
 
   // Grab path from the flag or unnamed argument.
-  if (FLAGS_target.size() || args.size() >= 2) {
+  if (!FLAGS_target.empty() || args.size() >= 2) {
     std::wstring path;
-    if (FLAGS_target.size()) {
+    if (!FLAGS_target.empty()) {
       // Passed as a named argument.
       // TODO(benvanik): find something better than gflags that supports
       // unicode.
@@ -49,10 +49,10 @@ int xenia_main(std::vector<std::wstring>& args) {
       XELOGE("Failed to launch target: %.8X", result);
       return 1;
     }
-  }
 
-  // Wait until we are exited.
-  emulator->main_window()->loop()->AwaitQuit();
+    // Wait until we are exited.
+    emulator->main_window()->loop()->AwaitQuit();
+  }
 
   emulator.reset();
   Profiler::Dump();
diff --git a/xenia.gyp b/xenia.gyp
index 7c241be81..fee673afd 100644
--- a/xenia.gyp
+++ b/xenia.gyp
@@ -470,5 +470,28 @@
         'src/xenia/xenia_main.cc',
       ],
     },
+
+    {
+      'target_name': 'gpu-trace-viewer',
+      'type': 'executable',
+
+      'msvs_settings': {
+        'VCLinkerTool': {
+          'SubSystem': '2'
+        },
+      },
+
+      'dependencies': [
+        'libxenia',
+      ],
+
+      'include_dirs': [
+        '.',
+      ],
+
+      'sources': [
+        'src/xenia/gpu/trace_viewer_main.cc',
+      ],
+    },
   ],
 }