diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc
index a686862b0..b1f2b48c0 100644
--- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc
+++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc
@@ -83,15 +83,12 @@ void D3D11GraphicsDriver::SetShader(
       type, p, length);
 
   // Disassemble.
-  char* source = shader->Disassemble();
+  const char* source = shader->disasm_src();
   if (!source) {
     source = "<failed to disassemble>";
   }
   XELOGGPU("D3D11: set shader %d at %0.8X (%db):\n%s",
            type, address, length, source);
-  if (source) {
-    xe_free(source);
-  }
 
   // Stash for later.
   switch (type) {
@@ -289,7 +286,7 @@ int D3D11GraphicsDriver::BindShaders() {
   if (ps) {
     if (!ps->is_prepared()) {
       // Prepare for use.
-      if (ps->Prepare(&program_cntl)) {
+      if (ps->Prepare(&program_cntl, vs)) {
         XELOGGPU("D3D11: failed to prepare pixel shader");
         state_.pixel_shader = NULL;
         return 1;
diff --git a/src/xenia/gpu/d3d11/d3d11_shader.cc b/src/xenia/gpu/d3d11/d3d11_shader.cc
index 9bb207148..2aa36027e 100644
--- a/src/xenia/gpu/d3d11/d3d11_shader.cc
+++ b/src/xenia/gpu/d3d11/d3d11_shader.cc
@@ -20,20 +20,104 @@ using namespace xe::gpu::d3d11;
 using namespace xe::gpu::xenos;
 
 
+namespace {
+
+const int OUTPUT_CAPACITY = 64 * 1024;
+
+}  // anonymous namespace
+
+
+struct xe::gpu::d3d11::Output {  // Fixed-size, NUL-terminated text sink.
+  char buffer[OUTPUT_CAPACITY];
+  size_t capacity;
+  size_t offset;
+  Output() : capacity(OUTPUT_CAPACITY), offset(0) {
+    buffer[0] = 0;
+  }
+  // printf-style append. Overflow truncates the text; offset stays in bounds.
+  void append(const char* format, ...) {
+    const size_t remaining = capacity - offset;
+    va_list args;
+    va_start(args, format);
+    int len = xevsnprintfa(buffer + offset, remaining, format, args);
+    va_end(args);
+    bool cut = len < 0 || (size_t)len >= remaining;  // error or no room left
+    offset = cut ? capacity - 1 : offset + (size_t)len;
+    buffer[offset] = 0;
+  }
+};
+
+
 D3D11Shader::D3D11Shader(
     ID3D11Device* device,
     XE_GPU_SHADER_TYPE type,
     const uint8_t* src_ptr, size_t length,
     uint64_t hash) :
+    translated_src_(NULL),
     Shader(type, src_ptr, length, hash) {
   device_ = device;
   device_->AddRef();
 }
 
 D3D11Shader::~D3D11Shader() {
+  if (translated_src_) {
+    xe_free(translated_src_);
+  }
   XESAFERELEASE(device_);
 }
 
+void D3D11Shader::set_translated_src(char* value) {
+  // Duplicate before freeing: value may be NULL or alias the current string.
+  char* copy = value ? xestrdupa(value) : NULL;
+  if (translated_src_) xe_free(translated_src_);
+  translated_src_ = copy;
+}
+
+// Compiles translated HLSL to bytecode; returns NULL when the compiler fails.
+ID3D10Blob* D3D11Shader::Compile(const char* shader_source) {
+  // TODO(benvanik): pick shared runtime mode defines.
+  D3D10_SHADER_MACRO defines[] = {
+    "TEST_DEFINE", "1",
+    0, 0,
+  };
+
+  uint32_t flags1 = D3D10_SHADER_DEBUG | D3D10_SHADER_ENABLE_STRICTNESS;
+  uint32_t flags2 = 0;
+
+  // Name the generated source after the shader hash and stage. The hash is
+  // passed as a 64-bit value, so it needs the ll length modifier.
+  char file_name[64];
+  xesnprintfa(file_name, XECOUNT(file_name),
+      "gen_%.16llX.%s",
+      (unsigned long long)hash_,
+      type_ == XE_GPU_SHADER_TYPE_VERTEX ? "vs" : "ps");
+
+  // TODO(benvanik): dump to disk so tools can find it.
+
+  // Compile shader to bytecode blob.
+  ID3D10Blob* shader_blob = 0;
+  ID3D10Blob* error_blob = 0;
+  HRESULT hr = D3DX11CompileFromMemory(
+      shader_source, strlen(shader_source),
+      file_name,
+      defines, NULL,
+      "main",
+      type_ == XE_GPU_SHADER_TYPE_VERTEX ?
+          "vs_5_0" : "ps_5_0",
+      flags1, flags2,
+      NULL,
+      &shader_blob, &error_blob, NULL);
+  if (error_blob) {
+    char* msg = (char*)error_blob->GetBufferPointer();
+    XELOGE("D3D11: shader compile failed with %s", msg);
+  }
+  XESAFERELEASE(error_blob);
+  if (FAILED(hr)) {
+    return NULL;
+  }
+  return shader_blob;
+}
+
 
 D3D11VertexShader::D3D11VertexShader(
     ID3D11Device* device,
@@ -54,13 +138,24 @@ int D3D11VertexShader::Prepare(xe_gpu_program_cntl_t* program_cntl) {
     return 0;
   }
 
+  // TODO(benvanik): look in file based on hash/etc.
   void* byte_code = NULL;
   size_t byte_code_length = 0;
 
-
-  if (!byte_code) {
+  // Translate and compile source.
+  const char* shader_source = Translate(program_cntl);
+  if (!shader_source) {
     return 1;
   }
+  ID3D10Blob* shader_blob = Compile(shader_source);
+  if (!shader_blob) {
+    return 1;
+  }
+  byte_code_length = shader_blob->GetBufferSize();
+  byte_code = xe_malloc(byte_code_length);
+  xe_copy_struct(
+      byte_code, shader_blob->GetBufferPointer(), byte_code_length);
+  XESAFERELEASE(shader_blob);
 
   // Create shader.
   HRESULT hr = device_->CreateVertexShader(
@@ -169,10 +264,10 @@ int D3D11VertexShader::Prepare(xe_gpu_program_cntl_t* program_cntl) {
       XEASSERTALWAYS();
       break;
     }
-    element_descs[n].SemanticName         = "XEVF";
+    element_descs[n].SemanticName         = "XE_VF";
     element_descs[n].SemanticIndex        = n;
     element_descs[n].Format               = vtx_format;
-    // TODO(benvanik): pick slot in same way that driver does.
+    // Pick slot in same way that driver does.
     // CONST(31, 2) = reg 31, index 2 = rf([31] * 6 + [2] * 2)
     uint32_t fetch_slot = vtx.const_index * 3 + vtx.const_index_sel;
     uint32_t vb_slot = 95 - fetch_slot;
@@ -199,6 +294,94 @@ int D3D11VertexShader::Prepare(xe_gpu_program_cntl_t* program_cntl) {
 }
 
 
+const char* D3D11VertexShader::Translate(xe_gpu_program_cntl_t* program_cntl) {
+  Output* output = new Output();
+  xe_gpu_translate_ctx_t ctx;
+  ctx.output  = output;
+  ctx.type    = type_;
+
+  // Add constants buffers.
+  // We could optimize this by only including used buffers, but the compiler
+  // seems to do a good job of doing this for us.
+  // It also does read detection, so c[512] can end up c[4] in the asm -
+  // instead of doing this optimization ourselves we could maybe just query
+  // this from the compiler.
+  output->append(
+    "cbuffer float_consts : register(b0) {\n"
+    "  float4 c[512];\n"
+    "};\n");
+  // TODO(benvanik): add bool/loop constants.
+
+  // Add vertex shader input.
+  output->append(
+    "struct VS_INPUT {\n");
+  int n = 0;
+  for (std::vector<instr_fetch_vtx_t>::iterator it = fetch_vtxs_.begin();
+       it != fetch_vtxs_.end(); ++it, ++n) {
+    const instr_fetch_vtx_t& vtx = *it;
+    uint32_t fetch_slot = vtx.const_index * 3 + vtx.const_index_sel;
+    output->append(
+      "  float4 vf%u_%d : XE_VF%u;\n", fetch_slot, vtx.offset, n);
+  }
+  output->append(
+    "};\n");
+
+  // Add vertex shader output (pixel shader input).
+  output->append(
+    "struct VS_OUTPUT {\n");
+  if (alloc_counts_.positions) {
+    XEASSERT(alloc_counts_.positions == 1);
+    output->append(
+      "  float4 oPos : SV_POSITION;\n");
+  }
+  if (alloc_counts_.params) {
+    output->append(
+      "  float4 o[%d] : XE_O;\n",
+      alloc_counts_.params);
+  }
+  output->append(
+    "};\n");
+
+  // Vertex shader main() header.
+  output->append(
+    "VS_OUTPUT main(VS_INPUT i) {\n"
+    "  VS_OUTPUT o;\n");
+
+  // TODO(benvanik): remove this, if possible (though the compiler may be smart
+  //     enough to do it for us).
+  for (uint32_t n = 0; n < alloc_counts_.params; n++) {
+    output->append(
+      "  o.o[%d] = float4(0.0, 0.0, 0.0, 0.0);\n", n);
+  }
+
+  // Add temporaries for any registers we may use.
+  for (uint32_t n = 0; n <= program_cntl->vs_regs; n++) {
+    output->append(
+      "  float4 r%d;\n", n);
+  }
+
+  // Execute blocks.
+  for (std::vector<instr_cf_exec_t>::iterator it = execs_.begin();
+       it != execs_.end(); ++it) {
+    instr_cf_exec_t& cf = *it;
+    // TODO(benvanik): figure out how sequences/jmps/loops/etc work.
+    if (TranslateExec(ctx, cf)) {
+      delete output;
+      return NULL;
+    }
+  }
+
+  // main footer.
+  output->append(
+    "  return o;\n"
+    "};\n");
+
+  set_translated_src(output->buffer);
+  delete output;
+  return translated_src_;
+}
+
+
 D3D11PixelShader::D3D11PixelShader(
     ID3D11Device* device,
     const uint8_t* src_ptr, size_t length,
@@ -212,18 +395,30 @@ D3D11PixelShader::~D3D11PixelShader() {
   XESAFERELEASE(handle_);
 }
 
-int D3D11PixelShader::Prepare(xe_gpu_program_cntl_t* program_cntl) {
+int D3D11PixelShader::Prepare(xe_gpu_program_cntl_t* program_cntl,
+                              D3D11VertexShader* input_shader) {
   if (handle_) {
     return 0;
   }
 
+  // TODO(benvanik): look in file based on hash/etc.
   void* byte_code = NULL;
   size_t byte_code_length = 0;
 
-
-  if (!byte_code) {
+  // Translate and compile source.
+  const char* shader_source = Translate(program_cntl, input_shader);
+  if (!shader_source) {
     return 1;
   }
+  ID3D10Blob* shader_blob = Compile(shader_source);
+  if (!shader_blob) {
+    return 1;
+  }
+  byte_code_length = shader_blob->GetBufferSize();
+  byte_code = xe_malloc(byte_code_length);
+  xe_copy_struct(
+      byte_code, shader_blob->GetBufferPointer(), byte_code_length);
+  XESAFERELEASE(shader_blob);
 
   // Create shader.
   HRESULT hr = device_->CreatePixelShader(
@@ -241,3 +436,828 @@ int D3D11PixelShader::Prepare(xe_gpu_program_cntl_t* program_cntl) {
   is_prepared_ = true;
   return 0;
 }
+
+const char* D3D11PixelShader::Translate(
+    xe_gpu_program_cntl_t* program_cntl, D3D11VertexShader* input_shader) {
+  Output* output = new Output();
+  xe_gpu_translate_ctx_t ctx;
+  ctx.output  = output;
+  ctx.type    = type_;
+
+  // We need an input VS to make decisions here.
+  // TODO(benvanik): do we need to pair VS/PS up and store the combination?
+  // If the same PS is used with different VS that output different amounts
+  // (and less than the number of required registers), things may die.
+  XEASSERTNOTNULL(input_shader);
+  const Shader::alloc_counts_t& input_alloc_counts =
+      input_shader->alloc_counts();
+
+  // Add constants buffers.
+  // We could optimize this by only including used buffers, but the compiler
+  // seems to do a good job of doing this for us.
+  // It also does read detection, so c[512] can end up c[4] in the asm -
+  // instead of doing this optimization ourselves we could maybe just query
+  // this from the compiler.
+  output->append(
+    "cbuffer float_consts : register(b0) {\n"
+    "  float4 c[512];\n"
+    "};\n");
+  // TODO(benvanik): add bool/loop constants.
+
+  // Add vertex shader output (pixel shader input).
+  output->append(
+    "struct VS_OUTPUT {\n");
+  if (input_alloc_counts.positions) {
+    XEASSERT(input_alloc_counts.positions == 1);
+    output->append(
+      "  float4 oPos : SV_POSITION;\n");
+  }
+  if (input_alloc_counts.params) {
+    output->append(
+      "  float4 o[%d] : XE_O;\n",
+      input_alloc_counts.params);
+  }
+  output->append(
+    "};\n");
+
+  // Add pixel shader output.
+  output->append(
+    "struct PS_OUTPUT {\n");
+  for (uint32_t n = 0; n < alloc_counts_.params; n++) {
+    output->append(
+      "  float4 oC%d   : SV_TARGET%d;\n", n, n);
+    if (program_cntl->ps_export_depth) {
+      // Is this per render-target?
+      output->append(
+        "  float oD%d   : SV_DEPTH%d;\n", n, n);
+    }
+  }
+  output->append(
+    "};\n");
+
+  // Pixel shader main() header.
+  output->append(
+    "PS_OUTPUT main(VS_OUTPUT i) {\n"
+    "  PS_OUTPUT o;\n");
+
+  // Add temporary registers.
+  for (uint32_t n = 0; n <= program_cntl->ps_regs; n++) {
+    output->append(
+      "  float4 r%d;\n", n);
+  }
+
+  // Bring registers local.
+  for (uint32_t n = 0; n < input_alloc_counts.params; n++) {
+    output->append(
+      "  r%d = i.o[%d];\n", n, n);
+  }
+
+  // Execute blocks.
+  for (std::vector<instr_cf_exec_t>::iterator it = execs_.begin();
+       it != execs_.end(); ++it) {
+    instr_cf_exec_t& cf = *it;
+    // TODO(benvanik): figure out how sequences/jmps/loops/etc work.
+    if (TranslateExec(ctx, cf)) {
+      delete output;
+      return NULL;
+    }
+  }
+
+  // main footer.
+  output->append(
+    "  return o;\n"
+    "}\n");
+
+  set_translated_src(output->buffer);
+  delete output;
+  return translated_src_;
+}
+
+
+namespace {
+
+static const char chan_names[] = {  // 3-bit fetch swizzles index all 8 slots.
+  'x', 'y', 'z', 'w', '0', '1', '?', '_'
+};
+
+void AppendSrcReg(
+    xe_gpu_translate_ctx_t& ctx,
+    uint32_t num, uint32_t type,
+    uint32_t swiz, uint32_t negate, uint32_t abs) {
+  if (negate) {
+    ctx.output->append("-");
+  }
+  if (abs) {
+    ctx.output->append("abs(");
+  }
+  if (type) {
+    // Register.
+    ctx.output->append("r%u", num);
+  } else {
+    // Constant.
+    ctx.output->append("c[%u]", num);
+  }
+  if (swiz) {
+    ctx.output->append(".");
+    for (int i = 0; i < 4; i++) {
+      ctx.output->append("%c", chan_names[(swiz + i) & 0x3]);
+      swiz >>= 2;
+    }
+  }
+  if (abs) {
+    ctx.output->append(")");
+  }
+}
+
+// Appends the HLSL destination for an ALU write: rN, or an export target.
+void AppendDestReg(
+    xe_gpu_translate_ctx_t& ctx,
+    uint32_t num, uint32_t mask, uint32_t dst_exp) {
+  if (!dst_exp) {
+    // Register.
+    ctx.output->append("r%u", num);
+  } else {
+    // Export.
+    switch (ctx.type) {
+    case XE_GPU_SHADER_TYPE_VERTEX:
+      switch (num) {
+      case 62:
+        ctx.output->append("o.oPos");
+        break;
+      case 63:
+        ctx.output->append("o.point_size");  // NOTE(review): not in VS_OUTPUT
+        break;
+      default:
+        // Varying.
+        ctx.output->append("o.o[%u]", num);
+        break;
+      }
+      break;
+    case XE_GPU_SHADER_TYPE_PIXEL:
+      switch (num) {
+      case 0:
+        ctx.output->append("o.oC0");
+        break;
+      default:
+        // TODO(benvanik): other render targets? depth?
+        XEASSERTALWAYS();
+        break;
+      }
+      break;
+    }
+  }
+  if (mask != 0xf) {  // partial write: emit one character per channel
+    ctx.output->append(".");
+    for (int i = 0; i < 4; i++) {  // NOTE(review): '_' looks invalid in HLSL
+      ctx.output->append("%c", (mask & 0x1) ? chan_names[i] : '_');
+      mask >>= 1;
+    }
+  }
+}
+
+void print_srcreg(
+    Output* output,
+    uint32_t num, uint32_t type,
+    uint32_t swiz, uint32_t negate, uint32_t abs) {
+  if (negate) {
+    output->append("-");
+  }
+  if (abs) {
+    output->append("|");
+  }
+  output->append("%c%u", type ? 'R' : 'C', num);
+  if (swiz) {
+    output->append(".");
+    for (int i = 0; i < 4; i++) {
+      output->append("%c", chan_names[(swiz + i) & 0x3]);
+      swiz >>= 2;
+    }
+  }
+  if (abs) {
+    output->append("|");
+  }
+}
+
+void print_dstreg(
+    Output* output, uint32_t num, uint32_t mask, uint32_t dst_exp) {
+  output->append("%s%u", dst_exp ? "export" : "R", num);
+  if (mask != 0xf) {
+    output->append(".");
+    for (int i = 0; i < 4; i++) {
+      output->append("%c", (mask & 0x1) ? chan_names[i] : '_');
+      mask >>= 1;
+    }
+  }
+}
+
+void print_export_comment(
+    Output* output, uint32_t num, XE_GPU_SHADER_TYPE type) {
+  const char *name = NULL;
+  switch (type) {
+  case XE_GPU_SHADER_TYPE_VERTEX:
+    switch (num) {
+    case 62: name = "gl_Position";  break;
+    case 63: name = "gl_PointSize"; break;
+    }
+    break;
+  case XE_GPU_SHADER_TYPE_PIXEL:
+    switch (num) {
+    case 0:  name = "gl_FragColor"; break;
+    }
+    break;
+  }
+  /* if we had a symbol table here, we could look
+   * up the name of the varying..
+   */
+  if (name) {
+    output->append("\t; %s", name);
+  }
+}
+
+int TranslateALU_ADDv(
+    xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) {
+  AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data);
+  ctx.output->append(" = ");
+  if (alu.vector_clamp) {
+    ctx.output->append("saturate(");
+  }
+  ctx.output->append("(");
+  AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs);
+  ctx.output->append(" + ");
+  AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs);
+  ctx.output->append(")");
+  if (alu.vector_clamp) {
+    ctx.output->append(")");
+  }
+  ctx.output->append(";\n");
+  return 0;
+}
+
+int TranslateALU_MULv(
+    xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) {
+  AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data);
+  ctx.output->append(" = ");
+  if (alu.vector_clamp) {
+    ctx.output->append("saturate(");
+  }
+  ctx.output->append("(");
+  AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs);
+  ctx.output->append(" * ");
+  AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs);
+  ctx.output->append(")");
+  if (alu.vector_clamp) {
+    ctx.output->append(")");
+  }
+  ctx.output->append(";\n");
+  return 0;
+}
+
+int TranslateALU_MAXv(
+    xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) {
+  AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data);
+  ctx.output->append(" = ");
+  if (alu.vector_clamp) {
+    ctx.output->append("saturate(");
+  }
+  if (alu.src1_reg == alu.src2_reg &&
+      alu.src1_sel == alu.src2_sel &&
+      alu.src1_swiz == alu.src2_swiz &&
+      alu.src1_reg_negate == alu.src2_reg_negate &&
+      alu.src1_reg_abs == alu.src2_reg_abs) {
+    // This is a mov.
+    AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs);
+  } else {
+    ctx.output->append("max(");
+    AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs);
+    ctx.output->append(", ");
+    AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs);
+    ctx.output->append(")");
+  }
+  if (alu.vector_clamp) {
+    ctx.output->append(")");
+  }
+  ctx.output->append(";\n");
+  return 0;
+}
+
+int TranslateALU_MINv(
+    xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) {
+  AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data);
+  ctx.output->append(" = ");
+  if (alu.vector_clamp) {
+    ctx.output->append("saturate(");
+  }
+  ctx.output->append("min(");
+  AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs);
+  ctx.output->append(", ");
+  AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs);
+  ctx.output->append(")");
+  if (alu.vector_clamp) {
+    ctx.output->append(")");
+  }
+  ctx.output->append(";\n");
+  return 0;
+}
+
+int TranslateALU_FRACv(
+    xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) {
+  AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data);
+  ctx.output->append(" = ");
+  if (alu.vector_clamp) {
+    ctx.output->append("saturate(");
+  }
+  ctx.output->append("frac(");
+  AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs);
+  ctx.output->append(")");
+  if (alu.vector_clamp) {
+    ctx.output->append(")");
+  }
+  ctx.output->append(";\n");
+  return 0;
+}
+
+int TranslateALU_TRUNCv(
+    xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) {
+  AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data);
+  ctx.output->append(" = ");
+  if (alu.vector_clamp) {
+    ctx.output->append("saturate(");
+  }
+  ctx.output->append("trunc(");
+  AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs);
+  ctx.output->append(")");
+  if (alu.vector_clamp) {
+    ctx.output->append(")");
+  }
+  ctx.output->append(";\n");
+  return 0;
+}
+
+int TranslateALU_FLOORv(
+    xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) {
+  AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data);
+  ctx.output->append(" = ");
+  if (alu.vector_clamp) {
+    ctx.output->append("saturate(");
+  }
+  ctx.output->append("floor(");
+  AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs);
+  ctx.output->append(")");
+  if (alu.vector_clamp) {
+    ctx.output->append(")");
+  }
+  ctx.output->append(";\n");
+  return 0;
+}
+
+// ...
+
+int TranslateALU_MULADDv(
+    xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) {
+  AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data);
+  ctx.output->append(" = ");
+  if (alu.vector_clamp) {
+    ctx.output->append("saturate(");
+  }
+  ctx.output->append("mad(");
+  // TODO(benvanik): verify correct - may be 1,2,3 for (1*2+3)
+  AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs);
+  ctx.output->append(", ");
+  AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs);
+  ctx.output->append(", ");
+  AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs);
+  ctx.output->append(")");
+  if (alu.vector_clamp) {
+    ctx.output->append(")");
+  }
+  ctx.output->append(";\n");
+  return 0;
+}
+
+typedef int (*xe_gpu_translate_alu_fn)(
+    xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu);
+typedef struct {
+  uint32_t    num_srcs;
+  const char* name;
+  xe_gpu_translate_alu_fn   fn;
+} xe_gpu_translate_alu_info_t;
+#define ALU_INSTR(opc, num_srcs) \
+    { num_srcs, #opc, 0 }
+#define ALU_INSTR_IMPL(opc, num_srcs) \
+    { num_srcs, #opc, TranslateALU_##opc }
+static xe_gpu_translate_alu_info_t vector_alu_instrs[0x20] = {
+  ALU_INSTR_IMPL(ADDv,               2),  // 0
+  ALU_INSTR_IMPL(MULv,               2),  // 1
+  ALU_INSTR_IMPL(MAXv,               2),  // 2
+  ALU_INSTR_IMPL(MINv,               2),  // 3
+  ALU_INSTR(SETEv,              2),  // 4
+  ALU_INSTR(SETGTv,             2),  // 5
+  ALU_INSTR(SETGTEv,            2),  // 6
+  ALU_INSTR(SETNEv,             2),  // 7
+  ALU_INSTR_IMPL(FRACv,              1),  // 8
+  ALU_INSTR_IMPL(TRUNCv,             1),  // 9
+  ALU_INSTR_IMPL(FLOORv,             1),  // 10
+  ALU_INSTR_IMPL(MULADDv,            3),  // 11
+  ALU_INSTR(CNDEv,              3),  // 12
+  ALU_INSTR(CNDGTEv,            3),  // 13
+  ALU_INSTR(CNDGTv,             3),  // 14
+  ALU_INSTR(DOT4v,              2),  // 15
+  ALU_INSTR(DOT3v,              2),  // 16
+  ALU_INSTR(DOT2ADDv,           3),  // 17 -- ???
+  ALU_INSTR(CUBEv,              2),  // 18
+  ALU_INSTR(MAX4v,              1),  // 19
+  ALU_INSTR(PRED_SETE_PUSHv,    2),  // 20
+  ALU_INSTR(PRED_SETNE_PUSHv,   2),  // 21
+  ALU_INSTR(PRED_SETGT_PUSHv,   2),  // 22
+  ALU_INSTR(PRED_SETGTE_PUSHv,  2),  // 23
+  ALU_INSTR(KILLEv,             2),  // 24
+  ALU_INSTR(KILLGTv,            2),  // 25
+  ALU_INSTR(KILLGTEv,           2),  // 26
+  ALU_INSTR(KILLNEv,            2),  // 27
+  ALU_INSTR(DSTv,               2),  // 28
+  ALU_INSTR(MOVAv,              1),  // 29
+};
+static xe_gpu_translate_alu_info_t scalar_alu_instrs[0x40] = {  // by scalar_opc
+  ALU_INSTR(ADDs,               1),  // 0
+  ALU_INSTR(ADD_PREVs,          1),  // 1
+  ALU_INSTR(MULs,               1),  // 2
+  ALU_INSTR(MUL_PREVs,          1),  // 3
+  ALU_INSTR(MUL_PREV2s,         1),  // 4
+  ALU_INSTR(MAXs,               1),  // 5
+  ALU_INSTR(MINs,               1),  // 6
+  ALU_INSTR(SETEs,              1),  // 7
+  ALU_INSTR(SETGTs,             1),  // 8
+  ALU_INSTR(SETGTEs,            1),  // 9
+  ALU_INSTR(SETNEs,             1),  // 10
+  ALU_INSTR(FRACs,              1),  // 11
+  ALU_INSTR(TRUNCs,             1),  // 12
+  ALU_INSTR(FLOORs,             1),  // 13
+  ALU_INSTR(EXP_IEEE,           1),  // 14
+  ALU_INSTR(LOG_CLAMP,          1),  // 15
+  ALU_INSTR(LOG_IEEE,           1),  // 16
+  ALU_INSTR(RECIP_CLAMP,        1),  // 17
+  ALU_INSTR(RECIP_FF,           1),  // 18
+  ALU_INSTR(RECIP_IEEE,         1),  // 19
+  ALU_INSTR(RECIPSQ_CLAMP,      1),  // 20
+  ALU_INSTR(RECIPSQ_FF,         1),  // 21
+  ALU_INSTR(RECIPSQ_IEEE,       1),  // 22
+  ALU_INSTR(MOVAs,              1),  // 23
+  ALU_INSTR(MOVA_FLOORs,        1),  // 24
+  ALU_INSTR(SUBs,               1),  // 25
+  ALU_INSTR(SUB_PREVs,          1),  // 26
+  ALU_INSTR(PRED_SETEs,         1),  // 27
+  ALU_INSTR(PRED_SETNEs,        1),  // 28
+  ALU_INSTR(PRED_SETGTs,        1),  // 29
+  ALU_INSTR(PRED_SETGTEs,       1),  // 30
+  ALU_INSTR(PRED_SET_INVs,      1),  // 31
+  ALU_INSTR(PRED_SET_POPs,      1),  // 32
+  ALU_INSTR(PRED_SET_CLRs,      1),  // 33
+  ALU_INSTR(PRED_SET_RESTOREs,  1),  // 34
+  ALU_INSTR(KILLEs,             1),  // 35
+  ALU_INSTR(KILLGTs,            1),  // 36
+  ALU_INSTR(KILLGTEs,           1),  // 37
+  ALU_INSTR(KILLNEs,            1),  // 38
+  ALU_INSTR(KILLONEs,           1),  // 39
+  ALU_INSTR(SQRT_IEEE,          1),  // 40
+  { 0, 0, 0 },                       // 41 -- no name; fn must be a null pointer
+  ALU_INSTR(MUL_CONST_0,        1),  // 42
+  ALU_INSTR(MUL_CONST_1,        1),  // 43
+  ALU_INSTR(ADD_CONST_0,        1),  // 44
+  ALU_INSTR(ADD_CONST_1,        1),  // 45
+  ALU_INSTR(SUB_CONST_0,        1),  // 46
+  ALU_INSTR(SUB_CONST_1,        1),  // 47
+  ALU_INSTR(SIN,                1),  // 48
+  ALU_INSTR(COS,                1),  // 49
+  ALU_INSTR(RETAIN_PREV,        1),  // 50
+};
+#undef ALU_INSTR
+
+// Emits a disassembly comment plus HLSL for one ALU instruction; 1 on failure.
+int TranslateALU(
+    xe_gpu_translate_ctx_t& ctx, const instr_alu_t* alu, int sync) {
+  Output* output = ctx.output;
+
+  if (!alu->scalar_write_mask && !alu->vector_write_mask) {
+    output->append("  //   <nop>\n");
+    return 0;
+  }
+
+  if (alu->vector_write_mask) {
+    // Disassemble vector op.
+    xe_gpu_translate_alu_info_t& iv = vector_alu_instrs[alu->vector_opc];
+    output->append("  //   %sALU:\t", sync ? "(S)" : "   ");
+    output->append("%s", iv.name ? iv.name : "<unknown>");  // unlisted slot
+    if (alu->pred_select & 0x2) {
+      // seems to work similar to conditional execution in ARM instruction
+      // set, so let's use a similar syntax for now:
+      output->append((alu->pred_select & 0x1) ? "EQ" : "NE");
+    }
+    output->append("\t");
+    print_dstreg(output,
+                  alu->vector_dest, alu->vector_write_mask, alu->export_data);
+    output->append(" = ");
+    if (iv.num_srcs == 3) {
+      print_srcreg(output,
+                    alu->src3_reg, alu->src3_sel, alu->src3_swiz,
+                    alu->src3_reg_negate, alu->src3_reg_abs);
+      output->append(", ");
+    }
+    print_srcreg(output,
+                  alu->src1_reg, alu->src1_sel, alu->src1_swiz,
+                  alu->src1_reg_negate, alu->src1_reg_abs);
+    if (iv.num_srcs > 1) {
+      output->append(", ");
+      print_srcreg(output,
+                    alu->src2_reg, alu->src2_sel, alu->src2_swiz,
+                    alu->src2_reg_negate, alu->src2_reg_abs);
+    }
+    if (alu->vector_clamp) {
+      output->append(" CLAMP");
+    }
+    if (alu->export_data) {
+      print_export_comment(output, alu->vector_dest, ctx.type);
+    }
+    output->append("\n");
+
+    // Translate vector op.
+    if (iv.fn) {
+      output->append("  ");
+      if (iv.fn(ctx, *alu)) {
+        return 1;
+      }
+    } else {
+      output->append("  // <UNIMPLEMENTED>\n");
+    }
+  }
+
+  if (alu->scalar_write_mask || !alu->vector_write_mask) {
+    // 2nd optional scalar op:
+    // Disassemble scalar op.
+    xe_gpu_translate_alu_info_t& is = scalar_alu_instrs[alu->scalar_opc];
+    output->append("  //  ");
+    output->append("                          \t");
+    if (is.name) {
+      output->append("\t    \t%s\t", is.name);
+    } else {
+      output->append("\t    \tOP(%u)\t", alu->scalar_opc);
+    }
+    print_dstreg(output,
+                 alu->scalar_dest, alu->scalar_write_mask, alu->export_data);
+    output->append(" = ");
+    print_srcreg(output,
+                 alu->src3_reg, alu->src3_sel, alu->src3_swiz,
+                 alu->src3_reg_negate, alu->src3_reg_abs);
+    // TODO ADD/MUL must have another src?!?
+    if (alu->scalar_clamp) {
+      output->append(" CLAMP");
+    }
+    if (alu->export_data) {
+      print_export_comment(output, alu->scalar_dest, ctx.type);
+    }
+    output->append("\n");
+
+    // Translate scalar op.
+    if (is.fn) {
+      output->append("  ");
+      if (is.fn(ctx, *alu)) {
+        return 1;
+      }
+    } else {
+      output->append("  // <UNIMPLEMENTED>\n");
+    }
+  }
+
+  return 0;
+}
+
+struct {
+  const char *name;
+} fetch_types[0xff] = {
+#define TYPE(id) { #id }
+    TYPE(FMT_1_REVERSE), // 0
+    {0},
+    TYPE(FMT_8), // 2
+    {0},
+    {0},
+    {0},
+    TYPE(FMT_8_8_8_8), // 6
+    {0},
+    {0},
+    {0},
+    TYPE(FMT_8_8), // 10
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    TYPE(FMT_16), // 24
+    TYPE(FMT_16_16), // 25
+    TYPE(FMT_16_16_16_16), // 26
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    TYPE(FMT_32), // 33
+    TYPE(FMT_32_32), // 34
+    TYPE(FMT_32_32_32_32), // 35
+    TYPE(FMT_32_FLOAT), // 36
+    TYPE(FMT_32_32_FLOAT), // 37
+    TYPE(FMT_32_32_32_32_FLOAT), // 38
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    {0},
+    TYPE(FMT_32_32_32_FLOAT), // 57
+#undef TYPE
+};
+
+void print_fetch_dst(Output* output, uint32_t dst_reg, uint32_t dst_swiz) {  // disassembly text for a fetch destination
+  output->append("\tR%u.", dst_reg);
+  for (int i = 0; i < 4; i++) {  // one 3-bit selector per destination channel
+    output->append("%c", chan_names[dst_swiz & 0x7]);  // NOTE(review): index spans 0..7 -- confirm chan_names holds 8 entries
+    dst_swiz >>= 3;
+  }
+}
+
+void AppendFetchDest(Output* output, uint32_t dst_reg, uint32_t dst_swiz) {  // lower-case "rN.<4 chars>" form of a fetch destination
+  output->append("r%u.", dst_reg);
+  for (int i = 0; i < 4; i++) {  // one 3-bit selector per destination channel
+    output->append("%c", chan_names[dst_swiz & 0x7]);  // NOTE(review): index spans 0..7 -- confirm chan_names holds 8 entries
+    dst_swiz >>= 3;
+  }
+}
+
+// Emits a disassembly comment for a vertex fetch, then HLSL that loads it into
+int TranslateVertexFetch(  // the destination register. Always returns 0.
+    xe_gpu_translate_ctx_t& ctx, const instr_fetch_vtx_t* vtx, int sync) {
+  Output* output = ctx.output;
+  // Disassemble: one '//' comment line placed ahead of the translated code.
+  output->append("  //   %sFETCH:\t", sync ? "(S)" : "   ");
+  if (vtx->pred_select) {
+    output->append(vtx->pred_condition ? "EQ" : "NE");
+  }
+  print_fetch_dst(output, vtx->dst_reg, vtx->dst_swiz);
+  output->append(" = R%u.", vtx->src_reg);
+  output->append("%c", chan_names[vtx->src_swiz & 0x3]);
+  if (fetch_types[vtx->format].name) {
+    output->append(" %s", fetch_types[vtx->format].name);
+  } else  {
+    output->append(" TYPE(0x%x)", vtx->format);  // format without a table name
+  }
+  output->append(" %s", vtx->format_comp_all ? "SIGNED" : "UNSIGNED");
+  if (!vtx->num_format_all) {
+    output->append(" NORMALIZED");
+  }
+  output->append(" STRIDE(%u)", vtx->stride);
+  if (vtx->offset) {
+    output->append(" OFFSET(%u)", vtx->offset);
+  }
+  output->append(" CONST(%u, %u)", vtx->const_index, vtx->const_index_sel);
+  if (1) {
+    // XXX: always-on dump of additional raw fetch fields.
+    output->append(" src_reg_am=%u", vtx->src_reg_am);
+    output->append(" dst_reg_am=%u", vtx->dst_reg_am);
+    output->append(" num_format_all=%u", vtx->num_format_all);
+    output->append(" signed_rf_mode_all=%u", vtx->signed_rf_mode_all);
+    output->append(" exp_adjust_all=%u", vtx->exp_adjust_all);
+  }
+  output->append("\n");
+
+  // Translate: assign the input field i.vf<slot>_<offset> to rN.xyzw.
+  output->append("  ");
+  output->append("r%u.xyzw", vtx->dst_reg);
+  output->append(" = ");
+  uint32_t fetch_slot = vtx->const_index * 3 + vtx->const_index_sel;
+  output->append("i.vf%u_%d.", fetch_slot, vtx->offset);
+  // Pass one over dest does xyzw and fakes the special values.
+  // TODO(benvanik): detect and set as rN = float4(samp.xyz, 1.0); / etc
+  uint32_t dst_swiz = vtx->dst_swiz;
+  for (int i = 0; i < 4; i++) {
+    output->append("%c", chan_names[dst_swiz & 0x3]);  // low 2 bits only, so always x/y/z/w
+    dst_swiz >>= 3;
+  }
+  output->append(";\n");
+  // Do another pass to set constant values.
+  dst_swiz = vtx->dst_swiz;
+  for (int i = 0; i < 4; i++) {
+    if ((dst_swiz & 0x7) == 4) {  // selector 4: channel forced to 0.0
+      output->append("  r%u.%c = 0.0;\n", vtx->dst_reg, chan_names[i]);
+    } else if ((dst_swiz & 0x7) == 5) {  // selector 5: channel forced to 1.0
+      output->append("  r%u.%c = 1.0;\n", vtx->dst_reg, chan_names[i]);
+    }
+    dst_swiz >>= 3;
+  }
+  return 0;
+}
+
+struct {
+  const char *name;
+} cf_instructions[] = {
+#define INSTR(opc, fxn) { #opc }
+    INSTR(NOP, print_cf_nop),
+    INSTR(EXEC, print_cf_exec),
+    INSTR(EXEC_END, print_cf_exec),
+    INSTR(COND_EXEC, print_cf_exec),
+    INSTR(COND_EXEC_END, print_cf_exec),
+    INSTR(COND_PRED_EXEC, print_cf_exec),
+    INSTR(COND_PRED_EXEC_END, print_cf_exec),
+    INSTR(LOOP_START, print_cf_loop),
+    INSTR(LOOP_END, print_cf_loop),
+    INSTR(COND_CALL, print_cf_jmp_call),
+    INSTR(RETURN, print_cf_jmp_call),
+    INSTR(COND_JMP, print_cf_jmp_call),
+    INSTR(ALLOC, print_cf_alloc),
+    INSTR(COND_EXEC_PRED_CLEAN, print_cf_exec),
+    INSTR(COND_EXEC_PRED_CLEAN_END, print_cf_exec),
+    INSTR(MARK_VS_FETCH_DONE, print_cf_nop),  // ??
+#undef INSTR
+};
+
+}  // anonymous namespace
+
+
+// Translates one exec block; returns 1 if any instruction fails, else 0.
+int D3D11Shader::TranslateExec(xe_gpu_translate_ctx_t& ctx, const instr_cf_exec_t& cf) {
+  Output* output = ctx.output;
+  output->append(  // summary of the control-flow instruction, as a comment
+    "  // %s ADDR(0x%x) CNT(0x%x)",
+    cf_instructions[cf.opc].name, cf.address, cf.count);
+  if (cf.yeild) {
+    output->append(" YIELD");
+  }
+  uint8_t vc = cf.vc_hi | (cf.vc_lo << 2);
+  if (vc) {
+    output->append(" VC(0x%x)", vc);
+  }
+  if (cf.bool_addr) {
+    output->append(" BOOL_ADDR(0x%x)", cf.bool_addr);
+  }
+  if (cf.address_mode == ABSOLUTE_ADDR) {
+    output->append(" ABSOLUTE_ADDR");
+  }
+  if (cf.is_cond_exec()) {
+    output->append(" COND(%d)", cf.condition);
+  }
+  output->append("\n");
+  // Low two bits of sequence describe the current instruction (shifted below).
+  uint32_t sequence = cf.serialize;
+  for (uint32_t i = 0; i < cf.count; i++) {
+    uint32_t alu_off = (cf.address + i);  // instruction index; 3 dwords each
+    int sync = sequence & 0x2;
+    if (sequence & 0x1) {  // bit 0 set: fetch instruction, otherwise ALU
+      const instr_fetch_t* fetch =
+          (const instr_fetch_t*)(dwords_ + alu_off * 3);
+      switch (fetch->opc) {
+      case VTX_FETCH:
+        if (TranslateVertexFetch(ctx, &fetch->vtx, sync)) {
+          return 1;
+        }
+        break;
+      case TEX_FETCH:
+      case TEX_GET_BORDER_COLOR_FRAC:
+      case TEX_GET_COMP_TEX_LOD:
+      case TEX_GET_GRADIENTS:
+      case TEX_GET_WEIGHTS:
+      case TEX_SET_TEX_LOD:
+      case TEX_SET_GRADIENTS_H:
+      case TEX_SET_GRADIENTS_V:
+      default:
+        XEASSERTALWAYS();  // texture fetches emit nothing here
+        break;
+      }
+    } else {
+      const instr_alu_t* alu =
+          (const instr_alu_t*)(dwords_ + alu_off * 3);
+      if (TranslateALU(ctx, alu, sync)) {
+        return 1;
+      }
+    }
+    sequence >>= 2;
+  }
+
+  return 0;
+}
diff --git a/src/xenia/gpu/d3d11/d3d11_shader.h b/src/xenia/gpu/d3d11/d3d11_shader.h
index 1c06358a5..79ecd9460 100644
--- a/src/xenia/gpu/d3d11/d3d11_shader.h
+++ b/src/xenia/gpu/d3d11/d3d11_shader.h
@@ -22,6 +22,13 @@ namespace xe {
 namespace gpu {
 namespace d3d11 {
 
+struct Output;
+
+typedef struct {  // State passed by reference to D3D11Shader::TranslateExec.
+  Output*       output;  // Buffer that translated text is appended to.
+  xenos::XE_GPU_SHADER_TYPE type;  // presumably the type of the shader being translated — TODO confirm
+} xe_gpu_translate_ctx_t;
+
 
 class D3D11Shader : public Shader {
 public:
@@ -34,8 +41,18 @@ protected:
       const uint8_t* src_ptr, size_t length,
       uint64_t hash);
 
+  const char* translated_src() const { return translated_src_; }
+  void set_translated_src(char* value);
+
+  int TranslateExec(
+      xe_gpu_translate_ctx_t& ctx, const xenos::instr_cf_exec_t& cf);
+
+  ID3D10Blob* Compile(const char* shader_source);
+
 protected:
   ID3D11Device* device_;
+
+  char*   translated_src_;
 };
 
 
@@ -52,6 +69,9 @@ public:
 
   int Prepare(xenos::xe_gpu_program_cntl_t* program_cntl);
 
+private:
+  const char* Translate(xenos::xe_gpu_program_cntl_t* program_cntl);
+
 private:
   ID3D11VertexShader* handle_;
   ID3D11InputLayout*  input_layout_;
@@ -68,7 +88,12 @@ public:
 
   ID3D11PixelShader* handle() const { return handle_; }
 
-  int Prepare(xenos::xe_gpu_program_cntl_t* program_cntl);
+  int Prepare(xenos::xe_gpu_program_cntl_t* program_cntl,
+              D3D11VertexShader* input_shader);
+
+private:
+  const char* Translate(xenos::xe_gpu_program_cntl_t* program_cntl,
+                        D3D11VertexShader* input_shader);
 
 private:
   ID3D11PixelShader*  handle_;
diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc
index cf2ea7be4..00d9e76a7 100644
--- a/src/xenia/gpu/graphics_system.cc
+++ b/src/xenia/gpu/graphics_system.cc
@@ -29,9 +29,9 @@ GraphicsSystem::GraphicsSystem(const CreationParams* params) :
 
   // Set during Initialize();
   driver_ = 0;
-
-  // Create the run loop used for any windows/etc.
-  // This must be done on the thread we create the driver.
+
+  // Create the run loop used for any windows/etc.
+  // This must be done on the thread we create the driver.
   run_loop_ = xe_run_loop_create();
 
   // Create worker thread.
diff --git a/src/xenia/gpu/nop/nop_graphics_driver.cc b/src/xenia/gpu/nop/nop_graphics_driver.cc
index eb55f064c..de3f4fc3b 100644
--- a/src/xenia/gpu/nop/nop_graphics_driver.cc
+++ b/src/xenia/gpu/nop/nop_graphics_driver.cc
@@ -55,15 +55,12 @@ void NopGraphicsDriver::SetShader(
       type, p, length);
 
   // Disassemble.
-  char* source = shader->Disassemble();
+  const char* source = shader->disasm_src();
   if (!source) {
     source = "<failed to disassemble>";
   }
   XELOGGPU("NOP: set shader %d at %0.8X (%db):\n%s",
            type, address, length, source);
-  if (source) {
-    xe_free(source);
-  }
 }
 
 void NopGraphicsDriver::DrawIndexAuto(
diff --git a/src/xenia/gpu/shader.cc b/src/xenia/gpu/shader.cc
index 128955fc7..459fc0799 100644
--- a/src/xenia/gpu/shader.cc
+++ b/src/xenia/gpu/shader.cc
@@ -21,7 +21,8 @@ Shader::Shader(
     XE_GPU_SHADER_TYPE type,
     const uint8_t* src_ptr, size_t length,
     uint64_t hash) :
-    type_(type), hash_(hash), is_prepared_(false) {
+    type_(type), hash_(hash), is_prepared_(false), disasm_src_(NULL) {
+  xe_zero_struct(&alloc_counts_, sizeof(alloc_counts_));
   xe_zero_struct(fetch_vtx_slots_, sizeof(fetch_vtx_slots_));
 
   // Verify.
@@ -37,9 +38,15 @@ Shader::Shader(
 
   // Gather input/output registers/etc.
   GatherIO();
+
+  // Disassemble, for debugging.
+  disasm_src_ = DisassembleShader(type_, dwords_, dword_count_);
 }
 
 Shader::~Shader() {
+  if (disasm_src_) {  // Disassembly text produced by DisassembleShader() in the constructor; may be NULL.
+    xe_free(disasm_src_);
+  }
   xe_free(dwords_);
 }
 
@@ -73,9 +80,26 @@ void Shader::GatherIO() {
 
 void Shader::GatherAlloc(const instr_cf_alloc_t* cf) {
   allocs_.push_back(*cf);
+
+  switch (cf->buffer_select) {  // Add (size + 1) to the matching total; other selects are not counted.
+  case SQ_POSITION:
+    // Position export (SV_POSITION).
+    alloc_counts_.positions += cf->size + 1;  // presumably size is encoded as count - 1 — TODO confirm
+    break;
+  case SQ_PARAMETER_PIXEL:
+    // Output to the pixel shader (if VS), or fragment output (if PS).
+    alloc_counts_.params += cf->size + 1;
+    break;
+  case SQ_MEMORY:
+    // Possibly memory export (MEMEXPORT) — unconfirmed.
+    alloc_counts_.memories += cf->size + 1;
+    break;
+  }
 }
 
 void Shader::GatherExec(const instr_cf_exec_t* cf) {
+  execs_.push_back(*cf);
+
   uint32_t sequence = cf->serialize;
   for (uint32_t i = 0; i < cf->count; i++) {
     uint32_t alu_off = (cf->address + i);
@@ -129,7 +153,3 @@ void Shader::GatherVertexFetch(const instr_fetch_vtx_t* vtx) {
 const instr_fetch_vtx_t* Shader::GetFetchVtxBySlot(uint32_t fetch_slot) {
   return &fetch_vtx_slots_[fetch_slot];
 }
-
-char* Shader::Disassemble() {
-  return DisassembleShader(type_, dwords_, dword_count_);
-}
diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h
index 549363e52..c56c413cd 100644
--- a/src/xenia/gpu/shader.h
+++ b/src/xenia/gpu/shader.h
@@ -32,10 +32,16 @@ public:
   uint64_t hash() const { return hash_; }
   bool is_prepared() const { return is_prepared_; }
 
+  const char* disasm_src() const { return disasm_src_; }
+
   const xenos::instr_fetch_vtx_t* GetFetchVtxBySlot(uint32_t fetch_slot);
 
-  // NOTE: xe_free() the returned string!
-  char* Disassemble();
+  typedef struct {  // Totals accumulated by GatherAlloc(), one per buffer_select kind it counts.
+    uint32_t  positions;  // Sum of (size + 1) over SQ_POSITION allocs.
+    uint32_t  params;  // Sum of (size + 1) over SQ_PARAMETER_PIXEL allocs.
+    uint32_t  memories;  // Sum of (size + 1) over SQ_MEMORY allocs.
+  } alloc_counts_t;
+  const alloc_counts_t& alloc_counts() const { return alloc_counts_; }  // Read-only view of the totals.
 
 private:
   void GatherIO();
@@ -50,6 +56,10 @@ protected:
   uint64_t    hash_;
   bool        is_prepared_;
 
+  char*       disasm_src_;
+
+  alloc_counts_t alloc_counts_;
+  std::vector<xenos::instr_cf_exec_t> execs_;
   std::vector<xenos::instr_cf_alloc_t>  allocs_;
   std::vector<xenos::instr_fetch_vtx_t> fetch_vtxs_;
   xenos::instr_fetch_vtx_t fetch_vtx_slots_[96];
diff --git a/src/xenia/gpu/xenos/ucode.h b/src/xenia/gpu/xenos/ucode.h
index 4aaa7448a..9eec3daf7 100644
--- a/src/xenia/gpu/xenos/ucode.h
+++ b/src/xenia/gpu/xenos/ucode.h
@@ -297,6 +297,14 @@ XEPACKEDSTRUCT(instr_cf_exec_t, {
     uint32_t            address_mode            : 1;    // instr_addr_mode_t
     uint32_t            opc                     : 4;    // instr_cf_opc_t
   });
+  bool is_cond_exec() const {  // True when opc is one of the six COND_* exec opcodes listed below.
+    return (this->opc == COND_EXEC) ||
+           (this->opc == COND_EXEC_END) ||
+           (this->opc == COND_PRED_EXEC) ||
+           (this->opc == COND_PRED_EXEC_END) ||
+           (this->opc == COND_EXEC_PRED_CLEAN) ||
+           (this->opc == COND_EXEC_PRED_CLEAN_END);
+  }
 });
 
 XEPACKEDSTRUCT(instr_cf_loop_t, {
diff --git a/src/xenia/gpu/xenos/ucode_disassembler.cc b/src/xenia/gpu/xenos/ucode_disassembler.cc
index 58f7122aa..d6db6de7e 100644
--- a/src/xenia/gpu/xenos/ucode_disassembler.cc
+++ b/src/xenia/gpu/xenos/ucode_disassembler.cc
@@ -255,44 +255,50 @@ int disasm_alu(
 
   output->append("   %sALU:\t", sync ? "(S)" : "   ");
 
-  output->append("%s", vector_instructions[alu->vector_opc].name);
-
-  if (alu->pred_select & 0x2) {
-    // seems to work similar to conditional execution in ARM instruction
-    // set, so let's use a similar syntax for now:
-    output->append((alu->pred_select & 0x1) ? "EQ" : "NE");
+  if (!alu->scalar_write_mask && !alu->vector_write_mask) {
+    output->append("   <nop>\n");
   }
 
-  output->append("\t");
+  if (alu->vector_write_mask) {
+    output->append("%s", vector_instructions[alu->vector_opc].name);
 
-  print_dstreg(output,
-               alu->vector_dest, alu->vector_write_mask, alu->export_data);
-  output->append(" = ");
-  if (vector_instructions[alu->vector_opc].num_srcs == 3) {
+    if (alu->pred_select & 0x2) {
+      // seems to work similar to conditional execution in ARM instruction
+      // set, so let's use a similar syntax for now:
+      output->append((alu->pred_select & 0x1) ? "EQ" : "NE");
+    }
+
+    output->append("\t");
+
+    print_dstreg(output,
+                 alu->vector_dest, alu->vector_write_mask, alu->export_data);
+    output->append(" = ");
+    if (vector_instructions[alu->vector_opc].num_srcs == 3) {
+      print_srcreg(output,
+                   alu->src3_reg, alu->src3_sel, alu->src3_swiz,
+                   alu->src3_reg_negate, alu->src3_reg_abs);
+      output->append(", ");
+    }
     print_srcreg(output,
-                 alu->src3_reg, alu->src3_sel, alu->src3_swiz,
-                 alu->src3_reg_negate, alu->src3_reg_abs);
-    output->append(", ");
-  }
-  print_srcreg(output,
-               alu->src1_reg, alu->src1_sel, alu->src1_swiz,
-               alu->src1_reg_negate, alu->src1_reg_abs);
-  if (vector_instructions[alu->vector_opc].num_srcs > 1) {
-    output->append(", ");
-    print_srcreg(output,
-                 alu->src2_reg, alu->src2_sel, alu->src2_swiz,
-                 alu->src2_reg_negate, alu->src2_reg_abs);
-  }
+                 alu->src1_reg, alu->src1_sel, alu->src1_swiz,
+                 alu->src1_reg_negate, alu->src1_reg_abs);
+    if (vector_instructions[alu->vector_opc].num_srcs > 1) {
+      output->append(", ");
+      print_srcreg(output,
+                   alu->src2_reg, alu->src2_sel, alu->src2_swiz,
+                   alu->src2_reg_negate, alu->src2_reg_abs);
+    }
 
-  if (alu->vector_clamp) {
-    output->append(" CLAMP");
-  }
+    if (alu->vector_clamp) {
+      output->append(" CLAMP");
+    }
 
-  if (alu->export_data) {
-    print_export_comment(output, alu->vector_dest, type);
-  }
+    if (alu->export_data) {
+      print_export_comment(output, alu->vector_dest, type);
+    }
 
-  output->append("\n");
+    output->append("\n");
+  }
 
   if (alu->scalar_write_mask || !alu->vector_write_mask) {
     // 2nd optional scalar op:
diff --git a/src/xenia/gpu/xenos/xenos.h b/src/xenia/gpu/xenos/xenos.h
index e0ee4e4b3..0df141a03 100644
--- a/src/xenia/gpu/xenos/xenos.h
+++ b/src/xenia/gpu/xenos/xenos.h
@@ -45,8 +45,10 @@ typedef enum {
 // XE_GPU_REG_SQ_PROGRAM_CNTL
 typedef union {
   XEPACKEDSTRUCTANONYMOUS({
-    uint32_t vs_regs            : 8;
-    uint32_t ps_regs            : 8;
+    uint32_t vs_regs            : 6;
+    uint32_t                    : 2;
+    uint32_t ps_regs            : 6;
+    uint32_t                    : 2;
     uint32_t vs_resource        : 1;
     uint32_t ps_resource        : 1;
     uint32_t param_gen          : 1;