From 6109e0b03aa604f2be5e3128402d98f8d46fdaff Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Thu, 25 Feb 2016 17:41:41 -0600 Subject: [PATCH 01/77] Fix incorrect images/samplers definitions ps_param_gen and fix interpolators being copied incorrectly --- src/xenia/gpu/spirv_shader_translator.cc | 112 ++++++++++++++--------- src/xenia/gpu/spirv_shader_translator.h | 1 - 2 files changed, 69 insertions(+), 44 deletions(-) diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index bdd4c7e97..2cf137b43 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -164,6 +164,7 @@ void SpirvShaderTranslator::StartTranslation() { push_constants_type, "push_consts"); // Texture bindings + Id samplers_t = b.makeSamplerType(); Id img_t[] = { b.makeImageType(float_type_, spv::Dim::Dim1D, false, false, false, 1, spv::ImageFormat::ImageFormatUnknown), @@ -173,35 +174,24 @@ void SpirvShaderTranslator::StartTranslation() { spv::ImageFormat::ImageFormatUnknown), b.makeImageType(float_type_, spv::Dim::DimCube, false, false, false, 1, spv::ImageFormat::ImageFormatUnknown)}; - Id samplers_t = b.makeSamplerType(); + Id samplers_a = b.makeArrayType(samplers_t, b.makeUintConstant(32), 0); Id img_a_t[] = {b.makeArrayType(img_t[0], b.makeUintConstant(32), 0), b.makeArrayType(img_t[1], b.makeUintConstant(32), 0), b.makeArrayType(img_t[2], b.makeUintConstant(32), 0), b.makeArrayType(img_t[3], b.makeUintConstant(32), 0)}; - Id samplers_a = b.makeArrayType(samplers_t, b.makeUintConstant(32), 0); - - Id img_s[] = { - b.makeStructType({img_a_t[0]}, "img1D_type"), - b.makeStructType({img_a_t[1]}, "img2D_type"), - b.makeStructType({img_a_t[2]}, "img3D_type"), - b.makeStructType({img_a_t[3]}, "imgCube_type"), - }; - Id samplers_s = b.makeStructType({samplers_a}, "samplers_type"); + samplers_ = b.createVariable(spv::StorageClass::StorageClassUniform, + samplers_a, "samplers"); + b.addDecoration(samplers_, spv::Decoration::DecorationDescriptorSet, 1); + b.addDecoration(samplers_, spv::Decoration::DecorationBinding, 0); for (int i = 0; i < 4; i++) { - img_[i] = b.createVariable(spv::StorageClass::StorageClassUniformConstant, - img_s[i], - xe::format_string("images%dD", i + 1).c_str()); - b.addDecoration(img_[i], spv::Decoration::DecorationBlock); + img_[i] = + b.createVariable(spv::StorageClass::StorageClassUniform, img_a_t[i], + xe::format_string("images%dD", i + 1).c_str()); b.addDecoration(img_[i], spv::Decoration::DecorationDescriptorSet, 1); b.addDecoration(img_[i], spv::Decoration::DecorationBinding, i + 1); } - samplers_ = b.createVariable(spv::StorageClass::StorageClassUniformConstant, - samplers_s, "samplers"); - b.addDecoration(samplers_, spv::Decoration::DecorationBlock); - b.addDecoration(samplers_, spv::Decoration::DecorationDescriptorSet, 1); - b.addDecoration(samplers_, spv::Decoration::DecorationBinding, 0); // Interpolators. Id interpolators_type = @@ -255,7 +245,6 @@ void SpirvShaderTranslator::StartTranslation() { interpolators_ = b.createVariable(spv::StorageClass::StorageClassOutput, interpolators_type, "interpolators"); - b.addDecoration(interpolators_, spv::Decoration::DecorationNoPerspective); b.addDecoration(interpolators_, spv::Decoration::DecorationLocation, 0); pos_ = b.createVariable(spv::StorageClass::StorageClassOutput, @@ -266,22 +255,68 @@ void SpirvShaderTranslator::StartTranslation() { // Pixel inputs from vertex shader. 
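    // (Prologue summary of the builder calls below: the 16 vec4 interpolators
    // arrive at input location 0 and are copied into r[0..15] one register at
    // a time, since OpCopyMemorySized would need physical addressing; then, if
    // the ps_param_gen index in the push constants is not -1, r[index] is
    // overwritten with (gl_FragCoord.xy, gl_PointCoord.xy).)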
interpolators_ = b.createVariable(spv::StorageClass::StorageClassInput, interpolators_type, "interpolators"); - b.addDecoration(interpolators_, spv::Decoration::DecorationNoPerspective); b.addDecoration(interpolators_, spv::Decoration::DecorationLocation, 0); // Pixel fragment outputs (one per render target). Id frag_outputs_type = b.makeArrayType(vec4_float_type_, b.makeUintConstant(4), 0); frag_outputs_ = b.createVariable(spv::StorageClass::StorageClassOutput, - frag_outputs_type, "o"); + frag_outputs_type, "oC"); b.addDecoration(frag_outputs_, spv::Decoration::DecorationLocation, 0); // TODO(benvanik): frag depth, etc. // Copy interpolators to r[0..16]. - b.createNoResultOp(spv::Op::OpCopyMemorySized, - {registers_ptr_, interpolators_, - b.makeUintConstant(16 * 4 * sizeof(float))}); + // TODO: Need physical addressing in order to do this. + // b.createNoResultOp(spv::Op::OpCopyMemorySized, + // {registers_ptr_, interpolators_, + // b.makeUintConstant(16 * 4 * sizeof(float))}); + for (int i = 0; i < 16; i++) { + // For now, copy interpolators register-by-register :/ + auto idx = b.makeUintConstant(i); + auto i_a = b.createAccessChain(spv::StorageClass::StorageClassInput, + interpolators_, std::vector({idx})); + auto r_a = b.createAccessChain(spv::StorageClass::StorageClassFunction, + registers_ptr_, std::vector({idx})); + b.createNoResultOp(spv::Op::OpCopyMemory, std::vector({r_a, i_a})); + } + + // Setup ps_param_gen + auto ps_param_gen_idx_ptr = b.createAccessChain( + spv::StorageClass::StorageClassPushConstant, push_consts_, + std::vector({b.makeUintConstant(3)})); + auto ps_param_gen_idx = b.createLoad(ps_param_gen_idx_ptr); + + auto frag_coord = b.createVariable(spv::StorageClass::StorageClassInput, + vec4_float_type_, "gl_FragCoord"); + b.addDecoration(frag_coord, spv::Decoration::DecorationBuiltIn, + spv::BuiltIn::BuiltInFragCoord); + + auto point_coord = b.createVariable(spv::StorageClass::StorageClassInput, + vec2_float_type_, "gl_PointCoord"); + b.addDecoration(point_coord, spv::Decoration::DecorationBuiltIn, + spv::BuiltIn::BuiltInPointCoord); + auto param = b.createOp(spv::Op::OpVectorShuffle, vec4_float_type_, + {frag_coord, point_coord, 0, 1, 4, 5}); + /* + // TODO: gl_FrontFacing + auto param_x = b.createCompositeExtract(param, float_type_, 0); + auto param_x_inv = b.createBinOp(spv::Op::OpFMul, float_type_, param_x, + b.makeFloatConstant(-1.f)); + param_x = b.createCompositeInsert(param_x_inv, param, vec4_float_type_, 0); + */ + + auto cond = b.createBinOp(spv::Op::OpINotEqual, bool_type_, + ps_param_gen_idx, b.makeUintConstant(-1)); + spv::Builder::If ifb(cond, b); + + // Index is specified + auto reg_ptr = b.createAccessChain(spv::StorageClass::StorageClassFunction, + registers_ptr_, + std::vector({ps_param_gen_idx})); + b.createStore(param, reg_ptr); + + ifb.makeEndIf(); } } @@ -620,22 +655,12 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( uint32_t dim_idx = 0; switch (instr.dimension) { case TextureDimension::k1D: - src = b.createCompositeExtract(src, float_type_, 0); dim_idx = 0; break; case TextureDimension::k2D: { - auto s0 = b.createCompositeExtract(src, float_type_, 0); - auto s1 = b.createCompositeExtract(src, float_type_, 1); - src = b.createCompositeConstruct(vec2_float_type_, - std::vector({s0, s1})); dim_idx = 1; } break; case TextureDimension::k3D: { - auto s0 = b.createCompositeExtract(src, float_type_, 0); - auto s1 = b.createCompositeExtract(src, float_type_, 1); - auto s2 = b.createCompositeExtract(src, float_type_, 2); - src = 
b.createCompositeConstruct(vec3_float_type_, - std::vector({s0, s1, s2})); dim_idx = 2; } break; case TextureDimension::kCube: { @@ -648,21 +673,22 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( switch (instr.opcode) { case FetchOpcode::kTextureFetch: { auto image_index = b.makeUintConstant(instr.operands[1].storage_index); - auto image_ptr = b.createAccessChain( - spv::StorageClass::StorageClassUniformConstant, img_[dim_idx], - std::vector({b.makeUintConstant(0), image_index})); - auto sampler_ptr = b.createAccessChain( - spv::StorageClass::StorageClassUniformConstant, samplers_, - std::vector({b.makeUintConstant(0), image_index})); + auto image_ptr = + b.createAccessChain(spv::StorageClass::StorageClassUniform, + img_[dim_idx], std::vector({image_index})); + auto sampler_ptr = + b.createAccessChain(spv::StorageClass::StorageClassUniform, samplers_, + std::vector({image_index})); auto image = b.createLoad(image_ptr); auto sampler = b.createLoad(sampler_ptr); - auto tex = b.createBinOp(spv::Op::OpSampledImage, b.getImageType(image), + auto sampled_image_type = b.makeSampledImageType(b.getImageType(image)); + auto tex = b.createBinOp(spv::Op::OpSampledImage, sampled_image_type, image, sampler); spv::Builder::TextureParameters params = {0}; params.coords = src; - params.sampler = sampler; + params.sampler = tex; dest = b.createTextureCall(spv::Decoration::DecorationInvariant, vec4_float_type_, false, false, false, false, false, params); diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 0d8b1e14c..ed4356322 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -108,7 +108,6 @@ class SpirvShaderTranslator : public ShaderTranslator { spv::Id vec2_float_type_ = 0, vec3_float_type_ = 0, vec4_float_type_ = 0; spv::Id vec4_uint_type_ = 0; spv::Id vec4_bool_type_ = 0; - spv::Id sampled_image_type_ = 0; // Constants. spv::Id vec4_float_zero_ = 0, vec4_float_one_ = 0; From c648e545395a6bb4fac62c9f0a68af200a787b7b Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sat, 27 Feb 2016 11:30:50 -0600 Subject: [PATCH 02/77] Short-circuit draw calls if the render target's pitch is 0 --- src/xenia/gpu/vulkan/vulkan_command_processor.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index f04ec1ad3..1bd05f16a 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -178,6 +178,11 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, return IssueCopy(); } + if ((regs[XE_GPU_REG_RB_SURFACE_INFO].u32 & 0x3FFF) == 0) { + // Doesn't actually draw. + return true; + } + // TODO(benvanik): move to CP or to host (trace dump, etc). if (FLAGS_vulkan_renderdoc_capture_all && device_->is_renderdoc_attached()) { device_->BeginRenderDocFrameCapture(); From 48cf270724aaa897a90ccdf62257c59d4a12f23b Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Sat, 27 Feb 2016 16:21:37 -0600 Subject: [PATCH 03/77] Use spv::NoPrecision instead of DecorationInvariant Set samplers/images as uniform constants --- src/xenia/gpu/spirv_shader_translator.cc | 167 +++++++++++------------ 1 file changed, 78 insertions(+), 89 deletions(-) diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 2cf137b43..a45294415 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -48,9 +48,9 @@ void SpirvShaderTranslator::StartTranslation() { } spv::Block* function_block = nullptr; - translated_main_ = b.makeFunctionEntry(spv::Decoration::DecorationInvariant, - b.makeVoidType(), "translated_main", - {}, {}, &function_block); + translated_main_ = + b.makeFunctionEntry(spv::NoPrecision, b.makeVoidType(), "translated_main", + {}, {}, &function_block); bool_type_ = b.makeBoolType(); float_type_ = b.makeFloatType(32); @@ -181,14 +181,14 @@ void SpirvShaderTranslator::StartTranslation() { b.makeArrayType(img_t[2], b.makeUintConstant(32), 0), b.makeArrayType(img_t[3], b.makeUintConstant(32), 0)}; - samplers_ = b.createVariable(spv::StorageClass::StorageClassUniform, + samplers_ = b.createVariable(spv::StorageClass::StorageClassUniformConstant, samplers_a, "samplers"); b.addDecoration(samplers_, spv::Decoration::DecorationDescriptorSet, 1); b.addDecoration(samplers_, spv::Decoration::DecorationBinding, 0); for (int i = 0; i < 4; i++) { - img_[i] = - b.createVariable(spv::StorageClass::StorageClassUniform, img_a_t[i], - xe::format_string("images%dD", i + 1).c_str()); + img_[i] = b.createVariable(spv::StorageClass::StorageClassUniformConstant, + img_a_t[i], + xe::format_string("images%dD", i + 1).c_str()); b.addDecoration(img_[i], spv::Decoration::DecorationDescriptorSet, 1); b.addDecoration(img_[i], spv::Decoration::DecorationBinding, i + 1); } @@ -264,6 +264,11 @@ void SpirvShaderTranslator::StartTranslation() { frag_outputs_type, "oC"); b.addDecoration(frag_outputs_, spv::Decoration::DecorationLocation, 0); + Id frag_depth = b.createVariable(spv::StorageClass::StorageClassOutput, + vec4_float_type_, "gl_FragDepth"); + b.addDecoration(frag_depth, spv::Decoration::DecorationBuiltIn, + spv::BuiltIn::BuiltInFragDepth); + // TODO(benvanik): frag depth, etc. // Copy interpolators to r[0..16]. @@ -365,8 +370,7 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { p_w = b.createTriOp(spv::Op::OpSelect, float_type_, c_w, p_w, p_w_inv); // pos.xyz = vtx_fmt.xyz != 0.0 ? 
pos.xyz / pos.w : pos.xyz - auto p_all_w = b.smearScalar(spv::Decoration::DecorationInvariant, p_w, - vec4_float_type_); + auto p_all_w = b.smearScalar(spv::NoPrecision, p_w, vec4_float_type_); auto p_inv = b.createBinOp(spv::Op::OpFDiv, vec4_float_type_, p, p_all_w); p = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c, p_inv, p); @@ -654,9 +658,9 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( uint32_t dim_idx = 0; switch (instr.dimension) { - case TextureDimension::k1D: + case TextureDimension::k1D: { dim_idx = 0; - break; + } break; case TextureDimension::k2D: { dim_idx = 1; } break; @@ -674,13 +678,15 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( case FetchOpcode::kTextureFetch: { auto image_index = b.makeUintConstant(instr.operands[1].storage_index); auto image_ptr = - b.createAccessChain(spv::StorageClass::StorageClassUniform, + b.createAccessChain(spv::StorageClass::StorageClassUniformConstant, img_[dim_idx], std::vector({image_index})); auto sampler_ptr = - b.createAccessChain(spv::StorageClass::StorageClassUniform, samplers_, - std::vector({image_index})); + b.createAccessChain(spv::StorageClass::StorageClassUniformConstant, + samplers_, std::vector({image_index})); auto image = b.createLoad(image_ptr); auto sampler = b.createLoad(sampler_ptr); + assert(b.isImageType(b.getTypeId(image))); + assert(b.isSamplerType(b.getTypeId(sampler))); auto sampled_image_type = b.makeSampledImageType(b.getImageType(image)); auto tex = b.createBinOp(spv::Op::OpSampledImage, sampled_image_type, @@ -689,9 +695,8 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( spv::Builder::TextureParameters params = {0}; params.coords = src; params.sampler = tex; - dest = b.createTextureCall(spv::Decoration::DecorationInvariant, - vec4_float_type_, false, false, false, false, - false, params); + dest = b.createTextureCall(spv::NoPrecision, vec4_float_type_, false, + false, false, false, false, params); } break; default: // TODO: the rest of these @@ -780,15 +785,15 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( } break; case AluVectorOpcode::kFloor: { - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, vec4_float_type_, - spv::GLSLstd450::kFloor, {sources[0]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, vec4_float_type_, + spv::GLSLstd450::kFloor, + {sources[0]}); } break; case AluVectorOpcode::kFrc: { - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, vec4_float_type_, - spv::GLSLstd450::kFract, {sources[0]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, vec4_float_type_, + spv::GLSLstd450::kFract, + {sources[0]}); } break; case AluVectorOpcode::kKillEq: { @@ -883,27 +888,26 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( b.makeFloatConstant(0.5f)); addr = b.createUnaryOp(spv::Op::OpConvertFToS, int_type_, addr); addr = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, int_type_, - spv::GLSLstd450::kSClamp, + spv::NoPrecision, int_type_, spv::GLSLstd450::kSClamp, {addr, b.makeIntConstant(-256), b.makeIntConstant(255)}); b.createStore(addr, a0_); // dest = src0 >= src1 ? 
src0 : src1 - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, vec4_float_type_, - spv::GLSLstd450::kFMax, {sources[0], sources[1]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, vec4_float_type_, + spv::GLSLstd450::kFMax, + {sources[0], sources[1]}); } break; case AluVectorOpcode::kMax: { - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, vec4_float_type_, - spv::GLSLstd450::kFMax, {sources[0], sources[1]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, vec4_float_type_, + spv::GLSLstd450::kFMax, + {sources[0], sources[1]}); } break; case AluVectorOpcode::kMin: { - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, vec4_float_type_, - spv::GLSLstd450::kFMin, {sources[0], sources[1]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, vec4_float_type_, + spv::GLSLstd450::kFMin, + {sources[0], sources[1]}); } break; case AluVectorOpcode::kMul: { @@ -928,8 +932,7 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( auto s0_x = b.createCompositeExtract(sources[0], float_type_, 0); s0_x = b.createBinOp(spv::Op::OpFAdd, float_type_, s0_x, b.makeFloatConstant(1.f)); - auto s0 = b.smearScalar(spv::Decoration::DecorationInvariant, s0_x, - vec4_float_type_); + auto s0 = b.smearScalar(spv::NoPrecision, s0_x, vec4_float_type_); dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c_and_x, vec4_float_zero_, s0); @@ -952,8 +955,7 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( auto s0_x = b.createCompositeExtract(sources[0], float_type_, 0); s0_x = b.createBinOp(spv::Op::OpFAdd, float_type_, s0_x, b.makeFloatConstant(1.f)); - auto s0 = b.smearScalar(spv::Decoration::DecorationInvariant, s0_x, - vec4_float_type_); + auto s0 = b.smearScalar(spv::NoPrecision, s0_x, vec4_float_type_); dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c_and_x, vec4_float_zero_, s0); @@ -976,8 +978,7 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( auto s0_x = b.createCompositeExtract(sources[0], float_type_, 0); s0_x = b.createBinOp(spv::Op::OpFAdd, float_type_, s0_x, b.makeFloatConstant(1.f)); - auto s0 = b.smearScalar(spv::Decoration::DecorationInvariant, s0_x, - vec4_float_type_); + auto s0 = b.smearScalar(spv::NoPrecision, s0_x, vec4_float_type_); dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c_and_x, vec4_float_zero_, s0); @@ -1000,8 +1001,7 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( auto s0_x = b.createCompositeExtract(sources[0], float_type_, 0); s0_x = b.createBinOp(spv::Op::OpFAdd, float_type_, s0_x, b.makeFloatConstant(1.f)); - auto s0 = b.smearScalar(spv::Decoration::DecorationInvariant, s0_x, - vec4_float_type_); + auto s0 = b.smearScalar(spv::NoPrecision, s0_x, vec4_float_type_); dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c_and_x, vec4_float_zero_, s0); @@ -1040,9 +1040,8 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( } break; case AluVectorOpcode::kTrunc: { - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, vec4_float_type_, - GLSLstd450::kTrunc, {sources[0]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, vec4_float_type_, + GLSLstd450::kTrunc, {sources[0]}); } break; default: @@ -1124,27 +1123,23 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( case AluScalarOpcode::kCos: { // dest = cos(src0) - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kCos, - {sources[0]}); + dest = 
CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_, + GLSLstd450::kCos, {sources[0]}); } break; case AluScalarOpcode::kExp: { - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kExp2, - {sources[0]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_, + GLSLstd450::kExp2, {sources[0]}); } break; case AluScalarOpcode::kFloors: { - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kFloor, - {sources[0]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_, + GLSLstd450::kFloor, {sources[0]}); } break; case AluScalarOpcode::kFrcs: { - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kFract, - {sources[0]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_, + GLSLstd450::kFract, {sources[0]}); } break; case AluScalarOpcode::kKillsEq: { @@ -1239,23 +1234,21 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( case AluScalarOpcode::kLog: { auto log = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, float_type_, - spv::GLSLstd450::kLog2, {sources[0]}); + spv::NoPrecision, float_type_, spv::GLSLstd450::kLog2, {sources[0]}); } break; case AluScalarOpcode::kMaxAsf: { auto addr = b.createUnaryOp(spv::Op::OpConvertFToS, int_type_, sources[0]); addr = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, int_type_, - spv::GLSLstd450::kSClamp, + spv::NoPrecision, int_type_, spv::GLSLstd450::kSClamp, {addr, b.makeIntConstant(-256), b.makeIntConstant(255)}); b.createStore(addr, a0_); // dest = src0 >= src1 ? src0 : src1 - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, float_type_, - spv::GLSLstd450::kFMax, {sources[0], sources[1]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_, + spv::GLSLstd450::kFMax, + {sources[0], sources[1]}); } break; case AluScalarOpcode::kMaxAs: { @@ -1264,29 +1257,28 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( b.makeFloatConstant(0.5f)); addr = b.createUnaryOp(spv::Op::OpConvertFToS, int_type_, addr); addr = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, int_type_, - spv::GLSLstd450::kSClamp, + spv::NoPrecision, int_type_, spv::GLSLstd450::kSClamp, {addr, b.makeIntConstant(-256), b.makeIntConstant(255)}); b.createStore(addr, a0_); // dest = src0 >= src1 ? 
src0 : src1 - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, float_type_, - spv::GLSLstd450::kFMax, {sources[0], sources[1]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_, + spv::GLSLstd450::kFMax, + {sources[0], sources[1]}); } break; case AluScalarOpcode::kMaxs: { // dest = max(src0, src1) - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kFMax, - {sources[0], sources[1]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_, + GLSLstd450::kFMax, + {sources[0], sources[1]}); } break; case AluScalarOpcode::kMins: { // dest = min(src0, src1) - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kFMin, - {sources[0], sources[1]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_, + GLSLstd450::kFMin, + {sources[0], sources[1]}); } break; case AluScalarOpcode::kMuls: @@ -1326,8 +1318,8 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( auto c = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, sources[0], b.makeFloatConstant(0.f)); auto d = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, vec4_float_type_, - spv::GLSLstd450::kInverseSqrt, {sources[0]}); + spv::NoPrecision, vec4_float_type_, spv::GLSLstd450::kInverseSqrt, + {sources[0]}); dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c, b.makeFloatConstant(0.f), d); } break; @@ -1439,7 +1431,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( b.createStore(c, p0_); dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kFMax, + spv::NoPrecision, float_type_, GLSLstd450::kFMax, {sources[0], b.makeFloatConstant(0.f)}); } break; @@ -1451,9 +1443,8 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( } break; case AluScalarOpcode::kSin: { - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kSin, - {sources[0]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_, + GLSLstd450::kSin, {sources[0]}); } break; case AluScalarOpcode::kSubs: @@ -1468,9 +1459,8 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( } break; case AluScalarOpcode::kTruncs: { - dest = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kTrunc, - {sources[0]}); + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_, + GLSLstd450::kTrunc, {sources[0]}); } break; default: @@ -1570,8 +1560,7 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) { if (op.is_absolute_value) { storage_value = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, storage_type, GLSLstd450::kFAbs, - {storage_value}); + spv::NoPrecision, storage_type, GLSLstd450::kFAbs, {storage_value}); } if (op.is_negated) { storage_value = @@ -1739,14 +1728,14 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, constituents.push_back(b.makeFloatConstant(0.f)); } - source_value_id = b.createConstructor(spv::Decoration::DecorationInvariant, - constituents, storage_type); + source_value_id = + b.createConstructor(spv::NoPrecision, constituents, storage_type); } // Clamp the input value. 
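  // (is_clamped implements the destination saturate modifier: the value is
  // clamped to [0.0, 1.0] before being stored.)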
if (result.is_clamped) { source_value_id = CreateGlslStd450InstructionCall( - spv::Decoration::DecorationInvariant, b.getTypeId(source_value_id), + spv::NoPrecision, b.getTypeId(source_value_id), spv::GLSLstd450::kFClamp, {source_value_id, b.makeFloatConstant(0.0), b.makeFloatConstant(1.0)}); } From 740c70f270c4654d3b4cf30dc55d888f45866209 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Tue, 1 Mar 2016 12:52:34 -0600 Subject: [PATCH 04/77] Scalar logc, fix log --- src/xenia/gpu/spirv_shader_translator.cc | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index a45294415..600e3fe56 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -729,6 +729,7 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( const ParsedAluInstruction& instr) { auto& b = *builder_; + // TODO: If we have identical operands, reuse previous one. Id sources[3] = {0}; Id dest = 0; for (size_t i = 0; i < instr.operand_count; i++) { @@ -899,12 +900,24 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( } break; case AluVectorOpcode::kMax: { + if (sources[0] == sources[1]) { + // mov dst, src + dest = sources[0]; + break; + } + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, vec4_float_type_, spv::GLSLstd450::kFMax, {sources[0], sources[1]}); } break; case AluVectorOpcode::kMin: { + if (sources[0] == sources[1]) { + // mov dst, src + dest = sources[0]; + break; + } + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, vec4_float_type_, spv::GLSLstd450::kFMin, {sources[0], sources[1]}); @@ -1065,6 +1078,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( const ParsedAluInstruction& instr) { auto& b = *builder_; + // TODO: If we have identical operands, reuse previous one. Id sources[3] = {0}; Id dest = 0; for (size_t i = 0, x = 0; i < instr.operand_count; i++) { @@ -1230,10 +1244,17 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( } break; case AluScalarOpcode::kLogc: { + auto t = CreateGlslStd450InstructionCall( + spv::NoPrecision, float_type_, spv::GLSLstd450::kLog2, {sources[0]}); + + // FIXME: We don't check to see if t == -INF, we just check for INF + auto c = b.createUnaryOp(spv::Op::OpIsInf, bool_type_, t); + dest = b.createTriOp(spv::Op::OpSelect, float_type_, c, + b.makeFloatConstant(-FLT_MAX), t); } break; case AluScalarOpcode::kLog: { - auto log = CreateGlslStd450InstructionCall( + dest = CreateGlslStd450InstructionCall( spv::NoPrecision, float_type_, spv::GLSLstd450::kLog2, {sources[0]}); } break; From 38094ac81955170816b776202299c95ad971fe57 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Wed, 2 Mar 2016 21:16:38 -0600 Subject: [PATCH 05/77] Updated local clang-format. 
--- src/xenia/gpu/shader_translator.cc | 4 ++-- src/xenia/gpu/spirv_shader_translator.cc | 10 ++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index a89be80f5..6e8b69cea 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -672,11 +672,11 @@ void ShaderTranslator::TranslateExecInstructions( static_cast(ucode_dwords_[instr_offset * 3] & 0x1F); if (fetch_opcode == FetchOpcode::kVertexFetch) { auto& op = *reinterpret_cast( - ucode_dwords_ + instr_offset * 3); + ucode_dwords_ + instr_offset * 3); TranslateVertexFetchInstruction(op); } else { auto& op = *reinterpret_cast( - ucode_dwords_ + instr_offset * 3); + ucode_dwords_ + instr_offset * 3); TranslateTextureFetchInstruction(op); } } else { diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 600e3fe56..57af04e24 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -237,9 +237,8 @@ void SpirvShaderTranslator::StartTranslation() { b.addDecoration(attrib_var, spv::Decoration::DecorationLocation, attrib.attrib_index); - vertex_binding_map_[binding.fetch_constant][attrib.fetch_instr - .attributes.offset] = - attrib_var; + vertex_binding_map_[binding.fetch_constant] + [attrib.fetch_instr.attributes.offset] = attrib_var; } } @@ -636,9 +635,8 @@ void SpirvShaderTranslator::ProcessVertexFetchInstruction( // Operand 0 is the index // Operand 1 is the binding // TODO: Indexed fetch - auto vertex_ptr = - vertex_binding_map_[instr.operands[1].storage_index][instr.attributes - .offset]; + auto vertex_ptr = vertex_binding_map_[instr.operands[1].storage_index] + [instr.attributes.offset]; assert_not_zero(vertex_ptr); auto vertex = b.createLoad(vertex_ptr); From 8ca9c6f6f4f6acfcf8b593290deaa858a069e6c7 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Thu, 3 Mar 2016 20:11:23 -0600 Subject: [PATCH 06/77] Fix spirv-tools incorrect includes --- third_party/spirv-tools.lua | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/third_party/spirv-tools.lua b/third_party/spirv-tools.lua index 4218ff08e..afa3cdef5 100644 --- a/third_party/spirv-tools.lua +++ b/third_party/spirv-tools.lua @@ -13,9 +13,9 @@ project("spirv-tools") "spirv-tools/include", }) files({ - "spirv-tools/external/include/headers/GLSL.std.450.h", - "spirv-tools/external/include/headers/OpenCL.std.h", - "spirv-tools/external/include/headers/spirv.h", + "spirv-tools/include/spirv/GLSL.std.450.h", + "spirv-tools/include/spirv/OpenCL.std.h", + "spirv-tools/include/spirv/spirv.h", "spirv-tools/include/spirv-tools/libspirv.h", "spirv-tools/source/assembly_grammar.cpp", "spirv-tools/source/assembly_grammar.h", From af7fc20c38f6e3f6cbbd013f575bcbecc320667a Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Sat, 5 Mar 2016 22:09:18 -0600 Subject: [PATCH 07/77] Beginnings of texture conversion/uploads --- src/xenia/gpu/vulkan/texture_cache.cc | 359 ++++++++++++++---- src/xenia/gpu/vulkan/texture_cache.h | 59 ++- .../gpu/vulkan/vulkan_command_processor.cc | 82 +++- .../gpu/vulkan/vulkan_command_processor.h | 8 +- 4 files changed, 405 insertions(+), 103 deletions(-) diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index 4e93a46ca..8a8e2e2f4 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -81,83 +81,304 @@ TextureCache::TextureCache(RegisterFile* register_file, nullptr, &texture_descriptor_set_layout_); CheckResult(err, "vkCreateDescriptorSetLayout"); - SetupGridImages(); + // Allocate memory for a staging buffer. + VkBufferCreateInfo staging_buffer_info; + staging_buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + staging_buffer_info.pNext = nullptr; + staging_buffer_info.flags = 0; + staging_buffer_info.size = 2048 * 2048 * 4; // 16MB buffer + staging_buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT; + staging_buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + staging_buffer_info.queueFamilyIndexCount = 0; + staging_buffer_info.pQueueFamilyIndices = nullptr; + err = + vkCreateBuffer(*device_, &staging_buffer_info, nullptr, &staging_buffer_); + CheckResult(err, "vkCreateBuffer"); + + if (err == VK_SUCCESS) { + VkMemoryRequirements staging_buffer_reqs; + vkGetBufferMemoryRequirements(*device_, staging_buffer_, + &staging_buffer_reqs); + staging_buffer_mem_ = device_->AllocateMemory(staging_buffer_reqs); + assert_not_null(staging_buffer_mem_); + + err = vkBindBufferMemory(*device_, staging_buffer_, staging_buffer_mem_, 0); + CheckResult(err, "vkBindBufferMemory"); + + // Upload a grid into the staging buffer. + uint32_t* gpu_data = nullptr; + err = + vkMapMemory(*device_, staging_buffer_mem_, 0, staging_buffer_info.size, + 0, reinterpret_cast(&gpu_data)); + CheckResult(err, "vkMapMemory"); + + int width = 2048; + int height = 2048; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + gpu_data[y * width + x] = + ((y % 32 < 16) ^ (x % 32 >= 16)) ? 0xFF0000FF : 0xFFFFFFFF; + } + } + + vkUnmapMemory(*device_, staging_buffer_mem_); + } } TextureCache::~TextureCache() { - vkDestroyImageView(*device_, grid_image_2d_view_, nullptr); - vkDestroyImage(*device_, grid_image_2d_, nullptr); - vkFreeMemory(*device_, grid_image_2d_memory_, nullptr); - vkDestroyDescriptorSetLayout(*device_, texture_descriptor_set_layout_, nullptr); vkDestroyDescriptorPool(*device_, descriptor_pool_, nullptr); } -void TextureCache::SetupGridImages() { - VkImageCreateInfo image_info; +TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info, + VkCommandBuffer command_buffer) { + // Run a tight loop to scan for an existing texture. + auto texture_hash = texture_info.hash(); + for (auto it = textures_.find(texture_hash); it != textures_.end(); ++it) { + if (it->second->texture_info == texture_info) { + return it->second.get(); + } + } + + // Though we didn't find an exact match, that doesn't mean we're out of the + // woods yet. This texture could either be a portion of another texture or + // vice versa. Check for overlap before uploading. + for (auto it = textures_.begin(); it != textures_.end(); ++it) { + } + + if (!command_buffer) { + // Texture not found and no command buffer was passed allowing us to upload + // a new one. 
+ return nullptr; + } + + // Create a new texture and cache it. + auto texture = AllocateTexture(texture_info); + if (!texture) { + // Failed to allocate texture (out of memory?) + assert_always(); + return nullptr; + } + + if (!UploadTexture2D(command_buffer, texture, texture_info)) { + // TODO: Destroy the texture. + assert_always(); + return nullptr; + } + + textures_[texture_hash] = std::unique_ptr(texture); + + return texture; +} + +TextureCache::Sampler* TextureCache::Demand(const SamplerInfo& sampler_info) { + auto sampler_hash = sampler_info.hash(); + for (auto it = samplers_.find(sampler_hash); it != samplers_.end(); ++it) { + if (it->second->sampler_info == sampler_info) { + // Found a compatible sampler. + return it->second.get(); + } + } + + VkResult status = VK_SUCCESS; + + // Create a new sampler and cache it. + // TODO: Actually set the properties + VkSamplerCreateInfo sampler_create_info; + sampler_create_info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; + sampler_create_info.pNext = nullptr; + sampler_create_info.flags = 0; + sampler_create_info.magFilter = VK_FILTER_NEAREST; + sampler_create_info.minFilter = VK_FILTER_NEAREST; + sampler_create_info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; + sampler_create_info.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT; + sampler_create_info.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT; + sampler_create_info.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT; + sampler_create_info.mipLodBias = 0.0f; + sampler_create_info.anisotropyEnable = VK_FALSE; + sampler_create_info.maxAnisotropy = 1.0f; + sampler_create_info.compareEnable = VK_FALSE; + sampler_create_info.compareOp = VK_COMPARE_OP_ALWAYS; + sampler_create_info.minLod = 0.0f; + sampler_create_info.maxLod = 0.0f; + sampler_create_info.borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; + sampler_create_info.unnormalizedCoordinates = VK_FALSE; + VkSampler vk_sampler; + status = + vkCreateSampler(*device_, &sampler_create_info, nullptr, &vk_sampler); + CheckResult(status, "vkCreateSampler"); + if (status != VK_SUCCESS) { + return nullptr; + } + + auto sampler = new Sampler(); + sampler->sampler = vk_sampler; + sampler->sampler_info = sampler_info; + samplers_[sampler_hash] = std::unique_ptr(sampler); + + return sampler; +} + +TextureCache::Texture* TextureCache::AllocateTexture(TextureInfo texture_info) { + // Create an image first. 
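  // (The = {} value-initializes the struct, so any field not assigned below,
  // e.g. pNext, starts out zero rather than garbage.)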
+ VkImageCreateInfo image_info = {}; image_info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - image_info.pNext = nullptr; - image_info.flags = 0; - image_info.imageType = VK_IMAGE_TYPE_2D; + switch (texture_info.dimension) { + case Dimension::k1D: + image_info.imageType = VK_IMAGE_TYPE_1D; + break; + case Dimension::k2D: + image_info.imageType = VK_IMAGE_TYPE_2D; + break; + case Dimension::k3D: + image_info.imageType = VK_IMAGE_TYPE_3D; + break; + case Dimension::kCube: + image_info.imageType = VK_IMAGE_TYPE_2D; + image_info.flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT; + break; + default: + assert_unhandled_case(texture_info.dimension); + return nullptr; + } + + // TODO: Format image_info.format = VK_FORMAT_R8G8B8A8_UNORM; - image_info.extent = {8, 8, 1}; + image_info.extent = {texture_info.width + 1, texture_info.height + 1, + texture_info.depth + 1}; image_info.mipLevels = 1; image_info.arrayLayers = 1; image_info.samples = VK_SAMPLE_COUNT_1_BIT; - image_info.tiling = VK_IMAGE_TILING_LINEAR; - image_info.usage = VK_IMAGE_USAGE_SAMPLED_BIT; + image_info.tiling = VK_IMAGE_TILING_OPTIMAL; + image_info.usage = + VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT; image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; image_info.queueFamilyIndexCount = 0; image_info.pQueueFamilyIndices = nullptr; - image_info.initialLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - auto err = vkCreateImage(*device_, &image_info, nullptr, &grid_image_2d_); + image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + VkImage image; + auto err = vkCreateImage(*device_, &image_info, nullptr, &image); CheckResult(err, "vkCreateImage"); - VkMemoryRequirements memory_requirements; - vkGetImageMemoryRequirements(*device_, grid_image_2d_, &memory_requirements); - grid_image_2d_memory_ = device_->AllocateMemory( - memory_requirements, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); - err = vkBindImageMemory(*device_, grid_image_2d_, grid_image_2d_memory_, 0); + VkMemoryRequirements mem_requirements; + vkGetImageMemoryRequirements(*device_, image, &mem_requirements); + + // TODO: Use a circular buffer or something else to allocate this memory. + // The device has a limited amount (around 64) of memory allocations that we + // can make. + // Now that we have the size, back the image with GPU memory. + auto memory = device_->AllocateMemory(mem_requirements, 0); + err = vkBindImageMemory(*device_, image, memory, 0); CheckResult(err, "vkBindImageMemory"); + auto texture = new Texture(); + texture->format = image_info.format; + texture->image = image; + texture->memory_offset = 0; + texture->memory_size = mem_requirements.size; + texture->texture_info = texture_info; + texture->texture_memory = memory; + + // Create a default view, just for kicks. 
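  // (The default view uses an identity RGBA swizzle and covers only mip 0,
  // array layer 0; additional views can be appended to texture->views later.)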
VkImageViewCreateInfo view_info; view_info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; view_info.pNext = nullptr; view_info.flags = 0; - view_info.image = grid_image_2d_; + view_info.image = image; view_info.viewType = VK_IMAGE_VIEW_TYPE_2D; - view_info.format = VK_FORMAT_R8G8B8A8_UNORM; + view_info.format = image_info.format; view_info.components = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A, }; view_info.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; - err = vkCreateImageView(*device_, &view_info, nullptr, &grid_image_2d_view_); + VkImageView view; + err = vkCreateImageView(*device_, &view_info, nullptr, &view); CheckResult(err, "vkCreateImageView"); - - VkImageSubresource subresource; - subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - subresource.mipLevel = 0; - subresource.arrayLayer = 0; - VkSubresourceLayout layout; - vkGetImageSubresourceLayout(*device_, grid_image_2d_, &subresource, &layout); - - void* gpu_data = nullptr; - err = vkMapMemory(*device_, grid_image_2d_memory_, 0, layout.size, 0, - &gpu_data); - CheckResult(err, "vkMapMemory"); - - uint32_t grid_pixels[8 * 8]; - for (int y = 0; y < 8; ++y) { - for (int x = 0; x < 8; ++x) { - grid_pixels[y * 8 + x] = - ((y % 2 == 0) ^ (x % 2 != 0)) ? 0xFFFFFFFF : 0xFF0000FF; - } + if (err == VK_SUCCESS) { + auto texture_view = std::make_unique(); + texture_view->texture = texture; + texture_view->view = view; + texture->views.push_back(std::move(texture_view)); } - std::memcpy(gpu_data, grid_pixels, sizeof(grid_pixels)); - vkUnmapMemory(*device_, grid_image_2d_memory_); + return texture; +} + +bool TextureCache::FreeTexture(Texture* texture) { + // TODO(DrChat) + return false; +} + +bool TextureCache::UploadTexture2D(VkCommandBuffer command_buffer, + Texture* dest, TextureInfo src) { + // TODO: We need to allocate memory to use as a staging buffer. We can then + // raw copy the texture from system memory into the staging buffer and use a + // shader to convert the texture into a format consumable by the host GPU. + + // Need to have unique memory for every upload for at least one frame. If we + // run out of memory, we need to flush all queued upload commands to the GPU. + + // TODO: Upload memory here. + + // Insert a memory barrier into the command buffer to ensure the upload has + // finished before we copy it into the destination texture. + VkBufferMemoryBarrier upload_barrier = { + VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + NULL, + VK_ACCESS_HOST_WRITE_BIT, + VK_ACCESS_TRANSFER_READ_BIT, + VK_QUEUE_FAMILY_IGNORED, + VK_QUEUE_FAMILY_IGNORED, + staging_buffer_, + 0, + 2048 * 2048 * 4, + }; + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, + &upload_barrier, 0, nullptr); + + // Transition the texture into a transfer destination layout. 
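  // (Transitioning from VK_IMAGE_LAYOUT_UNDEFINED is legal here: it tells the
  // driver the current contents may be discarded, which is fine because the
  // copy below overwrites the whole image.)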
+ VkImageMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.pNext = nullptr; + barrier.srcAccessMask = 0; + barrier.dstAccessMask = + VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT; + barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = dest->image; + barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &barrier); + + // For now, just transfer the grid we uploaded earlier into the texture. + VkBufferImageCopy copy_region; + copy_region.bufferOffset = 0; + copy_region.bufferRowLength = 0; + copy_region.bufferImageHeight = 0; + copy_region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; + copy_region.imageOffset = {0, 0, 0}; + copy_region.imageExtent = {dest->texture_info.width + 1, + dest->texture_info.height + 1, + dest->texture_info.depth + 1}; + vkCmdCopyBufferToImage(command_buffer, staging_buffer_, dest->image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, ©_region); + + // Now transition the texture into a shader readonly source. + barrier.srcAccessMask = barrier.dstAccessMask; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.oldLayout = barrier.newLayout; + barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &barrier); + + return true; } VkDescriptorSet TextureCache::PrepareTextureSet( @@ -179,9 +400,11 @@ VkDescriptorSet TextureCache::PrepareTextureSet( // shaders. bool any_failed = false; any_failed = - !SetupTextureBindings(update_set_info, vertex_bindings) || any_failed; + !SetupTextureBindings(update_set_info, vertex_bindings, command_buffer) || + any_failed; any_failed = - !SetupTextureBindings(update_set_info, pixel_bindings) || any_failed; + !SetupTextureBindings(update_set_info, pixel_bindings, command_buffer) || + any_failed; if (any_failed) { XELOGW("Failed to setup one or more texture bindings"); // TODO(benvanik): actually bail out here? @@ -269,13 +492,16 @@ VkDescriptorSet TextureCache::PrepareTextureSet( bool TextureCache::SetupTextureBindings( UpdateSetInfo* update_set_info, - const std::vector& bindings) { + const std::vector& bindings, + VkCommandBuffer command_buffer) { bool any_failed = false; for (auto& binding : bindings) { uint32_t fetch_bit = 1 << binding.fetch_constant; if ((update_set_info->has_setup_fetch_mask & fetch_bit) == 0) { // Needs setup. 
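      // (has_setup_fetch_mask is a bitmask over the 32 fetch constants, so a
      // constant shared by the vertex and pixel shaders is only set up once
      // per draw.)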
- any_failed = !SetupTextureBinding(update_set_info, binding) || any_failed; + any_failed = + !SetupTextureBinding(update_set_info, binding, command_buffer) || + any_failed; update_set_info->has_setup_fetch_mask |= fetch_bit; } } @@ -283,7 +509,8 @@ bool TextureCache::SetupTextureBindings( } bool TextureCache::SetupTextureBinding(UpdateSetInfo* update_set_info, - const Shader::TextureBinding& binding) { + const Shader::TextureBinding& binding, + VkCommandBuffer command_buffer) { auto& regs = *register_file_; int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + binding.fetch_constant * 6; auto group = @@ -308,41 +535,21 @@ bool TextureCache::SetupTextureBinding(UpdateSetInfo* update_set_info, return false; // invalid texture used } + auto texture = Demand(texture_info, command_buffer); + auto sampler = Demand(sampler_info); + assert_true(texture != nullptr && sampler != nullptr); + trace_writer_->WriteMemoryRead(texture_info.guest_address, texture_info.input_length); - // TODO(benvanik): reuse. - VkSamplerCreateInfo sampler_create_info; - sampler_create_info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; - sampler_create_info.pNext = nullptr; - sampler_create_info.flags = 0; - sampler_create_info.magFilter = VK_FILTER_NEAREST; - sampler_create_info.minFilter = VK_FILTER_NEAREST; - sampler_create_info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; - sampler_create_info.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT; - sampler_create_info.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT; - sampler_create_info.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT; - sampler_create_info.mipLodBias = 0.0f; - sampler_create_info.anisotropyEnable = VK_FALSE; - sampler_create_info.maxAnisotropy = 1.0f; - sampler_create_info.compareEnable = VK_FALSE; - sampler_create_info.compareOp = VK_COMPARE_OP_ALWAYS; - sampler_create_info.minLod = 0.0f; - sampler_create_info.maxLod = 0.0f; - sampler_create_info.borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; - sampler_create_info.unnormalizedCoordinates = VK_FALSE; - VkSampler sampler; - auto err = vkCreateSampler(*device_, &sampler_create_info, nullptr, &sampler); - CheckResult(err, "vkCreateSampler"); - auto& sampler_write = update_set_info->sampler_infos[update_set_info->sampler_write_count++]; - sampler_write.sampler = sampler; + sampler_write.sampler = sampler->sampler; auto& image_write = update_set_info->image_2d_infos[update_set_info->image_2d_write_count++]; - image_write.imageView = grid_image_2d_view_; - image_write.imageLayout = VK_IMAGE_LAYOUT_GENERAL; + image_write.imageView = texture->views[0]->view; + image_write.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; return true; } diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h index 9ba3f3577..896bb3155 100644 --- a/src/xenia/gpu/vulkan/texture_cache.h +++ b/src/xenia/gpu/vulkan/texture_cache.h @@ -10,8 +10,12 @@ #ifndef XENIA_GPU_VULKAN_TEXTURE_CACHE_H_ #define XENIA_GPU_VULKAN_TEXTURE_CACHE_H_ +#include + #include "xenia/gpu/register_file.h" +#include "xenia/gpu/sampler_info.h" #include "xenia/gpu/shader.h" +#include "xenia/gpu/texture_info.h" #include "xenia/gpu/trace_writer.h" #include "xenia/gpu/xenos.h" #include "xenia/ui/vulkan/vulkan.h" @@ -50,14 +54,51 @@ class TextureCache { private: struct UpdateSetInfo; + struct TextureView; - void SetupGridImages(); + // This represents an uploaded Vulkan texture. 
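  // (A Texture owns its VkImage and the memory backing it; each view of it is
  // created on demand and kept alive through the views vector below.)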
+ struct Texture { + TextureInfo texture_info; + VkDeviceMemory texture_memory; + VkDeviceSize memory_offset; + VkDeviceSize memory_size; + VkImage image; + VkFormat format; + std::vector> views; + }; - bool SetupTextureBindings( - UpdateSetInfo* update_set_info, - const std::vector& bindings); + struct TextureView { + Texture* texture; + VkImageView view; + }; + + // Cached Vulkan sampler. + struct Sampler { + SamplerInfo sampler_info; + VkSampler sampler; + }; + + // Demands a texture. If command_buffer is null and the texture hasn't been + // uploaded to graphics memory already, we will return null and bail. + Texture* Demand(const TextureInfo& texture_info, + VkCommandBuffer command_buffer = nullptr); + Sampler* Demand(const SamplerInfo& sampler_info); + + // Allocates a new texture and memory to back it on the GPU. + Texture* AllocateTexture(TextureInfo texture_info); + bool FreeTexture(Texture* texture); + + // Queues commands to upload a texture from system memory, applying any + // conversions necessary. + bool UploadTexture2D(VkCommandBuffer command_buffer, Texture* dest, + TextureInfo src); + + bool SetupTextureBindings(UpdateSetInfo* update_set_info, + const std::vector& bindings, + VkCommandBuffer command_buffer = nullptr); bool SetupTextureBinding(UpdateSetInfo* update_set_info, - const Shader::TextureBinding& binding); + const Shader::TextureBinding& binding, + VkCommandBuffer command_buffer = nullptr); RegisterFile* register_file_ = nullptr; TraceWriter* trace_writer_ = nullptr; @@ -66,9 +107,11 @@ class TextureCache { VkDescriptorPool descriptor_pool_ = nullptr; VkDescriptorSetLayout texture_descriptor_set_layout_ = nullptr; - VkDeviceMemory grid_image_2d_memory_ = nullptr; - VkImage grid_image_2d_ = nullptr; - VkImageView grid_image_2d_view_ = nullptr; + // Temporary until we have circular buffers. + VkBuffer staging_buffer_ = nullptr; + VkDeviceMemory staging_buffer_mem_ = nullptr; + std::unordered_map> textures_; + std::unordered_map> samplers_; struct UpdateSetInfo { // Bitmap of all 32 fetch constants and whether they have been setup yet. diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 1bd05f16a..48c7d681d 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -217,6 +217,14 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, auto err = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); CheckResult(err, "vkBeginCommandBuffer"); + // Upload and set descriptors for all textures. + // We do this outside of the render pass so the texture cache can upload and + // convert textures. + auto samplers = PopulateSamplers(command_buffer, vertex_shader, pixel_shader); + if (!samplers) { + return false; + } + // Begin the render pass. // This will setup our framebuffer and begin the pass in the command buffer. auto render_state = render_cache_->BeginRenderPass( @@ -253,11 +261,10 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, return false; } - // Upload and set descriptors for all textures. - if (!PopulateSamplers(command_buffer, vertex_shader, pixel_shader)) { - render_cache_->EndRenderPass(); - return false; - } + // Bind samplers/textures. + vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, + pipeline_cache_->pipeline_layout(), 1, 1, &samplers, + 0, nullptr); // Actually issue the draw. 
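    // (No index_buffer_info presumably means an auto-indexed vkCmdDraw;
    // otherwise the index buffer is bound and a vkCmdDrawIndexed path is
    // taken.)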
if (!index_buffer_info) { @@ -471,9 +478,9 @@ bool VulkanCommandProcessor::PopulateVertexBuffers( return true; } -bool VulkanCommandProcessor::PopulateSamplers(VkCommandBuffer command_buffer, - VulkanShader* vertex_shader, - VulkanShader* pixel_shader) { +VkDescriptorSet VulkanCommandProcessor::PopulateSamplers( + VkCommandBuffer command_buffer, VulkanShader* vertex_shader, + VulkanShader* pixel_shader) { #if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // FINE_GRAINED_DRAW_SCOPES @@ -483,20 +490,63 @@ bool VulkanCommandProcessor::PopulateSamplers(VkCommandBuffer command_buffer, pixel_shader->texture_bindings()); if (!descriptor_set) { // Unable to bind set. - return false; + return nullptr; } - // Bind samplers/textures. - vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, - pipeline_cache_->pipeline_layout(), 1, 1, - &descriptor_set, 0, nullptr); - - return true; + return descriptor_set; } bool VulkanCommandProcessor::IssueCopy() { SCOPE_profile_cpu_f("gpu"); - // TODO(benvanik): resolve. + auto& regs = *register_file_; + + // This is used to resolve surfaces, taking them from EDRAM render targets + // to system memory. It can optionally clear color/depth surfaces, too. + // The command buffer has stuff for actually doing this by drawing, however + // we should be able to do it without that much easier. + + uint32_t copy_control = regs[XE_GPU_REG_RB_COPY_CONTROL].u32; + // Render targets 0-3, 4 = depth + uint32_t copy_src_select = copy_control & 0x7; + bool color_clear_enabled = (copy_control >> 8) & 0x1; + bool depth_clear_enabled = (copy_control >> 9) & 0x1; + auto copy_command = static_cast((copy_control >> 20) & 0x3); + + uint32_t copy_dest_info = regs[XE_GPU_REG_RB_COPY_DEST_INFO].u32; + auto copy_dest_endian = static_cast(copy_dest_info & 0x7); + uint32_t copy_dest_array = (copy_dest_info >> 3) & 0x1; + assert_true(copy_dest_array == 0); + uint32_t copy_dest_slice = (copy_dest_info >> 4) & 0x7; + assert_true(copy_dest_slice == 0); + auto copy_dest_format = + static_cast((copy_dest_info >> 7) & 0x3F); + uint32_t copy_dest_number = (copy_dest_info >> 13) & 0x7; + // assert_true(copy_dest_number == 0); // ? 
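  // Editor's sketch: the decoding in this function repeats the
  // "(value >> shift) & mask" extraction pattern; a hypothetical helper (not
  // in the tree) makes the field widths explicit:
  auto bit_range = [](uint32_t value, uint32_t shift, uint32_t bits) {
    return (value >> shift) & ((1u << bits) - 1u);
  };
  // e.g. copy_dest_bias below is bit_range(copy_dest_info, 16, 6).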
+ uint32_t copy_dest_bias = (copy_dest_info >> 16) & 0x3F; + // assert_true(copy_dest_bias == 0); + uint32_t copy_dest_swap = (copy_dest_info >> 25) & 0x1; + + uint32_t copy_dest_base = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32; + uint32_t copy_dest_pitch = regs[XE_GPU_REG_RB_COPY_DEST_PITCH].u32; + uint32_t copy_dest_height = (copy_dest_pitch >> 16) & 0x3FFF; + copy_dest_pitch &= 0x3FFF; + + // None of this is supported yet: + uint32_t copy_surface_slice = regs[XE_GPU_REG_RB_COPY_SURFACE_SLICE].u32; + assert_true(copy_surface_slice == 0); + uint32_t copy_func = regs[XE_GPU_REG_RB_COPY_FUNC].u32; + assert_true(copy_func == 0); + uint32_t copy_ref = regs[XE_GPU_REG_RB_COPY_REF].u32; + assert_true(copy_ref == 0); + uint32_t copy_mask = regs[XE_GPU_REG_RB_COPY_MASK].u32; + assert_true(copy_mask == 0); + + // RB_SURFACE_INFO + // http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html + uint32_t surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32; + uint32_t surface_pitch = surface_info & 0x3FFF; + auto surface_msaa = static_cast((surface_info >> 16) & 0x3); + return true; } diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 43aec9edd..b45be07fb 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -49,6 +49,8 @@ class VulkanCommandProcessor : public CommandProcessor { void ClearCaches() override; + RenderCache* render_cache() { return render_cache_.get(); } + private: bool SetupContext() override; void ShutdownContext() override; @@ -73,9 +75,9 @@ class VulkanCommandProcessor : public CommandProcessor { IndexBufferInfo* index_buffer_info); bool PopulateVertexBuffers(VkCommandBuffer command_buffer, VulkanShader* vertex_shader); - bool PopulateSamplers(VkCommandBuffer command_buffer, - VulkanShader* vertex_shader, - VulkanShader* pixel_shader); + VkDescriptorSet PopulateSamplers(VkCommandBuffer command_buffer, + VulkanShader* vertex_shader, + VulkanShader* pixel_shader); bool IssueCopy() override; xe::ui::vulkan::VulkanDevice* device_ = nullptr; From 4e2753970943649c747f2413b01216430da6eaa3 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Tue, 8 Mar 2016 17:57:04 -0600 Subject: [PATCH 08/77] Fix Vulkan texture drawing. 
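For reference, a minimal sketch of the set-1 layout the texture cache builds after this change (assuming kMaxTextureSamplers == 32): binding 0 keeps the standalone samplers, and bindings 1-4 hold the combined image-samplers for 1D/2D/3D/Cube, matching the textures%dD arrays in the translator.

VkDescriptorSetLayoutBinding bindings[5] = {};
bindings[0].binding = 0;
bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER;
bindings[0].descriptorCount = 32;
bindings[0].stageFlags =
    VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT;
for (int i = 0; i < 4; ++i) {
  bindings[1 + i].binding = 1 + i;
  bindings[1 + i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
  bindings[1 + i].descriptorCount = 32;
  bindings[1 + i].stageFlags =
      VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT;
}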
--- src/xenia/gpu/spirv_shader_translator.cc | 88 +++--- src/xenia/gpu/spirv_shader_translator.h | 2 +- src/xenia/gpu/vulkan/texture_cache.cc | 347 ++++++++++++++--------- src/xenia/gpu/vulkan/texture_cache.h | 75 +++-- 4 files changed, 310 insertions(+), 202 deletions(-) diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 57af04e24..3f991baa8 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -164,33 +164,37 @@ void SpirvShaderTranslator::StartTranslation() { push_constants_type, "push_consts"); // Texture bindings - Id samplers_t = b.makeSamplerType(); - Id img_t[] = { - b.makeImageType(float_type_, spv::Dim::Dim1D, false, false, false, 1, - spv::ImageFormat::ImageFormatUnknown), - b.makeImageType(float_type_, spv::Dim::Dim2D, false, false, false, 1, - spv::ImageFormat::ImageFormatUnknown), - b.makeImageType(float_type_, spv::Dim::Dim3D, false, false, false, 1, - spv::ImageFormat::ImageFormatUnknown), - b.makeImageType(float_type_, spv::Dim::DimCube, false, false, false, 1, - spv::ImageFormat::ImageFormatUnknown)}; + Id tex_t[] = {b.makeSampledImageType(b.makeImageType( + float_type_, spv::Dim::Dim1D, false, false, false, 1, + spv::ImageFormat::ImageFormatUnknown)), + b.makeSampledImageType(b.makeImageType( + float_type_, spv::Dim::Dim2D, false, false, false, 1, + spv::ImageFormat::ImageFormatUnknown)), + b.makeSampledImageType(b.makeImageType( + float_type_, spv::Dim::Dim3D, false, false, false, 1, + spv::ImageFormat::ImageFormatUnknown)), + b.makeSampledImageType(b.makeImageType( + float_type_, spv::Dim::DimCube, false, false, false, 1, + spv::ImageFormat::ImageFormatUnknown))}; - Id samplers_a = b.makeArrayType(samplers_t, b.makeUintConstant(32), 0); - Id img_a_t[] = {b.makeArrayType(img_t[0], b.makeUintConstant(32), 0), - b.makeArrayType(img_t[1], b.makeUintConstant(32), 0), - b.makeArrayType(img_t[2], b.makeUintConstant(32), 0), - b.makeArrayType(img_t[3], b.makeUintConstant(32), 0)}; + // Id samplers_a = b.makeArrayType(sampler_t, b.makeUintConstant(32), 0); + Id tex_a_t[] = {b.makeArrayType(tex_t[0], b.makeUintConstant(32), 0), + b.makeArrayType(tex_t[1], b.makeUintConstant(32), 0), + b.makeArrayType(tex_t[2], b.makeUintConstant(32), 0), + b.makeArrayType(tex_t[3], b.makeUintConstant(32), 0)}; - samplers_ = b.createVariable(spv::StorageClass::StorageClassUniformConstant, - samplers_a, "samplers"); - b.addDecoration(samplers_, spv::Decoration::DecorationDescriptorSet, 1); - b.addDecoration(samplers_, spv::Decoration::DecorationBinding, 0); + // TODO(DrChat): See texture_cache.cc - do we need separate samplers here? 
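  // (With combined image-samplers at bindings 1..4, the standalone sampler
  // array is no longer referenced by the generated code, so it stays
  // disabled here:)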
+ // samplers_ = + // b.createVariable(spv::StorageClass::StorageClassUniformConstant, + // samplers_a, "samplers"); + // b.addDecoration(samplers_, spv::Decoration::DecorationDescriptorSet, 1); + // b.addDecoration(samplers_, spv::Decoration::DecorationBinding, 0); for (int i = 0; i < 4; i++) { - img_[i] = b.createVariable(spv::StorageClass::StorageClassUniformConstant, - img_a_t[i], - xe::format_string("images%dD", i + 1).c_str()); - b.addDecoration(img_[i], spv::Decoration::DecorationDescriptorSet, 1); - b.addDecoration(img_[i], spv::Decoration::DecorationBinding, i + 1); + tex_[i] = b.createVariable(spv::StorageClass::StorageClassUniformConstant, + tex_a_t[i], + xe::format_string("textures%dD", i + 1).c_str()); + b.addDecoration(tex_[i], spv::Decoration::DecorationDescriptorSet, 1); + b.addDecoration(tex_[i], spv::Decoration::DecorationBinding, i + 1); } // Interpolators. @@ -674,25 +678,15 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( switch (instr.opcode) { case FetchOpcode::kTextureFetch: { - auto image_index = b.makeUintConstant(instr.operands[1].storage_index); - auto image_ptr = + auto texture_index = b.makeUintConstant(instr.operands[1].storage_index); + auto texture_ptr = b.createAccessChain(spv::StorageClass::StorageClassUniformConstant, - img_[dim_idx], std::vector({image_index})); - auto sampler_ptr = - b.createAccessChain(spv::StorageClass::StorageClassUniformConstant, - samplers_, std::vector({image_index})); - auto image = b.createLoad(image_ptr); - auto sampler = b.createLoad(sampler_ptr); - assert(b.isImageType(b.getTypeId(image))); - assert(b.isSamplerType(b.getTypeId(sampler))); - - auto sampled_image_type = b.makeSampledImageType(b.getImageType(image)); - auto tex = b.createBinOp(spv::Op::OpSampledImage, sampled_image_type, - image, sampler); + tex_[dim_idx], std::vector({texture_index})); + auto texture = b.createLoad(texture_ptr); spv::Builder::TextureParameters params = {0}; params.coords = src; - params.sampler = tex; + params.sampler = texture; dest = b.createTextureCall(spv::NoPrecision, vec4_float_type_, false, false, false, false, false, params); } break; @@ -1741,10 +1735,18 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, auto n_dst = b.getNumTypeComponents(storage_type); assert_true(n_el < n_dst); - constituents.push_back(source_value_id); - for (int i = n_el; i < n_dst; i++) { - // Pad with zeroes. - constituents.push_back(b.makeFloatConstant(0.f)); + if (n_el == 1) { + // Smear scalar. + for (int i = 0; i < n_dst; i++) { + constituents.push_back(source_value_id); + } + } else { + // FIXME: This may not work as intended. + constituents.push_back(source_value_id); + for (int i = n_el; i < n_dst; i++) { + // Pad with zeroes. 
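+ // Restating the widening rule in effect here as a self-contained sketch
+ // (plain floats stand in for SPIR-V ids; the lambda is illustrative only):
+ //   auto widen = [](std::vector<float> src, size_t n_dst) {
+ //     if (src.size() == 1) {
+ //       return std::vector<float>(n_dst, src[0]);  // smear: x -> xxxx
+ //     }
+ //     src.resize(n_dst, 0.0f);  // keep components, zero-pad the rest
+ //     return src;
+ //   };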
+ constituents.push_back(b.makeFloatConstant(0.f)); + } } source_value_id = diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index ed4356322..3327dccbd 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -122,7 +122,7 @@ class SpirvShaderTranslator : public ShaderTranslator { spv::Id interpolators_ = 0; spv::Id frag_outputs_ = 0; spv::Id samplers_ = 0; - spv::Id img_[4] = {0}; // Images {1D, 2D, 3D, Cube} + spv::Id tex_[4] = {0}; // Images {1D, 2D, 3D, Cube} // Map of {binding -> {offset -> spv input}} std::map> vertex_binding_map_; diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index 8a8e2e2f4..5c6e42b8b 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -42,7 +42,7 @@ TextureCache::TextureCache(RegisterFile* register_file, VkDescriptorPoolSize pool_sizes[2]; pool_sizes[0].type = VK_DESCRIPTOR_TYPE_SAMPLER; pool_sizes[0].descriptorCount = 32; - pool_sizes[1].type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + pool_sizes[1].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; pool_sizes[1].descriptorCount = 32; descriptor_pool_info.poolSizeCount = 2; descriptor_pool_info.pPoolSizes = pool_sizes; @@ -63,7 +63,7 @@ TextureCache::TextureCache(RegisterFile* register_file, for (int i = 0; i < 4; ++i) { auto& texture_binding = bindings[1 + i]; texture_binding.binding = 1 + i; - texture_binding.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + texture_binding.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; texture_binding.descriptorCount = kMaxTextureSamplers; texture_binding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT; @@ -94,35 +94,37 @@ TextureCache::TextureCache(RegisterFile* register_file, err = vkCreateBuffer(*device_, &staging_buffer_info, nullptr, &staging_buffer_); CheckResult(err, "vkCreateBuffer"); - - if (err == VK_SUCCESS) { - VkMemoryRequirements staging_buffer_reqs; - vkGetBufferMemoryRequirements(*device_, staging_buffer_, - &staging_buffer_reqs); - staging_buffer_mem_ = device_->AllocateMemory(staging_buffer_reqs); - assert_not_null(staging_buffer_mem_); - - err = vkBindBufferMemory(*device_, staging_buffer_, staging_buffer_mem_, 0); - CheckResult(err, "vkBindBufferMemory"); - - // Upload a grid into the staging buffer. - uint32_t* gpu_data = nullptr; - err = - vkMapMemory(*device_, staging_buffer_mem_, 0, staging_buffer_info.size, - 0, reinterpret_cast(&gpu_data)); - CheckResult(err, "vkMapMemory"); - - int width = 2048; - int height = 2048; - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - gpu_data[y * width + x] = - ((y % 32 < 16) ^ (x % 32 >= 16)) ? 0xFF0000FF : 0xFFFFFFFF; - } - } - - vkUnmapMemory(*device_, staging_buffer_mem_); + if (err != VK_SUCCESS) { + // This isn't good. + assert_always(); + return; } + + VkMemoryRequirements staging_buffer_reqs; + vkGetBufferMemoryRequirements(*device_, staging_buffer_, + &staging_buffer_reqs); + staging_buffer_mem_ = device_->AllocateMemory(staging_buffer_reqs); + assert_not_null(staging_buffer_mem_); + + err = vkBindBufferMemory(*device_, staging_buffer_, staging_buffer_mem_, 0); + CheckResult(err, "vkBindBufferMemory"); + + // Upload a grid into the staging buffer. 
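  // Stepping back to the descriptor layout built near the top of this
  // constructor: each of the four texture bindings boils down to an array
  // binding of the shape below. Values mirror the code above; the local is
  // purely illustrative.
  VkDescriptorSetLayoutBinding example_texture_binding = {};
  example_texture_binding.binding = 1;  // bindings 1..4 = 1D, 2D, 3D, cube
  example_texture_binding.descriptorType =
      VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
  example_texture_binding.descriptorCount = kMaxTextureSamplers;  // 32
  example_texture_binding.stageFlags =
      VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT;
  example_texture_binding.pImmutableSamplers = nullptr;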
+ uint32_t* gpu_data = nullptr; + err = vkMapMemory(*device_, staging_buffer_mem_, 0, staging_buffer_info.size, + 0, reinterpret_cast(&gpu_data)); + CheckResult(err, "vkMapMemory"); + + int width = 2048; + int height = 2048; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + gpu_data[y * width + x] = + ((y % 32 < 16) ^ (x % 32 >= 16)) ? 0xFF0000FF : 0xFFFFFFFF; + } + } + + vkUnmapMemory(*device_, staging_buffer_mem_); } TextureCache::~TextureCache() { @@ -131,9 +133,141 @@ TextureCache::~TextureCache() { vkDestroyDescriptorPool(*device_, descriptor_pool_, nullptr); } +TextureCache::Texture* TextureCache::AllocateTexture( + const TextureInfo& texture_info) { + // Create an image first. + VkImageCreateInfo image_info = {}; + image_info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; + switch (texture_info.dimension) { + case Dimension::k1D: + image_info.imageType = VK_IMAGE_TYPE_1D; + break; + case Dimension::k2D: + image_info.imageType = VK_IMAGE_TYPE_2D; + break; + case Dimension::k3D: + image_info.imageType = VK_IMAGE_TYPE_3D; + break; + case Dimension::kCube: + image_info.imageType = VK_IMAGE_TYPE_2D; + image_info.flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT; + break; + default: + assert_unhandled_case(texture_info.dimension); + return nullptr; + } + + // TODO: Format + image_info.format = VK_FORMAT_R8G8B8A8_UNORM; + image_info.extent = {texture_info.width + 1, texture_info.height + 1, + texture_info.depth + 1}; + image_info.mipLevels = 1; + image_info.arrayLayers = 1; + image_info.samples = VK_SAMPLE_COUNT_1_BIT; + image_info.tiling = VK_IMAGE_TILING_OPTIMAL; + image_info.usage = VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_TRANSFER_SRC_BIT | + VK_IMAGE_USAGE_TRANSFER_DST_BIT; + image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + image_info.queueFamilyIndexCount = 0; + image_info.pQueueFamilyIndices = nullptr; + image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + VkImage image; + auto err = vkCreateImage(*device_, &image_info, nullptr, &image); + CheckResult(err, "vkCreateImage"); + + VkMemoryRequirements mem_requirements; + vkGetImageMemoryRequirements(*device_, image, &mem_requirements); + + // TODO: Use a circular buffer or something else to allocate this memory. + // The device has a limited amount (around 64) of memory allocations that we + // can make. + // Now that we have the size, back the image with GPU memory. + auto memory = device_->AllocateMemory(mem_requirements, 0); + if (!memory) { + // Crap. + assert_always(); + vkDestroyImage(*device_, image, nullptr); + return nullptr; + } + + err = vkBindImageMemory(*device_, image, memory, 0); + CheckResult(err, "vkBindImageMemory"); + + auto texture = new Texture(); + texture->format = image_info.format; + texture->image = image; + texture->image_layout = image_info.initialLayout; + texture->image_memory = memory; + texture->memory_offset = 0; + texture->memory_size = mem_requirements.size; + texture->texture_info = texture_info; + + // Create a default view, just for kicks. 
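  // Note on the extent math above: the guest TextureInfo dimensions are
  // assumed to be stored biased by minus one (a common register encoding on
  // this GPU), so the true texel extent adds one on each axis:
  VkExtent3D example_extent = {texture_info.width + 1,  // 255 encodes 256
                               texture_info.height + 1,
                               texture_info.depth + 1};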
+ VkImageViewCreateInfo view_info; + view_info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + view_info.pNext = nullptr; + view_info.flags = 0; + view_info.image = image; + view_info.viewType = VK_IMAGE_VIEW_TYPE_2D; + view_info.format = image_info.format; + view_info.components = { + VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, + VK_COMPONENT_SWIZZLE_A, + }; + view_info.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + VkImageView view; + err = vkCreateImageView(*device_, &view_info, nullptr, &view); + CheckResult(err, "vkCreateImageView"); + if (err == VK_SUCCESS) { + auto texture_view = std::make_unique(); + texture_view->texture = texture; + texture_view->view = view; + texture->views.push_back(std::move(texture_view)); + } + + return texture; +} + +bool TextureCache::FreeTexture(Texture* texture) { + // TODO(DrChat) + return false; +} + +TextureCache::Texture* TextureCache::DemandResolveTexture( + const TextureInfo& texture_info, TextureFormat format, + uint32_t* out_offset_x, uint32_t* out_offset_y) { + // Check to see if we've already used a texture at this location. + auto texture = LookupAddress( + texture_info.guest_address, texture_info.size_2d.block_width, + texture_info.size_2d.block_height, format, out_offset_x, out_offset_y); + if (texture) { + return texture; + } + + // Check resolve textures. + for (auto it = resolve_textures_.begin(); it != resolve_textures_.end(); + ++it) { + texture = (*it).get(); + if (texture_info.guest_address == texture->texture_info.guest_address && + texture_info.size_2d.logical_width == + texture->texture_info.size_2d.logical_width && + texture_info.size_2d.logical_height == + texture->texture_info.size_2d.logical_height) { + // Exact match. + return texture; + } + } + + // No texture at this location. Make a new one. + texture = AllocateTexture(texture_info); + resolve_textures_.push_back(std::unique_ptr(texture)); + return texture; +} + TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info, VkCommandBuffer command_buffer) { - // Run a tight loop to scan for an existing texture. + // Run a tight loop to scan for an exact match existing texture. auto texture_hash = texture_info.hash(); for (auto it = textures_.find(texture_hash); it != textures_.end(); ++it) { if (it->second->texture_info == texture_info) { @@ -141,15 +275,25 @@ TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info, } } - // Though we didn't find an exact match, that doesn't mean we're out of the - // woods yet. This texture could either be a portion of another texture or - // vice versa. Check for overlap before uploading. - for (auto it = textures_.begin(); it != textures_.end(); ++it) { + // Check resolve textures. + for (auto it = resolve_textures_.begin(); it != resolve_textures_.end(); + ++it) { + auto texture = (*it).get(); + if (texture_info.guest_address == texture->texture_info.guest_address && + texture_info.size_2d.logical_width == + texture->texture_info.size_2d.logical_width && + texture_info.size_2d.logical_height == + texture->texture_info.size_2d.logical_height) { + // Exact match. + // TODO: Lazy match + texture->texture_info = texture_info; + textures_[texture_hash] = std::move(*it); + } } if (!command_buffer) { - // Texture not found and no command buffer was passed allowing us to upload - // a new one. + // Texture not found and no command buffer was passed, preventing us from + // uploading a new one. 
return nullptr; } @@ -167,6 +311,12 @@ TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info, return nullptr; } + // Though we didn't find an exact match, that doesn't mean we're out of the + // woods yet. This texture could either be a portion of another texture or + // vice versa. Copy any overlapping textures into this texture. + for (auto it = textures_.begin(); it != textures_.end(); ++it) { + } + textures_[texture_hash] = std::unique_ptr(texture); return texture; @@ -199,7 +349,7 @@ TextureCache::Sampler* TextureCache::Demand(const SamplerInfo& sampler_info) { sampler_create_info.anisotropyEnable = VK_FALSE; sampler_create_info.maxAnisotropy = 1.0f; sampler_create_info.compareEnable = VK_FALSE; - sampler_create_info.compareOp = VK_COMPARE_OP_ALWAYS; + sampler_create_info.compareOp = VK_COMPARE_OP_NEVER; sampler_create_info.minLod = 0.0f; sampler_create_info.maxLod = 0.0f; sampler_create_info.borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; @@ -220,95 +370,21 @@ TextureCache::Sampler* TextureCache::Demand(const SamplerInfo& sampler_info) { return sampler; } -TextureCache::Texture* TextureCache::AllocateTexture(TextureInfo texture_info) { - // Create an image first. - VkImageCreateInfo image_info = {}; - image_info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - switch (texture_info.dimension) { - case Dimension::k1D: - image_info.imageType = VK_IMAGE_TYPE_1D; - break; - case Dimension::k2D: - image_info.imageType = VK_IMAGE_TYPE_2D; - break; - case Dimension::k3D: - image_info.imageType = VK_IMAGE_TYPE_3D; - break; - case Dimension::kCube: - image_info.imageType = VK_IMAGE_TYPE_2D; - image_info.flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT; - break; - default: - assert_unhandled_case(texture_info.dimension); - return nullptr; +TextureCache::Texture* TextureCache::LookupAddress( + uint32_t guest_address, uint32_t width, uint32_t height, + TextureFormat format, uint32_t* offset_x, uint32_t* offset_y) { + for (auto it = textures_.begin(); it != textures_.end(); ++it) { + const auto& texture_info = it->second->texture_info; + if (texture_info.guest_address == guest_address && + texture_info.dimension == Dimension::k2D && + texture_info.size_2d.input_width == width && + texture_info.size_2d.input_height == height) { + return it->second.get(); + } } - // TODO: Format - image_info.format = VK_FORMAT_R8G8B8A8_UNORM; - image_info.extent = {texture_info.width + 1, texture_info.height + 1, - texture_info.depth + 1}; - image_info.mipLevels = 1; - image_info.arrayLayers = 1; - image_info.samples = VK_SAMPLE_COUNT_1_BIT; - image_info.tiling = VK_IMAGE_TILING_OPTIMAL; - image_info.usage = - VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT; - image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - image_info.queueFamilyIndexCount = 0; - image_info.pQueueFamilyIndices = nullptr; - image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - VkImage image; - auto err = vkCreateImage(*device_, &image_info, nullptr, &image); - CheckResult(err, "vkCreateImage"); - - VkMemoryRequirements mem_requirements; - vkGetImageMemoryRequirements(*device_, image, &mem_requirements); - - // TODO: Use a circular buffer or something else to allocate this memory. - // The device has a limited amount (around 64) of memory allocations that we - // can make. - // Now that we have the size, back the image with GPU memory. 
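  // Returning to Demand(const SamplerInfo&) above: filled out completely,
  // the sampler it creates looks like the sketch below. Fields not visible
  // in the hunk are reasonable defaults, not values taken from the Xenia
  // source, and 'device' stands for the VkDevice in use.
  VkSamplerCreateInfo sampler_info = {};
  sampler_info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
  sampler_info.magFilter = VK_FILTER_NEAREST;
  sampler_info.minFilter = VK_FILTER_NEAREST;
  sampler_info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
  sampler_info.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT;
  sampler_info.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT;
  sampler_info.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT;
  sampler_info.anisotropyEnable = VK_FALSE;
  sampler_info.maxAnisotropy = 1.0f;
  sampler_info.compareEnable = VK_FALSE;
  sampler_info.compareOp = VK_COMPARE_OP_NEVER;  // as changed above
  sampler_info.minLod = 0.0f;
  sampler_info.maxLod = 0.0f;
  sampler_info.borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK;
  sampler_info.unnormalizedCoordinates = VK_FALSE;
  VkSampler vk_sampler = VK_NULL_HANDLE;
  VkResult res = vkCreateSampler(device, &sampler_info, nullptr, &vk_sampler);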
- auto memory = device_->AllocateMemory(mem_requirements, 0); - err = vkBindImageMemory(*device_, image, memory, 0); - CheckResult(err, "vkBindImageMemory"); - - auto texture = new Texture(); - texture->format = image_info.format; - texture->image = image; - texture->memory_offset = 0; - texture->memory_size = mem_requirements.size; - texture->texture_info = texture_info; - texture->texture_memory = memory; - - // Create a default view, just for kicks. - VkImageViewCreateInfo view_info; - view_info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - view_info.pNext = nullptr; - view_info.flags = 0; - view_info.image = image; - view_info.viewType = VK_IMAGE_VIEW_TYPE_2D; - view_info.format = image_info.format; - view_info.components = { - VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, - VK_COMPONENT_SWIZZLE_A, - }; - view_info.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; - VkImageView view; - err = vkCreateImageView(*device_, &view_info, nullptr, &view); - CheckResult(err, "vkCreateImageView"); - if (err == VK_SUCCESS) { - auto texture_view = std::make_unique(); - texture_view->texture = texture; - texture_view->view = view; - texture->views.push_back(std::move(texture_view)); - } - - return texture; -} - -bool TextureCache::FreeTexture(Texture* texture) { - // TODO(DrChat) - return false; + // TODO: Try to match at an offset. + return nullptr; } bool TextureCache::UploadTexture2D(VkCommandBuffer command_buffer, @@ -359,8 +435,8 @@ bool TextureCache::UploadTexture2D(VkCommandBuffer command_buffer, // For now, just transfer the grid we uploaded earlier into the texture. VkBufferImageCopy copy_region; copy_region.bufferOffset = 0; - copy_region.bufferRowLength = 0; - copy_region.bufferImageHeight = 0; + copy_region.bufferRowLength = 2048; + copy_region.bufferImageHeight = 2048; copy_region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; copy_region.imageOffset = {0, 0, 0}; copy_region.imageExtent = {dest->texture_info.width + 1, @@ -378,6 +454,7 @@ bool TextureCache::UploadTexture2D(VkCommandBuffer command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 1, &barrier); + dest->image_layout = barrier.newLayout; return true; } @@ -427,6 +504,8 @@ VkDescriptorSet TextureCache::PrepareTextureSet( VkWriteDescriptorSet descriptor_writes[4]; std::memset(descriptor_writes, 0, sizeof(descriptor_writes)); uint32_t descriptor_write_count = 0; + /* + // TODO(DrChat): Do we really need to separate samplers and images here? 
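  // On the bufferRowLength change above: the staging buffer holds a fixed
  // 2048x2048 grid, so the copy must declare that row pitch explicitly. A
  // bufferRowLength of 0 tells Vulkan the rows are tightly packed at
  // imageExtent.width texels, which would sample the grid askew for any
  // destination narrower than 2048. In sketch form (layout/handles are
  // illustrative):
  //
  //   VkBufferImageCopy copy = {};
  //   copy.bufferRowLength = 2048;    // texels per staging-buffer row
  //   copy.bufferImageHeight = 2048;  // rows per staging-buffer slice
  //   copy.imageExtent = {dest_width, dest_height, 1};
  //   vkCmdCopyBufferToImage(cmd, staging_buffer, image,
  //                          VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy);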
if (update_set_info->sampler_write_count) { auto& sampler_write = descriptor_writes[descriptor_write_count++]; sampler_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; @@ -438,6 +517,7 @@ VkDescriptorSet TextureCache::PrepareTextureSet( sampler_write.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER; sampler_write.pImageInfo = update_set_info->sampler_infos; } + */ if (update_set_info->image_1d_write_count) { auto& image_write = descriptor_writes[descriptor_write_count++]; image_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; @@ -446,7 +526,7 @@ VkDescriptorSet TextureCache::PrepareTextureSet( image_write.dstBinding = 1; image_write.dstArrayElement = 0; image_write.descriptorCount = update_set_info->image_1d_write_count; - image_write.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + image_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; image_write.pImageInfo = update_set_info->image_1d_infos; } if (update_set_info->image_2d_write_count) { @@ -457,7 +537,7 @@ VkDescriptorSet TextureCache::PrepareTextureSet( image_write.dstBinding = 2; image_write.dstArrayElement = 0; image_write.descriptorCount = update_set_info->image_2d_write_count; - image_write.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + image_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; image_write.pImageInfo = update_set_info->image_2d_infos; } if (update_set_info->image_3d_write_count) { @@ -468,7 +548,7 @@ VkDescriptorSet TextureCache::PrepareTextureSet( image_write.dstBinding = 3; image_write.dstArrayElement = 0; image_write.descriptorCount = update_set_info->image_3d_write_count; - image_write.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + image_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; image_write.pImageInfo = update_set_info->image_3d_infos; } if (update_set_info->image_cube_write_count) { @@ -479,7 +559,7 @@ VkDescriptorSet TextureCache::PrepareTextureSet( image_write.dstBinding = 4; image_write.dstArrayElement = 0; image_write.descriptorCount = update_set_info->image_cube_write_count; - image_write.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + image_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; image_write.pImageInfo = update_set_info->image_cube_infos; } if (descriptor_write_count) { @@ -542,14 +622,11 @@ bool TextureCache::SetupTextureBinding(UpdateSetInfo* update_set_info, trace_writer_->WriteMemoryRead(texture_info.guest_address, texture_info.input_length); - auto& sampler_write = - update_set_info->sampler_infos[update_set_info->sampler_write_count++]; - sampler_write.sampler = sampler->sampler; - auto& image_write = update_set_info->image_2d_infos[update_set_info->image_2d_write_count++]; image_write.imageView = texture->views[0]->view; image_write.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + image_write.sampler = sampler->sampler; return true; } diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h index 896bb3155..6264a4a98 100644 --- a/src/xenia/gpu/vulkan/texture_cache.h +++ b/src/xenia/gpu/vulkan/texture_cache.h @@ -28,6 +28,30 @@ namespace vulkan { // class TextureCache { public: + struct TextureView; + + // This represents an uploaded Vulkan texture. + struct Texture { + TextureInfo texture_info; + std::vector> views; + + // True if we know all info about this texture, false otherwise. + // (e.g. 
we resolve to system memory and may not know the full details about + // this texture) + bool full_texture; + VkFormat format; + VkImage image; + VkImageLayout image_layout; + VkDeviceMemory image_memory; + VkDeviceSize memory_offset; + VkDeviceSize memory_size; + }; + + struct TextureView { + Texture* texture; + VkImageView view; + }; + TextureCache(RegisterFile* register_file, TraceWriter* trace_writer, ui::vulkan::VulkanDevice* device); ~TextureCache(); @@ -49,28 +73,24 @@ class TextureCache { // TODO(benvanik): Resolve. // TODO(benvanik): ReadTexture. + // Demands a texture for the purpose of resolving from EDRAM. This either + // creates a new texture or returns a previously created texture. texture_info + // is not required to be completely filled out, just guest_address and size. + // + // It's possible that this may return an image that is larger than the + // requested size (e.g. resolving into a bigger texture) or an image that + // must have an offset applied. If so, the caller must handle this. + // At the very least, it's guaranteed that the image will be large enough to + // hold the requested size. + Texture* DemandResolveTexture(const TextureInfo& texture_info, + TextureFormat format, uint32_t* out_offset_x, + uint32_t* out_offset_y); + // Clears all cached content. void ClearCache(); private: struct UpdateSetInfo; - struct TextureView; - - // This represents an uploaded Vulkan texture. - struct Texture { - TextureInfo texture_info; - VkDeviceMemory texture_memory; - VkDeviceSize memory_offset; - VkDeviceSize memory_size; - VkImage image; - VkFormat format; - std::vector> views; - }; - - struct TextureView { - Texture* texture; - VkImageView view; - }; // Cached Vulkan sampler. struct Sampler { @@ -78,18 +98,28 @@ class TextureCache { VkSampler sampler; }; + // Allocates a new texture and memory to back it on the GPU. + Texture* AllocateTexture(const TextureInfo& texture_info); + bool FreeTexture(Texture* texture); + // Demands a texture. If command_buffer is null and the texture hasn't been // uploaded to graphics memory already, we will return null and bail. Texture* Demand(const TextureInfo& texture_info, VkCommandBuffer command_buffer = nullptr); Sampler* Demand(const SamplerInfo& sampler_info); - // Allocates a new texture and memory to back it on the GPU. - Texture* AllocateTexture(TextureInfo texture_info); - bool FreeTexture(Texture* texture); + // Looks for a texture either containing or matching these parameters. + // Caller is responsible for checking if the texture returned is an exact + // match or just contains the texture given by the parameters. + // If offset_x and offset_y are not null, this may return a texture that + // contains this image at an offset. + Texture* LookupAddress(uint32_t guest_address, uint32_t width, + uint32_t height, TextureFormat format, + uint32_t* offset_x, uint32_t* offset_y); // Queues commands to upload a texture from system memory, applying any - // conversions necessary. + // conversions necessary. This may flush the command buffer to the GPU if we + // run out of staging memory. bool UploadTexture2D(VkCommandBuffer command_buffer, Texture* dest, TextureInfo src); @@ -112,13 +142,12 @@ class TextureCache { VkDeviceMemory staging_buffer_mem_ = nullptr; std::unordered_map> textures_; std::unordered_map> samplers_; + std::vector> resolve_textures_; struct UpdateSetInfo { // Bitmap of all 32 fetch constants and whether they have been setup yet. // This prevents duplication across the vertex and pixel shader. 
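    // A sketch of the intended use (assumed semantics; the helper name is
    // illustrative): a fetch-constant slot flips its bit the first time it
    // is bound, and a second binding of the same slot by the other shader
    // stage is skipped.
    static bool TryMarkFetchSlot(uint32_t* mask, uint32_t slot) {
      const uint32_t bit = 1u << slot;
      if (*mask & bit) {
        return false;  // Already set up by the vertex or pixel shader.
      }
      *mask |= bit;
      return true;
    }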
uint32_t has_setup_fetch_mask; - uint32_t sampler_write_count = 0; - VkDescriptorImageInfo sampler_infos[32]; uint32_t image_1d_write_count = 0; VkDescriptorImageInfo image_1d_infos[32]; uint32_t image_2d_write_count = 0; From 86cb40f0c69dc78deb03453dbd1e9bee76448f71 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Tue, 8 Mar 2016 18:02:03 -0600 Subject: [PATCH 09/77] Change how the render cache renders into EDRAM. Rendering directly into the EDRAM buffer is bad because we don't know how the GPU lays out memory when it draws. Instead, we create temporary render targets and copy EDRAM contents to/from those temporary RTs before and after each draw. --- src/xenia/gpu/vulkan/render_cache.cc | 347 +++++++++++++++++++-------- src/xenia/gpu/vulkan/render_cache.h | 50 +++- 2 files changed, 295 insertions(+), 102 deletions(-) diff --git a/src/xenia/gpu/vulkan/render_cache.cc b/src/xenia/gpu/vulkan/render_cache.cc index 5637d44eb..a2b496330 100644 --- a/src/xenia/gpu/vulkan/render_cache.cc +++ b/src/xenia/gpu/vulkan/render_cache.cc @@ -71,34 +71,6 @@ VkFormat DepthRenderTargetFormatToVkFormat(DepthRenderTargetFormat format) { } } -// Cached view into the EDRAM memory. -// The image is aliased to a region of the edram_memory_ based on the tile -// parameters. -// TODO(benvanik): reuse VkImage's with multiple VkViews for compatible -// formats? -class CachedTileView { - public: - // Key identifying the view in the cache. - TileViewKey key; - // Image mapped into EDRAM. - VkImage image = nullptr; - // Simple view on the image matching the format. - VkImageView image_view = nullptr; - - CachedTileView(VkDevice device, VkDeviceMemory edram_memory, - TileViewKey view_key); - ~CachedTileView(); - - bool IsEqual(const TileViewKey& other_key) const { - auto a = reinterpret_cast(&key); - auto b = reinterpret_cast(&other_key); - return *a == *b; - } - - private: - VkDevice device_ = nullptr; -}; - // Cached framebuffer referencing tile attachments. // Each framebuffer is specific to a render pass. Ugh. class CachedFramebuffer { @@ -151,9 +123,11 @@ class CachedRenderPass { VkDevice device_ = nullptr; }; -CachedTileView::CachedTileView(VkDevice device, VkDeviceMemory edram_memory, +CachedTileView::CachedTileView(ui::vulkan::VulkanDevice* device, + VkCommandBuffer command_buffer, + VkDeviceMemory edram_memory, TileViewKey view_key) - : device_(device), key(std::move(view_key)) { + : device_(*device), key(std::move(view_key)) { // Map format to Vulkan. VkFormat vulkan_format = VK_FORMAT_UNDEFINED; uint32_t bpp = 4; @@ -191,8 +165,8 @@ CachedTileView::CachedTileView(VkDevice device, VkDeviceMemory edram_memory, image_info.extent.depth = 1; image_info.mipLevels = 1; image_info.arrayLayers = 1; - // TODO(benvanik): native MSAA support? - image_info.samples = VK_SAMPLE_COUNT_1_BIT; + image_info.samples = + static_cast(VK_SAMPLE_COUNT_1_BIT); image_info.tiling = VK_IMAGE_TILING_OPTIMAL; image_info.usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | @@ -203,19 +177,17 @@ CachedTileView::CachedTileView(VkDevice device, VkDeviceMemory edram_memory, image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; image_info.queueFamilyIndexCount = 0; image_info.pQueueFamilyIndices = nullptr; - image_info.initialLayout = VK_IMAGE_LAYOUT_PREINITIALIZED; + image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; auto err = vkCreateImage(device_, &image_info, nullptr, &image); CheckResult(err, "vkCreateImage"); - // Verify our assumptions about memory layout are correct. 
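  // The EDRAM addressing that both the removed aliasing code and the buffer
  // copies elsewhere in this patch rely on: one EDRAM tile is 80x16 samples
  // at 4 bytes per sample, i.e. 5120 bytes, so tile offsets convert to byte
  // offsets as below (constant names are illustrative):
  constexpr uint32_t kEdramTileWidth = 80;    // samples per tile row
  constexpr uint32_t kEdramTileHeight = 16;   // rows per tile
  constexpr uint32_t kEdramBytesPerSample = 4;
  constexpr uint32_t kEdramTileBytes =
      kEdramTileWidth * kEdramTileHeight * kEdramBytesPerSample;  // == 5120
  inline VkDeviceSize EdramTileOffsetToBytes(uint32_t tile_offset) {
    return VkDeviceSize(tile_offset) * kEdramTileBytes;
  }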
- VkDeviceSize edram_offset = key.tile_offset * 5120; VkMemoryRequirements memory_requirements; - vkGetImageMemoryRequirements(device, image, &memory_requirements); - assert_true(edram_offset + memory_requirements.size <= kEdramBufferCapacity); - assert_true(edram_offset % memory_requirements.alignment == 0); + vkGetImageMemoryRequirements(*device, image, &memory_requirements); - // Bind to the region of EDRAM we occupy. - err = vkBindImageMemory(device_, image, edram_memory, edram_offset); + // Bind to a newly allocated chunk. + // TODO: Alias from a really big buffer? + memory = device->AllocateMemory(memory_requirements, 0); + err = vkBindImageMemory(device_, image, memory, 0); CheckResult(err, "vkBindImageMemory"); // Create the image view we'll use to attach it to a framebuffer. @@ -242,11 +214,34 @@ CachedTileView::CachedTileView(VkDevice device, VkDeviceMemory edram_memory, CheckResult(err, "vkCreateImageView"); // TODO(benvanik): transition to general layout? + VkImageMemoryBarrier image_barrier; + image_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + image_barrier.pNext = nullptr; + image_barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + image_barrier.dstAccessMask = + key.color_or_depth ? VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT + : VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + image_barrier.dstAccessMask |= + VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + image_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; + image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.image = image; + image_barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + image_barrier.subresourceRange.baseMipLevel = 0; + image_barrier.subresourceRange.levelCount = 1; + image_barrier.subresourceRange.baseArrayLayer = 0; + image_barrier.subresourceRange.layerCount = 1; + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &image_barrier); } CachedTileView::~CachedTileView() { vkDestroyImageView(device_, image_view, nullptr); vkDestroyImage(device_, image, nullptr); + vkFreeMemory(device_, memory, nullptr); } CachedFramebuffer::CachedFramebuffer( @@ -423,9 +418,10 @@ bool CachedRenderPass::IsCompatible( RenderCache::RenderCache(RegisterFile* register_file, ui::vulkan::VulkanDevice* device) - : register_file_(register_file), device_(*device) { + : register_file_(register_file), device_(device) { + VkResult status = VK_SUCCESS; + // Create the buffer we'll bind to our memory. - // We do this first so we can get the right memory type. VkBufferCreateInfo buffer_info; buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; buffer_info.pNext = nullptr; @@ -436,55 +432,42 @@ RenderCache::RenderCache(RegisterFile* register_file, buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; buffer_info.queueFamilyIndexCount = 0; buffer_info.pQueueFamilyIndices = nullptr; - auto err = vkCreateBuffer(*device, &buffer_info, nullptr, &edram_buffer_); - CheckResult(err, "vkCreateBuffer"); + status = vkCreateBuffer(*device, &buffer_info, nullptr, &edram_buffer_); + CheckResult(status, "vkCreateBuffer"); // Query requirements for the buffer. // It should be 1:1. 
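  // For reference, the UNDEFINED -> GENERAL transition issued for each new
  // tile view above, factored into a standalone sketch. The access masks
  // mirror the code above; note that it uses the color aspect even for
  // depth targets.
  inline void TransitionNewTileView(VkCommandBuffer cmd, VkImage image,
                                    bool color_or_depth) {
    VkImageMemoryBarrier barrier = {};
    barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
    barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
    barrier.dstAccessMask =
        (color_or_depth ? VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT
                        : VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT) |
        VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT;
    barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
    barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL;
    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.image = image;
    barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};
    vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                         VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
                         nullptr, 1, &barrier);
  }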
VkMemoryRequirements buffer_requirements; - vkGetBufferMemoryRequirements(device_, edram_buffer_, &buffer_requirements); + vkGetBufferMemoryRequirements(*device_, edram_buffer_, &buffer_requirements); assert_true(buffer_requirements.size == kEdramBufferCapacity); - // Create a dummy image so we can see what memory bits it requires. - // They should overlap with the buffer requirements but are likely more - // strict. - VkImageCreateInfo test_image_info; - test_image_info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - test_image_info.pNext = nullptr; - test_image_info.flags = 0; - test_image_info.imageType = VK_IMAGE_TYPE_2D; - test_image_info.format = VK_FORMAT_R8G8B8A8_UINT; - test_image_info.extent.width = 128; - test_image_info.extent.height = 128; - test_image_info.extent.depth = 1; - test_image_info.mipLevels = 1; - test_image_info.arrayLayers = 1; - test_image_info.samples = VK_SAMPLE_COUNT_1_BIT; - test_image_info.tiling = VK_IMAGE_TILING_OPTIMAL; - test_image_info.usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; - test_image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - test_image_info.queueFamilyIndexCount = 0; - test_image_info.pQueueFamilyIndices = nullptr; - test_image_info.initialLayout = VK_IMAGE_LAYOUT_GENERAL; - VkImage test_image = nullptr; - err = vkCreateImage(device_, &test_image_info, nullptr, &test_image); - CheckResult(err, "vkCreateImage"); - VkMemoryRequirements image_requirements; - vkGetImageMemoryRequirements(device_, test_image, &image_requirements); - vkDestroyImage(device_, test_image, nullptr); - assert_true((image_requirements.memoryTypeBits & - buffer_requirements.memoryTypeBits) != 0); - // Allocate EDRAM memory. - VkMemoryRequirements memory_requirements; - memory_requirements.size = buffer_requirements.size; - memory_requirements.alignment = buffer_requirements.alignment; - memory_requirements.memoryTypeBits = image_requirements.memoryTypeBits; // TODO(benvanik): do we need it host visible? - edram_memory_ = device->AllocateMemory(memory_requirements, 0); + edram_memory_ = device->AllocateMemory(buffer_requirements); + assert_not_null(edram_memory_); // Bind buffer to map our entire memory. - vkBindBufferMemory(device_, edram_buffer_, edram_memory_, 0); + status = vkBindBufferMemory(*device_, edram_buffer_, edram_memory_, 0); + CheckResult(status, "vkBindBufferMemory"); + + if (status == VK_SUCCESS) { + status = vkBindBufferMemory(*device_, edram_buffer_, edram_memory_, 0); + CheckResult(status, "vkBindBufferMemory"); + + // Upload a grid into the EDRAM buffer. + uint32_t* gpu_data = nullptr; + status = vkMapMemory(*device_, edram_memory_, 0, buffer_requirements.size, + 0, reinterpret_cast(&gpu_data)); + CheckResult(status, "vkMapMemory"); + + if (status == VK_SUCCESS) { + for (int i = 0; i < kEdramBufferCapacity / 4; i++) { + gpu_data[i] = (i % 8) >= 4 ? 0xFF0000FF : 0xFFFFFFFF; + } + + vkUnmapMemory(*device_, edram_memory_); + } + } } RenderCache::~RenderCache() { @@ -503,8 +486,8 @@ RenderCache::~RenderCache() { cached_tile_views_.clear(); // Release underlying EDRAM memory. - vkDestroyBuffer(device_, edram_buffer_, nullptr); - vkFreeMemory(device_, edram_memory_, nullptr); + vkDestroyBuffer(*device_, edram_buffer_, nullptr); + vkFreeMemory(*device_, edram_memory_, nullptr); } const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, @@ -542,13 +525,74 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, } // Lookup or generate a new render pass and framebuffer for the new state. 
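  // The dirty-tracking idiom used by the SetShadowRegister() calls above,
  // restated as a self-contained sketch:
  inline bool UpdateShadow(uint32_t* shadow, uint32_t new_value) {
    if (*shadow == new_value) {
      return false;  // Unchanged; nothing to rebuild.
    }
    *shadow = new_value;
    return true;  // Caller ORs the result into a 'dirty' flag.
  }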
- if (!ConfigureRenderPass(config, &render_pass, &framebuffer)) { + if (!ConfigureRenderPass(command_buffer, config, &render_pass, + &framebuffer)) { return nullptr; } current_state_.render_pass = render_pass; current_state_.render_pass_handle = render_pass->handle; current_state_.framebuffer = framebuffer; current_state_.framebuffer_handle = framebuffer->handle; + + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = nullptr; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = edram_buffer_; + barrier.offset = 0; + barrier.size = 0; + + // Copy EDRAM buffer into render targets with tight packing. + VkBufferImageCopy region; + region.bufferRowLength = 0; + region.bufferImageHeight = 0; + region.imageOffset = {0, 0, 0}; + region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; + for (int i = 0; i < 4; i++) { + auto target = current_state_.framebuffer->color_attachments[i]; + if (!target) { + continue; + } + + region.bufferOffset = target->key.tile_offset * 5120; + + // Wait for any potential copies to finish. + barrier.offset = region.bufferOffset; + barrier.size = + target->key.tile_width * 80 * target->key.tile_height * 16 * 4; + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, + &barrier, 0, nullptr); + + region.imageExtent = {target->key.tile_width * 80u, + target->key.tile_height * 16u, 1}; + vkCmdCopyBufferToImage(command_buffer, edram_buffer_, target->image, + VK_IMAGE_LAYOUT_GENERAL, 1, ®ion); + } + + // Depth + auto depth_target = current_state_.framebuffer->depth_stencil_attachment; + if (depth_target) { + region.imageSubresource = { + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1}; + region.bufferOffset = depth_target->key.tile_offset * 5120; + + // Wait for any potential copies to finish. + barrier.offset = region.bufferOffset; + barrier.size = depth_target->key.tile_width * 80 * + depth_target->key.tile_height * 16 * 4; + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, + &barrier, 0, nullptr); + + region.imageExtent = {depth_target->key.tile_width * 80u, + depth_target->key.tile_height * 16u, 1}; + vkCmdCopyBufferToImage(command_buffer, edram_buffer_, depth_target->image, + VK_IMAGE_LAYOUT_GENERAL, 1, ®ion); + } } if (!render_pass) { return nullptr; @@ -593,6 +637,7 @@ bool RenderCache::ParseConfiguration(RenderConfiguration* config) { // RB_SURFACE_INFO // http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html config->surface_pitch_px = regs.rb_surface_info & 0x3FFF; + // config->surface_height_px = (regs.rb_surface_info >> 18) & 0x3FFF; config->surface_msaa = static_cast((regs.rb_surface_info >> 16) & 0x3); @@ -643,7 +688,8 @@ bool RenderCache::ParseConfiguration(RenderConfiguration* config) { return true; } -bool RenderCache::ConfigureRenderPass(RenderConfiguration* config, +bool RenderCache::ConfigureRenderPass(VkCommandBuffer command_buffer, + RenderConfiguration* config, CachedRenderPass** out_render_pass, CachedFramebuffer** out_framebuffer) { *out_render_pass = nullptr; @@ -662,7 +708,7 @@ bool RenderCache::ConfigureRenderPass(RenderConfiguration* config, // If no render pass was found in the cache create a new one. 
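  // How a color surface maps onto a tile-view key in the code above. A
  // sketch only: the exact field types are assumptions based on the packed
  // 8-byte TileViewKey in render_cache.h, and edram_base, surface_pitch_px,
  // surface_height_px, and color_format stand for the values pulled from
  // RenderConfiguration.
  TileViewKey example_key;
  example_key.tile_offset = edram_base;              // in 5120-byte tiles
  example_key.tile_width = surface_pitch_px / 80;    // 80 samples per row
  example_key.tile_height = surface_height_px / 16;  // 16 rows per tile
  example_key.color_or_depth = 1;                    // 1 = color, 0 = depth
  example_key.edram_format = uint16_t(color_format);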
if (!render_pass) { - render_pass = new CachedRenderPass(device_, *config); + render_pass = new CachedRenderPass(*device_, *config); cached_render_passes_.push_back(render_pass); } @@ -688,7 +734,8 @@ bool RenderCache::ConfigureRenderPass(RenderConfiguration* config, color_key.tile_height = config->surface_height_px / 16; color_key.color_or_depth = 1; color_key.edram_format = static_cast(config->color[i].format); - target_color_attachments[i] = GetTileView(color_key); + target_color_attachments[i] = + FindOrCreateTileView(command_buffer, color_key); if (!target_color_attachments) { XELOGE("Failed to get tile view for color attachment"); return false; @@ -702,14 +749,15 @@ bool RenderCache::ConfigureRenderPass(RenderConfiguration* config, depth_stencil_key.color_or_depth = 0; depth_stencil_key.edram_format = static_cast(config->depth_stencil.format); - auto target_depth_stencil_attachment = GetTileView(depth_stencil_key); + auto target_depth_stencil_attachment = + FindOrCreateTileView(command_buffer, depth_stencil_key); if (!target_depth_stencil_attachment) { XELOGE("Failed to get tile view for depth/stencil attachment"); return false; } framebuffer = new CachedFramebuffer( - device_, render_pass->handle, config->surface_pitch_px, + *device_, render_pass->handle, config->surface_pitch_px, config->surface_height_px, target_color_attachments, target_depth_stencil_attachment); render_pass->cached_framebuffers.push_back(framebuffer); @@ -720,7 +768,22 @@ bool RenderCache::ConfigureRenderPass(RenderConfiguration* config, return true; } -CachedTileView* RenderCache::GetTileView(const TileViewKey& view_key) { +CachedTileView* RenderCache::FindOrCreateTileView( + VkCommandBuffer command_buffer, const TileViewKey& view_key) { + auto tile_view = FindTileView(view_key); + if (tile_view) { + return tile_view; + } + + // Create a new tile and add to the cache. + tile_view = + new CachedTileView(device_, command_buffer, edram_memory_, view_key); + cached_tile_views_.push_back(tile_view); + + return tile_view; +} + +CachedTileView* RenderCache::FindTileView(const TileViewKey& view_key) const { // Check the cache. // TODO(benvanik): better lookup. for (auto tile_view : cached_tile_views_) { @@ -729,25 +792,115 @@ CachedTileView* RenderCache::GetTileView(const TileViewKey& view_key) { } } - // Create a new tile and add to the cache. - auto tile_view = new CachedTileView(device_, edram_memory_, view_key); - cached_tile_views_.push_back(tile_view); - return tile_view; + return nullptr; } void RenderCache::EndRenderPass() { assert_not_null(current_command_buffer_); - auto command_buffer = current_command_buffer_; - current_command_buffer_ = nullptr; // End the render pass. - vkCmdEndRenderPass(command_buffer); + vkCmdEndRenderPass(current_command_buffer_); + + // Copy all render targets back into our EDRAM buffer. + // Don't bother waiting on this command to complete, as next render pass may + // reuse previous framebuffer attachments. If they need this, they will wait. + // TODO: Should we bother re-tiling the images on copy back? 
+ VkBufferImageCopy region; + region.bufferRowLength = 0; + region.bufferImageHeight = 0; + region.imageOffset = {0, 0, 0}; + region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; + for (int i = 0; i < 4; i++) { + auto target = current_state_.framebuffer->color_attachments[i]; + if (!target) { + continue; + } + + region.bufferOffset = target->key.tile_offset * 5120; + region.imageExtent = {target->key.tile_width * 80u, + target->key.tile_height * 16u, 1}; + vkCmdCopyImageToBuffer(current_command_buffer_, target->image, + VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, ®ion); + } + + // Depth/stencil + auto depth_target = current_state_.framebuffer->depth_stencil_attachment; + if (depth_target) { + region.imageSubresource = { + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1}; + region.bufferOffset = depth_target->key.tile_offset * 5120; + region.imageExtent = {depth_target->key.tile_width * 80u, + depth_target->key.tile_height * 16u, 1}; + vkCmdCopyImageToBuffer(current_command_buffer_, depth_target->image, + VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, ®ion); + } + + current_command_buffer_ = nullptr; } void RenderCache::ClearCache() { // TODO(benvanik): caching. } +void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer, + uint32_t edram_base, VkImage image, + VkImageLayout image_layout, + bool color_or_depth, int32_t offset_x, + int32_t offset_y, uint32_t width, + uint32_t height) { + // Transition the texture into a transfer destination layout. + VkImageMemoryBarrier image_barrier; + image_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + image_barrier.pNext = nullptr; + image_barrier.srcAccessMask = 0; + image_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + image_barrier.oldLayout = image_layout; + image_barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.image = image; + image_barrier.subresourceRange = {0, 0, 1, 0, 1}; + image_barrier.subresourceRange.aspectMask = + color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + + VkBufferMemoryBarrier buffer_barrier; + buffer_barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + buffer_barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + buffer_barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + buffer_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + buffer_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + buffer_barrier.buffer = edram_buffer_; + buffer_barrier.offset = edram_base * 5120; + buffer_barrier.size = width * height * 4; // TODO: Calculate this accurately. + + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, + &buffer_barrier, 1, &image_barrier); + + // Issue the copy command. + VkBufferImageCopy region; + region.bufferImageHeight = 0; + region.bufferOffset = edram_base * 5120; + region.bufferRowLength = 0; + region.imageExtent = {width, height, 1}; + region.imageOffset = {offset_x, offset_y, 0}; + region.imageSubresource = {0, 0, 0, 1}; + region.imageSubresource.aspectMask = + color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + vkCmdCopyBufferToImage(command_buffer, edram_buffer_, image, image_layout, 1, + ®ion); + + // Transition the image back into its previous layout. 
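+ // A hypothetical call site for RawCopyToImage(), e.g. resolving a
+ // 1280x720 color surface out of EDRAM into a cached texture (all
+ // parameter values here are illustrative only):
+ //
+ //   render_cache->RawCopyToImage(command_buffer, /* edram_base */ 0,
+ //                                texture->image, texture->image_layout,
+ //                                /* color_or_depth */ true,
+ //                                /* offset_x, offset_y */ 0, 0,
+ //                                /* width, height */ 1280, 720);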
+ image_barrier.srcAccessMask = image_barrier.dstAccessMask; + image_barrier.dstAccessMask = 0; + std::swap(image_barrier.oldLayout, image_barrier.newLayout); + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &image_barrier); +} + bool RenderCache::SetShadowRegister(uint32_t* dest, uint32_t register_name) { uint32_t value = register_file_->values[register_name].u32; if (*dest == value) { diff --git a/src/xenia/gpu/vulkan/render_cache.h b/src/xenia/gpu/vulkan/render_cache.h index 4a1574e9b..13397bf1b 100644 --- a/src/xenia/gpu/vulkan/render_cache.h +++ b/src/xenia/gpu/vulkan/render_cache.h @@ -41,6 +41,35 @@ struct TileViewKey { }; static_assert(sizeof(TileViewKey) == 8, "Key must be tightly packed"); +// Cached view representing EDRAM memory. +// TODO(benvanik): reuse VkImage's with multiple VkViews for compatible +// formats? +class CachedTileView { + public: + // Key identifying the view in the cache. + TileViewKey key; + // Image + VkImage image = nullptr; + // Simple view on the image matching the format. + VkImageView image_view = nullptr; + // Memory buffer + VkDeviceMemory memory = nullptr; + + CachedTileView(ui::vulkan::VulkanDevice* device, + VkCommandBuffer command_buffer, VkDeviceMemory edram_memory, + TileViewKey view_key); + ~CachedTileView(); + + bool IsEqual(const TileViewKey& other_key) const { + auto a = reinterpret_cast(&key); + auto b = reinterpret_cast(&other_key); + return *a == *b; + } + + private: + VkDevice device_ = nullptr; +}; + // Parsed render configuration from the current render state. struct RenderConfiguration { // Render mode (color+depth, depth-only, etc). @@ -230,22 +259,33 @@ class RenderCache { // Clears all cached content. void ClearCache(); + // Queues commands to copy EDRAM contents into an image. + void RawCopyToImage(VkCommandBuffer command_buffer, uint32_t edram_base, + VkImage image, VkImageLayout image_layout, + bool color_or_depth, int32_t offset_x, int32_t offset_y, + uint32_t width, uint32_t height); + private: // Parses the current state into a configuration object. bool ParseConfiguration(RenderConfiguration* config); + // Finds a tile view. Returns nullptr if none found matching the key. + CachedTileView* FindTileView(const TileViewKey& view_key) const; + + // Gets or creates a tile view with the given parameters. + CachedTileView* FindOrCreateTileView(VkCommandBuffer command_buffer, + const TileViewKey& view_key); + // Gets or creates a render pass and frame buffer for the given configuration. // This attempts to reuse as much as possible across render passes and // framebuffers. - bool ConfigureRenderPass(RenderConfiguration* config, + bool ConfigureRenderPass(VkCommandBuffer command_buffer, + RenderConfiguration* config, CachedRenderPass** out_render_pass, CachedFramebuffer** out_framebuffer); - // Gets or creates a tile view with the given parameters. - CachedTileView* GetTileView(const TileViewKey& view_key); - RegisterFile* register_file_ = nullptr; - VkDevice device_ = nullptr; + ui::vulkan::VulkanDevice* device_ = nullptr; // Entire 10MiB of EDRAM, aliased to hell by various VkImages. VkDeviceMemory edram_memory_ = nullptr; From f839a1293f449a83ac6aadfa8ecaf82d9a36da0f Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Thu, 10 Mar 2016 12:59:48 -0600 Subject: [PATCH 10/77] Add a specialized copy command buffer to the vulkan swap chain --- src/xenia/ui/vulkan/vulkan_swap_chain.cc | 71 +++++++++++++++++------- src/xenia/ui/vulkan/vulkan_swap_chain.h | 6 ++ 2 files changed, 56 insertions(+), 21 deletions(-) diff --git a/src/xenia/ui/vulkan/vulkan_swap_chain.cc b/src/xenia/ui/vulkan/vulkan_swap_chain.cc index 15d2795fd..ad383f32f 100644 --- a/src/xenia/ui/vulkan/vulkan_swap_chain.cc +++ b/src/xenia/ui/vulkan/vulkan_swap_chain.cc @@ -187,6 +187,10 @@ bool VulkanSwapChain::Initialize(VkSurfaceKHR surface) { vkAllocateCommandBuffers(*device_, &cmd_buffer_info, &render_cmd_buffer_); CheckResult(err, "vkCreateCommandBuffer"); + // Create another command buffer that handles image copies. + err = vkAllocateCommandBuffers(*device_, &cmd_buffer_info, ©_cmd_buffer_); + CheckResult(err, "vkCreateCommandBuffer"); + // Create the render pass used to draw to the swap chain. // The actual framebuffer attached will depend on which image we are drawing // into. @@ -194,7 +198,7 @@ bool VulkanSwapChain::Initialize(VkSurfaceKHR surface) { color_attachment.flags = 0; color_attachment.format = surface_format_; color_attachment.samples = VK_SAMPLE_COUNT_1_BIT; - color_attachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + color_attachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; // CLEAR; color_attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; color_attachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; color_attachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; @@ -388,6 +392,7 @@ bool VulkanSwapChain::Begin() { // Reset all command buffers. vkResetCommandBuffer(render_cmd_buffer_, 0); + vkResetCommandBuffer(copy_cmd_buffer_, 0); auto& current_buffer = buffers_[current_buffer_index_]; // Build the command buffer that will execute all queued rendering buffers. @@ -399,14 +404,18 @@ bool VulkanSwapChain::Begin() { err = vkBeginCommandBuffer(render_cmd_buffer_, &begin_info); CheckResult(err, "vkBeginCommandBuffer"); - // Transition the image to a format we can render to. + // Start recording the copy command buffer as well. + err = vkBeginCommandBuffer(copy_cmd_buffer_, &begin_info); + CheckResult(err, "vkBeginCommandBuffer"); + + // Transition the image to a format we can copy to. VkImageMemoryBarrier pre_image_memory_barrier; pre_image_memory_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; pre_image_memory_barrier.pNext = nullptr; pre_image_memory_barrier.srcAccessMask = 0; - pre_image_memory_barrier.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + pre_image_memory_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; pre_image_memory_barrier.oldLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; - pre_image_memory_barrier.newLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + pre_image_memory_barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; pre_image_memory_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; pre_image_memory_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; pre_image_memory_barrier.image = current_buffer.image; @@ -416,23 +425,37 @@ bool VulkanSwapChain::Begin() { pre_image_memory_barrier.subresourceRange.levelCount = 1; pre_image_memory_barrier.subresourceRange.baseArrayLayer = 0; pre_image_memory_barrier.subresourceRange.layerCount = 1; + vkCmdPipelineBarrier(copy_cmd_buffer_, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &pre_image_memory_barrier); + + // First: Issue a command to clear the render target. 
+ VkImageSubresourceRange clear_range = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + VkClearColorValue clear_color; + clear_color.float32[0] = 238 / 255.0f; + clear_color.float32[1] = 238 / 255.0f; + clear_color.float32[2] = 238 / 255.0f; + clear_color.float32[3] = 1.0f; + if (FLAGS_vulkan_random_clear_color) { + clear_color.float32[0] = + rand() / static_cast(RAND_MAX); // NOLINT(runtime/threadsafe_fn) + clear_color.float32[1] = 1.0f; + clear_color.float32[2] = 0.0f; + } + vkCmdClearColorImage(copy_cmd_buffer_, current_buffer.image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, &clear_color, 1, + &clear_range); + + // Transition the image to a color attachment target for drawing. + pre_image_memory_barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + pre_image_memory_barrier.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + pre_image_memory_barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + pre_image_memory_barrier.newLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; vkCmdPipelineBarrier(render_cmd_buffer_, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 1, &pre_image_memory_barrier); // Begin render pass. - VkClearValue color_clear_value; - color_clear_value.color.float32[0] = 238 / 255.0f; - color_clear_value.color.float32[1] = 238 / 255.0f; - color_clear_value.color.float32[2] = 238 / 255.0f; - color_clear_value.color.float32[3] = 1.0f; - if (FLAGS_vulkan_random_clear_color) { - color_clear_value.color.float32[0] = - rand() / static_cast(RAND_MAX); // NOLINT(runtime/threadsafe_fn) - color_clear_value.color.float32[1] = 1.0f; - color_clear_value.color.float32[2] = 0.0f; - } - VkClearValue clear_values[] = {color_clear_value}; VkRenderPassBeginInfo render_pass_begin_info; render_pass_begin_info.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; render_pass_begin_info.pNext = nullptr; @@ -442,9 +465,8 @@ bool VulkanSwapChain::Begin() { render_pass_begin_info.renderArea.offset.y = 0; render_pass_begin_info.renderArea.extent.width = surface_width_; render_pass_begin_info.renderArea.extent.height = surface_height_; - render_pass_begin_info.clearValueCount = - static_cast(xe::countof(clear_values)); - render_pass_begin_info.pClearValues = clear_values; + render_pass_begin_info.clearValueCount = 0; + render_pass_begin_info.pClearValues = nullptr; vkCmdBeginRenderPass(render_cmd_buffer_, &render_pass_begin_info, VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS); @@ -458,6 +480,7 @@ bool VulkanSwapChain::End() { vkCmdEndRenderPass(render_cmd_buffer_); // Transition the image to a format the presentation engine can source from. + // FIXME: Do we need more synchronization here between the copy buffer? VkImageMemoryBarrier post_image_memory_barrier; post_image_memory_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; post_image_memory_barrier.pNext = nullptr; @@ -483,14 +506,20 @@ bool VulkanSwapChain::End() { auto err = vkEndCommandBuffer(render_cmd_buffer_); CheckResult(err, "vkEndCommandBuffer"); + err = vkEndCommandBuffer(copy_cmd_buffer_); + CheckResult(err, "vkEndCommandBuffer"); + + VkCommandBuffer command_buffers[] = {copy_cmd_buffer_, render_cmd_buffer_}; + // Submit rendering. 
VkSubmitInfo render_submit_info; render_submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; render_submit_info.pNext = nullptr; render_submit_info.waitSemaphoreCount = 0; render_submit_info.pWaitSemaphores = nullptr; - render_submit_info.commandBufferCount = 1; - render_submit_info.pCommandBuffers = &render_cmd_buffer_; + render_submit_info.commandBufferCount = + static_cast(xe::countof(command_buffers)); + render_submit_info.pCommandBuffers = command_buffers; render_submit_info.signalSemaphoreCount = 0; render_submit_info.pSignalSemaphores = nullptr; { diff --git a/src/xenia/ui/vulkan/vulkan_swap_chain.h b/src/xenia/ui/vulkan/vulkan_swap_chain.h index 1d1f578c3..773a52053 100644 --- a/src/xenia/ui/vulkan/vulkan_swap_chain.h +++ b/src/xenia/ui/vulkan/vulkan_swap_chain.h @@ -35,11 +35,16 @@ class VulkanSwapChain { uint32_t surface_width() const { return surface_width_; } uint32_t surface_height() const { return surface_height_; } + VkImage surface_image() const { + return buffers_[current_buffer_index_].image; + } // Render pass used for compositing. VkRenderPass render_pass() const { return render_pass_; } // Render command buffer, active inside the render pass from Begin to End. VkCommandBuffer render_cmd_buffer() const { return render_cmd_buffer_; } + // Copy commands, ran before the render command buffer. + VkCommandBuffer copy_cmd_buffer() const { return copy_cmd_buffer_; } // Initializes the swap chain with the given WSI surface. bool Initialize(VkSurfaceKHR surface); @@ -74,6 +79,7 @@ class VulkanSwapChain { uint32_t surface_height_ = 0; VkFormat surface_format_ = VK_FORMAT_UNDEFINED; VkCommandPool cmd_pool_ = nullptr; + VkCommandBuffer copy_cmd_buffer_ = nullptr; VkCommandBuffer render_cmd_buffer_ = nullptr; VkRenderPass render_pass_ = nullptr; VkSemaphore image_available_semaphore_ = nullptr; From f903a559b372755a5d0dbc1f66c95281020f5f34 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Thu, 10 Mar 2016 13:01:39 -0600 Subject: [PATCH 11/77] Blit Vulkan CP output to the main window's swap chain --- .../gpu/vulkan/vulkan_graphics_system.cc | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/xenia/gpu/vulkan/vulkan_graphics_system.cc b/src/xenia/gpu/vulkan/vulkan_graphics_system.cc index 74ec57849..27b2ff073 100644 --- a/src/xenia/gpu/vulkan/vulkan_graphics_system.cc +++ b/src/xenia/gpu/vulkan/vulkan_graphics_system.cc @@ -19,14 +19,14 @@ #include "xenia/gpu/vulkan/vulkan_command_processor.h" #include "xenia/gpu/vulkan/vulkan_gpu_flags.h" #include "xenia/ui/vulkan/vulkan_provider.h" +#include "xenia/ui/vulkan/vulkan_swap_chain.h" #include "xenia/ui/window.h" namespace xe { namespace gpu { namespace vulkan { -VulkanGraphicsSystem::VulkanGraphicsSystem() = default; - +VulkanGraphicsSystem::VulkanGraphicsSystem() {} VulkanGraphicsSystem::~VulkanGraphicsSystem() = default; X_STATUS VulkanGraphicsSystem::Setup(cpu::Processor* processor, @@ -74,12 +74,25 @@ void VulkanGraphicsSystem::Swap(xe::ui::UIEvent* e) { return; } - // Blit the frontbuffer. 
-  // display_context_->blitter()->BlitTexture2D(
-  //     static_cast<GLuint>(swap_state.front_buffer_texture),
-  //     Rect2D(0, 0, swap_state.width, swap_state.height),
-  //     Rect2D(0, 0, target_window_->width(), target_window_->height()),
-  //     GL_LINEAR, false);
+  auto swap_chain = display_context_->swap_chain();
+  auto copy_cmd_buffer = swap_chain->copy_cmd_buffer();
+
+  VkImageBlit region;
+  region.srcSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
+  region.srcOffsets[0] = {0, 0, 0};
+  region.srcOffsets[1] = {static_cast<int32_t>(swap_state.width),
+                          static_cast<int32_t>(swap_state.height), 1};
+
+  region.dstSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
+  region.dstOffsets[0] = {0, 0, 0};
+  region.dstOffsets[1] = {static_cast<int32_t>(swap_chain->surface_width()),
+                          static_cast<int32_t>(swap_chain->surface_height()),
+                          1};
+  vkCmdBlitImage(copy_cmd_buffer,
+                 reinterpret_cast<VkImage>(swap_state.front_buffer_texture),
+                 VK_IMAGE_LAYOUT_GENERAL, swap_chain->surface_image(),
+                 VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region,
+                 VK_FILTER_LINEAR);
 }
 
 }  // namespace vulkan

From 635d095b8777bef5ded36a7a92cd2835b31ce9e2 Mon Sep 17 00:00:00 2001
From: "Dr. Chat"
Date: Thu, 10 Mar 2016 20:39:46 -0600
Subject: [PATCH 12/77] RenderCache: Track color target / depth target usage,
 refactor RawCopyToImage

---
 src/xenia/gpu/vulkan/render_cache.cc | 131 +++++++++++++++------------
 src/xenia/gpu/vulkan/render_cache.h  |   7 +-
 2 files changed, 79 insertions(+), 59 deletions(-)

diff --git a/src/xenia/gpu/vulkan/render_cache.cc b/src/xenia/gpu/vulkan/render_cache.cc
index a2b496330..727fa59e2 100644
--- a/src/xenia/gpu/vulkan/render_cache.cc
+++ b/src/xenia/gpu/vulkan/render_cache.cc
@@ -508,6 +508,7 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer,
   dirty |= SetShadowRegister(&regs.rb_color1_info, XE_GPU_REG_RB_COLOR1_INFO);
   dirty |= SetShadowRegister(&regs.rb_color2_info, XE_GPU_REG_RB_COLOR2_INFO);
   dirty |= SetShadowRegister(&regs.rb_color3_info, XE_GPU_REG_RB_COLOR3_INFO);
+  dirty |= SetShadowRegister(&regs.rb_depthcontrol, XE_GPU_REG_RB_DEPTHCONTROL);
   dirty |= SetShadowRegister(&regs.rb_depth_info, XE_GPU_REG_RB_DEPTH_INFO);
   dirty |= SetShadowRegister(&regs.pa_sc_window_scissor_tl,
                              XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL);
   dirty |= SetShadowRegister(&regs.pa_sc_window_scissor_br,
                              XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR);
@@ -529,6 +530,12 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer,
                            &framebuffer)) {
     return nullptr;
   }
+
+  for (int i = 0; i < 4; i++) {
+    config->color[i].used = pixel_shader->writes_color_target(i);
+  }
+  config->depth_stencil.used = !!(regs.rb_depthcontrol & (0x4 | 0x2));
+
   current_state_.render_pass = render_pass;
   current_state_.render_pass_handle = render_pass->handle;
   current_state_.framebuffer = framebuffer;
@@ -550,10 +557,33 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer,
     region.bufferRowLength = 0;
     region.bufferImageHeight = 0;
     region.imageOffset = {0, 0, 0};
+
+    // Depth
+    auto depth_target = current_state_.framebuffer->depth_stencil_attachment;
+    if (depth_target && current_state_.config.depth_stencil.used) {
+      region.imageSubresource = {
+          VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1};
+      region.bufferOffset = depth_target->key.tile_offset * 5120;
+
+      // Wait for any potential copies to finish.
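The constants in the EDRAM copies above all fall out of the tile geometry: one tile is 80x16 samples, and at 4 bytes per sample that is 80 * 16 * 4 = 5120 bytes, which is why buffer offsets scale by 5120 and extents come in multiples of 80 and 16. Sketched out (illustrative names, not from the patch):

    #include <cstdint>

    constexpr uint32_t kEdramTileWidthPx = 80;
    constexpr uint32_t kEdramTileHeightPx = 16;
    constexpr uint32_t kEdramBytesPerSample = 4;
    constexpr uint32_t kEdramTileSizeBytes =
        kEdramTileWidthPx * kEdramTileHeightPx * kEdramBytesPerSample;
    static_assert(kEdramTileSizeBytes == 5120, "the 5120 used in the copies");

    // Byte offset of a tile-aligned render target within the EDRAM buffer.
    constexpr uint64_t EdramByteOffset(uint32_t tile_offset) {
      return uint64_t(tile_offset) * kEdramTileSizeBytes;
    }

    // Pixel extent covered by a region measured in tiles.
    constexpr uint32_t EdramWidthPx(uint32_t tile_width) {
      return tile_width * kEdramTileWidthPx;
    }
    constexpr uint32_t EdramHeightPx(uint32_t tile_height) {
      return tile_height * kEdramTileHeightPx;
    }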
+ barrier.offset = region.bufferOffset; + barrier.size = depth_target->key.tile_width * 80 * + depth_target->key.tile_height * 16 * 4; + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, + &barrier, 0, nullptr); + + region.imageExtent = {depth_target->key.tile_width * 80u, + depth_target->key.tile_height * 16u, 1}; + vkCmdCopyBufferToImage(command_buffer, edram_buffer_, depth_target->image, + VK_IMAGE_LAYOUT_GENERAL, 1, ®ion); + } + + // Color region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; for (int i = 0; i < 4; i++) { auto target = current_state_.framebuffer->color_attachments[i]; - if (!target) { + if (!target || !current_state_.config.color[i].used) { continue; } @@ -572,27 +602,6 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, vkCmdCopyBufferToImage(command_buffer, edram_buffer_, target->image, VK_IMAGE_LAYOUT_GENERAL, 1, ®ion); } - - // Depth - auto depth_target = current_state_.framebuffer->depth_stencil_attachment; - if (depth_target) { - region.imageSubresource = { - VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1}; - region.bufferOffset = depth_target->key.tile_offset * 5120; - - // Wait for any potential copies to finish. - barrier.offset = region.bufferOffset; - barrier.size = depth_target->key.tile_width * 80 * - depth_target->key.tile_height * 16 * 4; - vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, - &barrier, 0, nullptr); - - region.imageExtent = {depth_target->key.tile_width * 80u, - depth_target->key.tile_height * 16u, 1}; - vkCmdCopyBufferToImage(command_buffer, edram_buffer_, depth_target->image, - VK_IMAGE_LAYOUT_GENERAL, 1, ®ion); - } } if (!render_pass) { return nullptr; @@ -809,10 +818,23 @@ void RenderCache::EndRenderPass() { region.bufferRowLength = 0; region.bufferImageHeight = 0; region.imageOffset = {0, 0, 0}; + // Depth/stencil + auto depth_target = current_state_.framebuffer->depth_stencil_attachment; + if (depth_target && current_state_.config.depth_stencil.used) { + region.imageSubresource = { + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1}; + region.bufferOffset = depth_target->key.tile_offset * 5120; + region.imageExtent = {depth_target->key.tile_width * 80u, + depth_target->key.tile_height * 16u, 1}; + vkCmdCopyImageToBuffer(current_command_buffer_, depth_target->image, + VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, ®ion); + } + + // Color region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; for (int i = 0; i < 4; i++) { auto target = current_state_.framebuffer->color_attachments[i]; - if (!target) { + if (!target || !current_state_.config.color[i].used) { continue; } @@ -823,18 +845,6 @@ void RenderCache::EndRenderPass() { VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, ®ion); } - // Depth/stencil - auto depth_target = current_state_.framebuffer->depth_stencil_attachment; - if (depth_target) { - region.imageSubresource = { - VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1}; - region.bufferOffset = depth_target->key.tile_offset * 5120; - region.imageExtent = {depth_target->key.tile_width * 80u, - depth_target->key.tile_height * 16u, 1}; - vkCmdCopyImageToBuffer(current_command_buffer_, depth_target->image, - VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, ®ion); - } - current_command_buffer_ = nullptr; } @@ -845,24 +855,27 @@ void RenderCache::ClearCache() { void 
RenderCache::RawCopyToImage(VkCommandBuffer command_buffer, uint32_t edram_base, VkImage image, VkImageLayout image_layout, - bool color_or_depth, int32_t offset_x, - int32_t offset_y, uint32_t width, - uint32_t height) { + bool color_or_depth, VkOffset3D offset, + VkExtent3D extents) { // Transition the texture into a transfer destination layout. VkImageMemoryBarrier image_barrier; image_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; image_barrier.pNext = nullptr; - image_barrier.srcAccessMask = 0; - image_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - image_barrier.oldLayout = image_layout; - image_barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - image_barrier.image = image; - image_barrier.subresourceRange = {0, 0, 1, 0, 1}; - image_barrier.subresourceRange.aspectMask = - color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT - : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + if (image_layout != VK_IMAGE_LAYOUT_GENERAL && + image_layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { + image_barrier.srcAccessMask = 0; + image_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + image_barrier.oldLayout = image_layout; + image_barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + image_barrier.image = image; + image_barrier.subresourceRange = {0, 0, 1, 0, 1}; + image_barrier.subresourceRange.aspectMask = + color_or_depth + ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + } VkBufferMemoryBarrier buffer_barrier; buffer_barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; @@ -872,7 +885,8 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer, buffer_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; buffer_barrier.buffer = edram_buffer_; buffer_barrier.offset = edram_base * 5120; - buffer_barrier.size = width * height * 4; // TODO: Calculate this accurately. + // TODO: Calculate this accurately (need texel size) + buffer_barrier.size = extents.width * extents.height * 4; vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, @@ -880,11 +894,11 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer, // Issue the copy command. VkBufferImageCopy region; - region.bufferImageHeight = 0; region.bufferOffset = edram_base * 5120; + region.bufferImageHeight = 0; region.bufferRowLength = 0; - region.imageExtent = {width, height, 1}; - region.imageOffset = {offset_x, offset_y, 0}; + region.imageOffset = offset; + region.imageExtent = extents; region.imageSubresource = {0, 0, 0, 1}; region.imageSubresource.aspectMask = color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT @@ -893,12 +907,15 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer, ®ion); // Transition the image back into its previous layout. 
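The conditional transition added here (skipped when the destination is already in GENERAL or TRANSFER_DST_OPTIMAL layout) and the mirror-image restore after the copy could also be packaged as a scope guard. A sketch of that alternative under the same assumptions as the patch (hypothetical helper, not part of the series; stage masks stay at TOP_OF_PIPE to match the code):

    #include <utility>
    #include <vulkan/vulkan.h>

    // Transitions an image into TRANSFER_DST_OPTIMAL on construction when
    // its current layout requires it, and restores the old layout on
    // destruction.
    class ScopedTransferDst {
     public:
      ScopedTransferDst(VkCommandBuffer cmd, VkImage image,
                        VkImageLayout layout, VkImageAspectFlags aspect)
          : cmd_(cmd),
            needed_(layout != VK_IMAGE_LAYOUT_GENERAL &&
                    layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
        barrier_ = {};
        barrier_.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barrier_.srcAccessMask = 0;
        barrier_.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
        barrier_.oldLayout = layout;
        barrier_.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        barrier_.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barrier_.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barrier_.image = image;
        barrier_.subresourceRange = {aspect, 0, 1, 0, 1};
        if (needed_) Emit();
      }
      ~ScopedTransferDst() {
        if (!needed_) return;
        // Swap the transition around to restore the previous layout.
        barrier_.srcAccessMask = barrier_.dstAccessMask;
        barrier_.dstAccessMask = 0;
        std::swap(barrier_.oldLayout, barrier_.newLayout);
        Emit();
      }

     private:
      void Emit() {
        vkCmdPipelineBarrier(cmd_, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                             VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr,
                             0, nullptr, 1, &barrier_);
      }
      VkCommandBuffer cmd_;
      bool needed_;
      VkImageMemoryBarrier barrier_;
    };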
- image_barrier.srcAccessMask = image_barrier.dstAccessMask; - image_barrier.dstAccessMask = 0; - std::swap(image_barrier.oldLayout, image_barrier.newLayout); - vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, - nullptr, 1, &image_barrier); + if (image_layout != VK_IMAGE_LAYOUT_GENERAL && + image_layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { + image_barrier.srcAccessMask = image_barrier.dstAccessMask; + image_barrier.dstAccessMask = 0; + std::swap(image_barrier.oldLayout, image_barrier.newLayout); + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &image_barrier); + } } bool RenderCache::SetShadowRegister(uint32_t* dest, uint32_t register_name) { diff --git a/src/xenia/gpu/vulkan/render_cache.h b/src/xenia/gpu/vulkan/render_cache.h index 13397bf1b..d1aad23de 100644 --- a/src/xenia/gpu/vulkan/render_cache.h +++ b/src/xenia/gpu/vulkan/render_cache.h @@ -82,11 +82,13 @@ struct RenderConfiguration { MsaaSamples surface_msaa; // Color attachments for the 4 render targets. struct { + bool used; uint32_t edram_base; ColorRenderTargetFormat format; } color[4]; // Depth/stencil attachment. struct { + bool used; uint32_t edram_base; DepthRenderTargetFormat format; } depth_stencil; @@ -262,8 +264,8 @@ class RenderCache { // Queues commands to copy EDRAM contents into an image. void RawCopyToImage(VkCommandBuffer command_buffer, uint32_t edram_base, VkImage image, VkImageLayout image_layout, - bool color_or_depth, int32_t offset_x, int32_t offset_y, - uint32_t width, uint32_t height); + bool color_or_depth, VkOffset3D offset, + VkExtent3D extents); private: // Parses the current state into a configuration object. @@ -309,6 +311,7 @@ class RenderCache { uint32_t rb_color1_info; uint32_t rb_color2_info; uint32_t rb_color3_info; + uint32_t rb_depthcontrol; uint32_t rb_depth_info; uint32_t pa_sc_window_scissor_tl; uint32_t pa_sc_window_scissor_br; From 5ba04b9e55d76eb82bbadc8fee5e4056743b3e53 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sat, 12 Mar 2016 11:49:59 -0600 Subject: [PATCH 13/77] RenderCache::ClearEDRAMColor/ClearEDRAMDepthStencil --- src/xenia/gpu/vulkan/render_cache.cc | 84 +++++++++++++++++++++++++++- src/xenia/gpu/vulkan/render_cache.h | 11 ++++ 2 files changed, 94 insertions(+), 1 deletion(-) diff --git a/src/xenia/gpu/vulkan/render_cache.cc b/src/xenia/gpu/vulkan/render_cache.cc index 727fa59e2..379b3893f 100644 --- a/src/xenia/gpu/vulkan/render_cache.cc +++ b/src/xenia/gpu/vulkan/render_cache.cc @@ -531,6 +531,7 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, return nullptr; } + // Speculatively see if targets are actually used so we can skip copies for (int i = 0; i < 4; i++) { config->color[i].used = pixel_shader->writes_color_target(i); } @@ -646,7 +647,6 @@ bool RenderCache::ParseConfiguration(RenderConfiguration* config) { // RB_SURFACE_INFO // http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html config->surface_pitch_px = regs.rb_surface_info & 0x3FFF; - // config->surface_height_px = (regs.rb_surface_info >> 18) & 0x3FFF; config->surface_msaa = static_cast((regs.rb_surface_info >> 16) & 0x3); @@ -814,6 +814,11 @@ void RenderCache::EndRenderPass() { // Don't bother waiting on this command to complete, as next render pass may // reuse previous framebuffer attachments. If they need this, they will wait. // TODO: Should we bother re-tiling the images on copy back? 
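For reference, the RB_DEPTHCONTROL bits consulted by the depth_stencil.used test above (and again by patch 14 below) decode like this; a small sketch of the mapping as this series uses it:

    #include <cstdint>

    // RB_DEPTHCONTROL, low bits as used in this series:
    //   bit 0 (0x1): stencil test enable
    //   bit 1 (0x2): depth test enable
    //   bit 2 (0x4): depth write enable
    // "used" above is therefore depth-write || depth-test.
    struct DepthControlBits {
      bool stencil_enable;
      bool z_enable;
      bool z_write_enable;
    };

    inline DepthControlBits DecodeDepthControl(uint32_t rb_depthcontrol) {
      return {(rb_depthcontrol & 0x1) != 0, (rb_depthcontrol & 0x2) != 0,
              (rb_depthcontrol & 0x4) != 0};
    }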
+ // + // FIXME: There's a case where we may have a really big render target (as we + // can't get the correct height atm) and we may end up overwriting the valid + // contents of another render target by mistake! Need to reorder copy commands + // to avoid this. VkBufferImageCopy region; region.bufferRowLength = 0; region.bufferImageHeight = 0; @@ -918,6 +923,83 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer, } } +void RenderCache::ClearEDRAMColor(VkCommandBuffer command_buffer, + uint32_t edram_base, + ColorRenderTargetFormat format, + uint32_t pitch, uint32_t height, + float* color) { + // Grab a tile view (as we need to clear an image first) + TileViewKey key; + key.color_or_depth = 1; + key.edram_format = static_cast(format); + key.tile_offset = edram_base; + key.tile_width = pitch / 80; + key.tile_height = height / 16; + auto tile_view = FindOrCreateTileView(command_buffer, key); + assert_not_null(tile_view); + + VkImageSubresourceRange range = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + VkClearColorValue clear_value; + std::memcpy(clear_value.float32, color, sizeof(float) * 4); + + // Issue a clear command + vkCmdClearColorImage(command_buffer, tile_view->image, + VK_IMAGE_LAYOUT_GENERAL, &clear_value, 1, &range); + + // Copy image back into EDRAM buffer + VkBufferImageCopy copy_range; + copy_range.bufferOffset = edram_base * 5120; + copy_range.bufferImageHeight = 0; + copy_range.bufferRowLength = 0; + copy_range.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; + copy_range.imageExtent = {key.tile_width * 80u, key.tile_height * 16u, 1u}; + copy_range.imageOffset = {0, 0, 0}; + vkCmdCopyImageToBuffer(command_buffer, tile_view->image, + VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, + ©_range); +} + +void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, + uint32_t edram_base, + DepthRenderTargetFormat format, + uint32_t pitch, uint32_t height, + float depth, uint32_t stencil) { + // Grab a tile view (as we need to clear an image first) + TileViewKey key; + key.color_or_depth = 0; + key.edram_format = static_cast(format); + key.tile_offset = edram_base; + key.tile_width = pitch / 80; + key.tile_height = height / 16; + auto tile_view = FindOrCreateTileView(command_buffer, key); + assert_not_null(tile_view); + + VkImageSubresourceRange range = { + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 1, 0, 1, + }; + VkClearDepthStencilValue clear_value; + clear_value.depth = depth; + clear_value.stencil = stencil; + + // Issue a clear command + vkCmdClearDepthStencilImage(command_buffer, tile_view->image, + VK_IMAGE_LAYOUT_GENERAL, &clear_value, 1, &range); + + // Copy image back into EDRAM buffer + VkBufferImageCopy copy_range; + copy_range.bufferOffset = edram_base * 5120; + copy_range.bufferImageHeight = 0; + copy_range.bufferRowLength = 0; + copy_range.imageSubresource = { + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1, + }; + copy_range.imageExtent = {key.tile_width * 80u, key.tile_height * 16u, 1u}; + copy_range.imageOffset = {0, 0, 0}; + vkCmdCopyImageToBuffer(command_buffer, tile_view->image, + VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, + ©_range); +} + bool RenderCache::SetShadowRegister(uint32_t* dest, uint32_t register_name) { uint32_t value = register_file_->values[register_name].u32; if (*dest == value) { diff --git a/src/xenia/gpu/vulkan/render_cache.h b/src/xenia/gpu/vulkan/render_cache.h index d1aad23de..4de9d0e72 100644 --- a/src/xenia/gpu/vulkan/render_cache.h +++ b/src/xenia/gpu/vulkan/render_cache.h 
@@ -12,6 +12,7 @@ #include "xenia/gpu/register_file.h" #include "xenia/gpu/shader.h" +#include "xenia/gpu/texture_info.h" #include "xenia/gpu/vulkan/vulkan_shader.h" #include "xenia/gpu/xenos.h" #include "xenia/ui/vulkan/vulkan.h" @@ -267,6 +268,16 @@ class RenderCache { bool color_or_depth, VkOffset3D offset, VkExtent3D extents); + // Queues commands to clear EDRAM contents with a solid color + void ClearEDRAMColor(VkCommandBuffer command_buffer, uint32_t edram_base, + ColorRenderTargetFormat format, uint32_t pitch, + uint32_t height, float* color); + // Queues commands to clear EDRAM contents with depth/stencil values. + void ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, + uint32_t edram_base, + DepthRenderTargetFormat format, uint32_t pitch, + uint32_t height, float depth, uint32_t stencil); + private: // Parses the current state into a configuration object. bool ParseConfiguration(RenderConfiguration* config); From 822d61c3d96b35faa6e94f0c78139aeea97c43aa Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sat, 12 Mar 2016 22:03:11 -0600 Subject: [PATCH 14/77] Fix a few stale data usage bugs in the pipeline cache. Hook up part of depth/stencil tests/writes --- src/xenia/gpu/vulkan/pipeline_cache.cc | 51 ++++++++++++++++++++------ 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc index 542329af5..b69aa0243 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/pipeline_cache.cc @@ -291,8 +291,8 @@ VkPipeline PipelineCache::GetPipeline(const RenderState* render_state, pipeline_info.basePipelineHandle = nullptr; pipeline_info.basePipelineIndex = 0; VkPipeline pipeline = nullptr; - auto err = vkCreateGraphicsPipelines(device_, nullptr, 1, &pipeline_info, - nullptr, &pipeline); + auto err = vkCreateGraphicsPipelines(device_, pipeline_cache_, 1, + &pipeline_info, nullptr, &pipeline); CheckResult(err, "vkCreateGraphicsPipelines"); // Add to cache with the hash key for reuse. @@ -338,6 +338,8 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, bool window_offset_dirty = SetShadowRegister(®s.pa_sc_window_offset, XE_GPU_REG_PA_SC_WINDOW_OFFSET); + window_offset_dirty |= SetShadowRegister(®s.pa_su_sc_mode_cntl, + XE_GPU_REG_PA_SU_SC_MODE_CNTL); // Window parameters. 
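The stale-data fixes in this patch all come down to ordering: the shadow copy of a register must be written before the state struct is hashed and before the early-out, or the hash trails reality by one update. A toy model of the corrected flow (stand-in hash, not the cache's actual code):

    #include <cstddef>
    #include <cstdint>

    struct PipelineRegs {
      uint32_t vertex_shader_handle;
      uint32_t pixel_shader_handle;
    };

    // FNV-1a as a stand-in for the XXH64 streaming hash the cache uses.
    uint64_t HashBytes(const void* data, size_t size) {
      const uint8_t* p = static_cast<const uint8_t*>(data);
      uint64_t h = 14695981039346656037ull;
      for (size_t i = 0; i < size; ++i) {
        h = (h ^ p[i]) * 1099511628211ull;
      }
      return h;
    }

    bool Update(PipelineRegs* regs, uint32_t vs, uint32_t ps,
                uint64_t* out_hash) {
      bool dirty = regs->vertex_shader_handle != vs;
      dirty |= regs->pixel_shader_handle != ps;
      // The fix: write the shadow copy first...
      regs->vertex_shader_handle = vs;
      regs->pixel_shader_handle = ps;
      // ...so the hash (and any early-out after it) sees current values.
      *out_hash = HashBytes(regs, sizeof(*regs));
      return dirty;
    }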
// http://ftp.tku.edu.tw/NetBSD/NetBSD-current/xsrc/external/mit/xf86-video-ati/dist/src/r600_reg_auto_r6xx.h @@ -660,13 +662,13 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( dirty |= regs.vertex_shader != vertex_shader; dirty |= regs.pixel_shader != pixel_shader; dirty |= regs.primitive_type != primitive_type; + regs.vertex_shader = vertex_shader; + regs.pixel_shader = pixel_shader; + regs.primitive_type = primitive_type; XXH64_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; } - regs.vertex_shader = vertex_shader; - regs.pixel_shader = pixel_shader; - regs.primitive_type = primitive_type; update_shader_stages_stage_count_ = 0; @@ -723,11 +725,11 @@ PipelineCache::UpdateStatus PipelineCache::UpdateVertexInputState( bool dirty = false; dirty |= vertex_shader != regs.vertex_shader; + regs.vertex_shader = vertex_shader; XXH64_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; } - regs.vertex_shader = vertex_shader; state_info.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; state_info.pNext = nullptr; @@ -843,11 +845,11 @@ PipelineCache::UpdateStatus PipelineCache::UpdateInputAssemblyState( XE_GPU_REG_PA_SU_SC_MODE_CNTL); dirty |= SetShadowRegister(®s.multi_prim_ib_reset_index, XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX); + regs.primitive_type = primitive_type; XXH64_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; } - regs.primitive_type = primitive_type; state_info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; @@ -1038,11 +1040,38 @@ PipelineCache::UpdateStatus PipelineCache::UpdateDepthStencilState() { state_info.pNext = nullptr; state_info.flags = 0; - state_info.depthTestEnable = VK_FALSE; - state_info.depthWriteEnable = VK_FALSE; - state_info.depthCompareOp = VK_COMPARE_OP_ALWAYS; + static const VkCompareOp compare_func_map[] = { + /* 0 */ VK_COMPARE_OP_NEVER, + /* 1 */ VK_COMPARE_OP_LESS, + /* 2 */ VK_COMPARE_OP_EQUAL, + /* 3 */ VK_COMPARE_OP_LESS_OR_EQUAL, + /* 4 */ VK_COMPARE_OP_GREATER, + /* 5 */ VK_COMPARE_OP_NOT_EQUAL, + /* 6 */ VK_COMPARE_OP_GREATER_OR_EQUAL, + /* 7 */ VK_COMPARE_OP_ALWAYS, + }; + static const VkStencilOp stencil_op_map[] = { + /* 0 */ VK_STENCIL_OP_KEEP, + /* 1 */ VK_STENCIL_OP_ZERO, + /* 2 */ VK_STENCIL_OP_REPLACE, + /* 3 */ VK_STENCIL_OP_INCREMENT_AND_WRAP, + /* 4 */ VK_STENCIL_OP_DECREMENT_AND_WRAP, + /* 5 */ VK_STENCIL_OP_INVERT, + /* 6 */ VK_STENCIL_OP_INCREMENT_AND_CLAMP, + /* 7 */ VK_STENCIL_OP_DECREMENT_AND_CLAMP, + }; + + // Depth state + // TODO: EARLY_Z_ENABLE (needs to be enabled in shaders) + state_info.depthWriteEnable = !!(regs.rb_depthcontrol & 0x4); + state_info.depthTestEnable = !!(regs.rb_depthcontrol & 0x2); + state_info.stencilTestEnable = !!(regs.rb_depthcontrol & 0x1); + + state_info.depthCompareOp = + compare_func_map[(regs.rb_depthcontrol & 0x70) >> 4]; state_info.depthBoundsTestEnable = VK_FALSE; - state_info.stencilTestEnable = VK_FALSE; + + // Stencil state state_info.front.failOp = VK_STENCIL_OP_KEEP; state_info.front.passOp = VK_STENCIL_OP_KEEP; state_info.front.depthFailOp = VK_STENCIL_OP_KEEP; From 245102e9e5fbf1bd8f68d51a8901f2866d25794e Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Sat, 12 Mar 2016 22:04:32 -0600 Subject: [PATCH 15/77] RenderCache::BlitToImage --- src/xenia/gpu/vulkan/render_cache.cc | 124 +++++++++++++++++++++++++-- src/xenia/gpu/vulkan/render_cache.h | 7 ++ 2 files changed, 125 insertions(+), 6 deletions(-) diff --git a/src/xenia/gpu/vulkan/render_cache.cc b/src/xenia/gpu/vulkan/render_cache.cc index 379b3893f..5047bff21 100644 --- a/src/xenia/gpu/vulkan/render_cache.cc +++ b/src/xenia/gpu/vulkan/render_cache.cc @@ -508,12 +508,12 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, dirty |= SetShadowRegister(®s.rb_color1_info, XE_GPU_REG_RB_COLOR1_INFO); dirty |= SetShadowRegister(®s.rb_color2_info, XE_GPU_REG_RB_COLOR2_INFO); dirty |= SetShadowRegister(®s.rb_color3_info, XE_GPU_REG_RB_COLOR3_INFO); - dirty |= SetShadowRegister(®s.rb_depthcontrol, XE_GPU_REG_RB_DEPTHCONTROL); dirty |= SetShadowRegister(®s.rb_depth_info, XE_GPU_REG_RB_DEPTH_INFO); dirty |= SetShadowRegister(®s.pa_sc_window_scissor_tl, XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL); dirty |= SetShadowRegister(®s.pa_sc_window_scissor_br, XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR); + regs.rb_depthcontrol = register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32; if (!dirty && current_state_.render_pass) { // No registers have changed so we can reuse the previous render pass - // just begin with what we had. @@ -880,6 +880,10 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer, color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &image_barrier); } VkBufferMemoryBarrier buffer_barrier; @@ -895,7 +899,7 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer, vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, - &buffer_barrier, 1, &image_barrier); + &buffer_barrier, 0, nullptr); // Issue the copy command. VkBufferImageCopy region; @@ -923,6 +927,114 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer, } } +void RenderCache::BlitToImage(VkCommandBuffer command_buffer, + uint32_t edram_base, uint32_t pitch, + uint32_t height, VkImage image, + VkImageLayout image_layout, bool color_or_depth, + uint32_t format, VkFilter filter, + VkOffset3D offset, VkExtent3D extents) { + // Grab a tile view that represents the source image. + TileViewKey key; + key.color_or_depth = color_or_depth ? 1 : 0; + key.edram_format = format; + key.tile_offset = edram_base; + key.tile_width = xe::round_up(pitch, 80) / 80; + key.tile_height = xe::round_up(height, 16) / 16; + auto tile_view = FindOrCreateTileView(command_buffer, key); + assert_not_null(tile_view); + + // Issue a memory barrier before we update this tile view. 
+ VkBufferMemoryBarrier buffer_barrier; + buffer_barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + buffer_barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + buffer_barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + buffer_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + buffer_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + buffer_barrier.buffer = edram_buffer_; + buffer_barrier.offset = edram_base * 5120; + // TODO: Calculate this accurately (need texel size) + buffer_barrier.size = extents.width * extents.height * 4; + + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, + &buffer_barrier, 0, nullptr); + + // Update the tile view with current EDRAM contents. + VkBufferImageCopy buffer_copy; + buffer_copy.bufferOffset = edram_base * 5120; + buffer_copy.bufferImageHeight = 0; + buffer_copy.bufferRowLength = 0; + buffer_copy.imageSubresource = {0, 0, 0, 1}; + buffer_copy.imageSubresource.aspectMask = + color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + buffer_copy.imageExtent = {key.tile_width * 80u, key.tile_height * 16u, 1u}; + buffer_copy.imageOffset = {0, 0, 0}; + vkCmdCopyBufferToImage(command_buffer, edram_buffer_, tile_view->image, + VK_IMAGE_LAYOUT_GENERAL, 1, &buffer_copy); + + // Transition the image into a transfer destination layout, if needed. + // TODO: Util function for this + VkImageMemoryBarrier image_barrier; + image_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + image_barrier.pNext = nullptr; + image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + if (image_layout != VK_IMAGE_LAYOUT_GENERAL && + image_layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { + image_barrier.srcAccessMask = 0; + image_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + image_barrier.oldLayout = image_layout; + image_barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + image_barrier.image = image; + image_barrier.subresourceRange = {0, 0, 1, 0, 1}; + image_barrier.subresourceRange.aspectMask = + color_or_depth + ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &image_barrier); + } + + // If we overflow we'll lose the device here. + assert_true(extents.width <= key.tile_width * 80u); + assert_true(extents.height <= key.tile_height * 16u); + + // Now issue the blit to the destination. + VkImageBlit image_blit; + image_blit.srcSubresource = {0, 0, 0, 1}; + image_blit.srcSubresource.aspectMask = + color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + image_blit.srcOffsets[0] = {0, 0, 0}; + image_blit.srcOffsets[1] = {int32_t(extents.width), int32_t(extents.height), + int32_t(extents.depth)}; + + image_blit.dstSubresource = {0, 0, 0, 1}; + image_blit.dstSubresource.aspectMask = + color_or_depth ? 
VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + image_blit.dstOffsets[0] = offset; + image_blit.dstOffsets[1] = {offset.x + int32_t(extents.width), + offset.y + int32_t(extents.height), + offset.z + int32_t(extents.depth)}; + vkCmdBlitImage(command_buffer, tile_view->image, VK_IMAGE_LAYOUT_GENERAL, + image, image_layout, 1, &image_blit, filter); + + // Transition the image back into its previous layout. + if (image_layout != VK_IMAGE_LAYOUT_GENERAL && + image_layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { + image_barrier.srcAccessMask = image_barrier.dstAccessMask; + image_barrier.dstAccessMask = 0; + std::swap(image_barrier.oldLayout, image_barrier.newLayout); + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &image_barrier); + } +} + void RenderCache::ClearEDRAMColor(VkCommandBuffer command_buffer, uint32_t edram_base, ColorRenderTargetFormat format, @@ -933,8 +1045,8 @@ void RenderCache::ClearEDRAMColor(VkCommandBuffer command_buffer, key.color_or_depth = 1; key.edram_format = static_cast(format); key.tile_offset = edram_base; - key.tile_width = pitch / 80; - key.tile_height = height / 16; + key.tile_width = xe::round_up(pitch, 80) / 80; + key.tile_height = xe::round_up(height, 16) / 16; auto tile_view = FindOrCreateTileView(command_buffer, key); assert_not_null(tile_view); @@ -969,8 +1081,8 @@ void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, key.color_or_depth = 0; key.edram_format = static_cast(format); key.tile_offset = edram_base; - key.tile_width = pitch / 80; - key.tile_height = height / 16; + key.tile_width = xe::round_up(pitch, 80) / 80; + key.tile_height = xe::round_up(height, 16) / 16; auto tile_view = FindOrCreateTileView(command_buffer, key); assert_not_null(tile_view); diff --git a/src/xenia/gpu/vulkan/render_cache.h b/src/xenia/gpu/vulkan/render_cache.h index 4de9d0e72..97816c365 100644 --- a/src/xenia/gpu/vulkan/render_cache.h +++ b/src/xenia/gpu/vulkan/render_cache.h @@ -268,6 +268,13 @@ class RenderCache { bool color_or_depth, VkOffset3D offset, VkExtent3D extents); + // Queues commands to blit EDRAM contents into an image. + void BlitToImage(VkCommandBuffer command_buffer, uint32_t edram_base, + uint32_t pitch, uint32_t height, VkImage image, + VkImageLayout image_layout, bool color_or_depth, + uint32_t format, VkFilter filter, VkOffset3D offset, + VkExtent3D extents); + // Queues commands to clear EDRAM contents with a solid color void ClearEDRAMColor(VkCommandBuffer command_buffer, uint32_t edram_base, ColorRenderTargetFormat format, uint32_t pitch, From 54f89825d978968250abdde12817f7e522fc618d Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Tue, 15 Mar 2016 00:30:39 -0500 Subject: [PATCH 16/77] SPIR-V Dp2Add/Dp3 --- src/xenia/gpu/spirv_shader_translator.cc | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 3f991baa8..28158ed20 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -773,6 +773,26 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( // TODO } break; + case AluVectorOpcode::kDp2Add: { + auto src0_xy = b.createOp(spv::Op::OpVectorShuffle, vec2_float_type_, + {sources[0], sources[0], 0, 1}); + auto src1_xy = b.createOp(spv::Op::OpVectorShuffle, vec2_float_type_, + {sources[1], sources[1], 0, 1}); + auto src2_x = b.createCompositeExtract(sources[2], float_type_, 0); + auto dot = b.createBinOp(spv::Op::OpDot, float_type_, src0_xy, src1_xy); + dest = b.createBinOp(spv::Op::OpFAdd, float_type_, dot, src2_x); + dest = b.smearScalar(spv::NoPrecision, dest, vec4_float_type_); + } break; + + case AluVectorOpcode::kDp3: { + auto src0_xyz = b.createOp(spv::Op::OpVectorShuffle, vec3_float_type_, + {sources[0], sources[0], 0, 1, 2}); + auto src1_xyz = b.createOp(spv::Op::OpVectorShuffle, vec3_float_type_, + {sources[1], sources[1], 0, 1, 2}); + auto dot = b.createBinOp(spv::Op::OpDot, float_type_, src0_xyz, src1_xyz); + dest = b.smearScalar(spv::NoPrecision, dot, vec4_float_type_); + } break; + case AluVectorOpcode::kDp4: { dest = b.createBinOp(spv::Op::OpDot, float_type_, sources[0], sources[1]); } break; @@ -1050,9 +1070,11 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( } break; default: + assert_unhandled_case(instr.vector_opcode); break; } + assert_not_zero(dest); if (dest) { // If predicated, discard the result from the instruction. Id pv_dest = dest; @@ -1477,9 +1499,11 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( } break; default: + assert_unhandled_case(instr.scalar_opcode); break; } + assert_not_zero(dest); if (dest) { // If predicated, discard the result from the instruction. Id ps_dest = dest; From 1831e7a936b0acd866a2771d0bbf7ddc27cfdcb4 Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Wed, 16 Mar 2016 14:45:40 -0500 Subject: [PATCH 17/77] Pipeline stencil state --- src/xenia/gpu/vulkan/pipeline_cache.cc | 32 ++++++++++++++++++-------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc index b69aa0243..ca7c37b46 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/pipeline_cache.cc @@ -1068,18 +1068,32 @@ PipelineCache::UpdateStatus PipelineCache::UpdateDepthStencilState() { state_info.stencilTestEnable = !!(regs.rb_depthcontrol & 0x1); state_info.depthCompareOp = - compare_func_map[(regs.rb_depthcontrol & 0x70) >> 4]; + compare_func_map[(regs.rb_depthcontrol >> 4) & 0x7]; state_info.depthBoundsTestEnable = VK_FALSE; + uint32_t stencil_ref = (regs.rb_stencilrefmask & 0x000000FF); + uint32_t stencil_read_mask = (regs.rb_stencilrefmask & 0x0000FF00) >> 8; + // Stencil state - state_info.front.failOp = VK_STENCIL_OP_KEEP; - state_info.front.passOp = VK_STENCIL_OP_KEEP; - state_info.front.depthFailOp = VK_STENCIL_OP_KEEP; - state_info.front.compareOp = VK_COMPARE_OP_ALWAYS; - state_info.back.failOp = VK_STENCIL_OP_KEEP; - state_info.back.passOp = VK_STENCIL_OP_KEEP; - state_info.back.depthFailOp = VK_STENCIL_OP_KEEP; - state_info.back.compareOp = VK_COMPARE_OP_ALWAYS; + state_info.front.compareOp = + compare_func_map[(regs.rb_depthcontrol >> 8) & 0x7]; + state_info.front.failOp = stencil_op_map[(regs.rb_depthcontrol >> 11) & 0x7]; + state_info.front.passOp = stencil_op_map[(regs.rb_depthcontrol >> 14) & 0x7]; + state_info.front.depthFailOp = + stencil_op_map[(regs.rb_depthcontrol >> 17) & 0x7]; + + // BACKFACE_ENABLE + if (!!(regs.rb_depthcontrol & 0x80)) { + state_info.back.compareOp = + compare_func_map[(regs.rb_depthcontrol >> 20) & 0x7]; + state_info.back.failOp = stencil_op_map[(regs.rb_depthcontrol >> 23) & 0x7]; + state_info.back.passOp = stencil_op_map[(regs.rb_depthcontrol >> 26) & 0x7]; + state_info.back.depthFailOp = + stencil_op_map[(regs.rb_depthcontrol >> 29) & 0x7]; + } else { + // Back state is identical to front state. + std::memcpy(&state_info.back, &state_info.front, sizeof(VkStencilOpState)); + } // Ignored; set dynamically. state_info.minDepthBounds = 0; From 0e3c113375bb5ba88a863e1127cfa17190b8f195 Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Thu, 17 Mar 2016 21:55:16 -0500 Subject: [PATCH 18/77] Physical write watches -> access watches (read and/or write watching) --- src/xenia/cpu/mmio_handler.cc | 157 ++++++++++++++++++----------- src/xenia/cpu/mmio_handler.h | 36 ++++--- src/xenia/gpu/gl4/texture_cache.cc | 55 +++++++--- src/xenia/gpu/gl4/texture_cache.h | 8 +- src/xenia/memory.cc | 25 +++-- src/xenia/memory.h | 11 +- 6 files changed, 189 insertions(+), 103 deletions(-) diff --git a/src/xenia/cpu/mmio_handler.cc b/src/xenia/cpu/mmio_handler.cc index e5412d8e7..3edd9703e 100644 --- a/src/xenia/cpu/mmio_handler.cc +++ b/src/xenia/cpu/mmio_handler.cc @@ -87,13 +87,12 @@ bool MMIOHandler::CheckStore(uint32_t virtual_address, uint32_t value) { return false; } -uintptr_t MMIOHandler::AddPhysicalWriteWatch(uint32_t guest_address, - size_t length, - WriteWatchCallback callback, - void* callback_context, - void* callback_data) { - uint32_t base_address = guest_address; - assert_true(base_address < 0x1FFFFFFF); +uintptr_t MMIOHandler::AddPhysicalAccessWatch(uint32_t guest_address, + size_t length, WatchType type, + AccessWatchCallback callback, + void* callback_context, + void* callback_data) { + uint32_t base_address = guest_address & 0x1FFFFFFF; // Can only protect sizes matching system page size. // This means we need to round up, which will cause spurious access @@ -103,32 +102,45 @@ uintptr_t MMIOHandler::AddPhysicalWriteWatch(uint32_t guest_address, xe::memory::page_size()); base_address = base_address - (base_address % xe::memory::page_size()); + auto lock = global_critical_region_.Acquire(); + // Add to table. The slot reservation may evict a previous watch, which // could include our target, so we do it first. - auto entry = new WriteWatchEntry(); + auto entry = new AccessWatchEntry(); entry->address = base_address; entry->length = uint32_t(length); entry->callback = callback; entry->callback_context = callback_context; entry->callback_data = callback_data; - global_critical_region_.mutex().lock(); - write_watches_.push_back(entry); - global_critical_region_.mutex().unlock(); + access_watches_.push_back(entry); - // Make the desired range read only under all address spaces. 
+ auto page_access = memory::PageAccess::kNoAccess; + switch (type) { + case kWatchWrite: + page_access = memory::PageAccess::kReadOnly; + break; + case kWatchReadWrite: + page_access = memory::PageAccess::kNoAccess; + break; + default: + assert_unhandled_case(type); + break; + } + + // Protect the range under all address spaces memory::Protect(physical_membase_ + entry->address, entry->length, - xe::memory::PageAccess::kReadOnly, nullptr); + page_access, nullptr); memory::Protect(virtual_membase_ + 0xA0000000 + entry->address, entry->length, - xe::memory::PageAccess::kReadOnly, nullptr); + page_access, nullptr); memory::Protect(virtual_membase_ + 0xC0000000 + entry->address, entry->length, - xe::memory::PageAccess::kReadOnly, nullptr); + page_access, nullptr); memory::Protect(virtual_membase_ + 0xE0000000 + entry->address, entry->length, - xe::memory::PageAccess::kReadOnly, nullptr); + page_access, nullptr); return reinterpret_cast(entry); } -void MMIOHandler::ClearWriteWatch(WriteWatchEntry* entry) { +void MMIOHandler::ClearAccessWatch(AccessWatchEntry* entry) { memory::Protect(physical_membase_ + entry->address, entry->length, xe::memory::PageAccess::kReadWrite, nullptr); memory::Protect(virtual_membase_ + 0xA0000000 + entry->address, entry->length, @@ -139,19 +151,20 @@ void MMIOHandler::ClearWriteWatch(WriteWatchEntry* entry) { xe::memory::PageAccess::kReadWrite, nullptr); } -void MMIOHandler::CancelWriteWatch(uintptr_t watch_handle) { - auto entry = reinterpret_cast(watch_handle); +void MMIOHandler::CancelAccessWatch(uintptr_t watch_handle) { + auto entry = reinterpret_cast(watch_handle); + auto lock = global_critical_region_.Acquire(); // Allow access to the range again. - ClearWriteWatch(entry); + ClearAccessWatch(entry); // Remove from table. - global_critical_region_.mutex().lock(); - auto it = std::find(write_watches_.begin(), write_watches_.end(), entry); - if (it != write_watches_.end()) { - write_watches_.erase(it); + auto it = std::find(access_watches_.begin(), access_watches_.end(), entry); + assert_false(it == access_watches_.end()); + + if (it != access_watches_.end()) { + access_watches_.erase(it); } - global_critical_region_.mutex().unlock(); delete entry; } @@ -159,18 +172,19 @@ void MMIOHandler::CancelWriteWatch(uintptr_t watch_handle) { void MMIOHandler::InvalidateRange(uint32_t physical_address, size_t length) { auto lock = global_critical_region_.Acquire(); - for (auto it = write_watches_.begin(); it != write_watches_.end();) { + for (auto it = access_watches_.begin(); it != access_watches_.end();) { auto entry = *it; if ((entry->address <= physical_address && entry->address + entry->length > physical_address) || (entry->address >= physical_address && entry->address < physical_address + length)) { // This watch lies within the range. End it. - ClearWriteWatch(entry); + ClearAccessWatch(entry); entry->callback(entry->callback_context, entry->callback_data, entry->address); - it = write_watches_.erase(it); + it = access_watches_.erase(it); + delete entry; continue; } @@ -178,50 +192,49 @@ void MMIOHandler::InvalidateRange(uint32_t physical_address, size_t length) { } } -bool MMIOHandler::CheckWriteWatch(uint64_t fault_address) { - uint32_t physical_address = uint32_t(fault_address); - if (physical_address > 0x1FFFFFFF) { - physical_address &= 0x1FFFFFFF; - } - std::list pending_invalidates; - global_critical_region_.mutex().lock(); - // Now that we hold the lock, recheck and see if the pages are still - // protected. 
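One detail the protection code above has to get right: the same physical page is reachable through the direct physical mapping and three virtual mirrors (0xA0000000, 0xC0000000, 0xE0000000), so all four host ranges must change protection together or the guest could slip past the watch through an alias. As a helper sketch (hypothetical; the patch inlines the four calls, and Protect below is a stub for xe::memory::Protect):

    #include <cstddef>
    #include <cstdint>

    enum class PageAccess { kNoAccess, kReadOnly, kReadWrite };

    bool Protect(void* base, size_t length, PageAccess access) {
      // Stub standing in for xe::memory::Protect(base, length, access,
      // nullptr).
      (void)base;
      (void)length;
      (void)access;
      return true;
    }

    // Applies one protection change to every alias of a guest physical
    // range.
    void ProtectAllAliases(uint8_t* physical_membase, uint8_t* virtual_membase,
                           uint32_t guest_address, size_t length,
                           PageAccess access) {
      Protect(physical_membase + guest_address, length, access);
      const uint32_t kVirtualMirrors[] = {0xA0000000u, 0xC0000000u,
                                          0xE0000000u};
      for (uint32_t mirror : kVirtualMirrors) {
        Protect(virtual_membase + mirror + guest_address, length, access);
      }
    }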
- memory::PageAccess cur_access; - size_t page_length = memory::page_size(); - memory::QueryProtect((void*)fault_address, page_length, cur_access); - if (cur_access != memory::PageAccess::kReadOnly && - cur_access != memory::PageAccess::kNoAccess) { - // Another thread has cleared this write watch. Abort. - global_critical_region_.mutex().unlock(); - return true; +bool MMIOHandler::IsRangeWatched(uint32_t physical_address, size_t length) { + auto lock = global_critical_region_.Acquire(); + + for (auto it = access_watches_.begin(); it != access_watches_.end(); ++it) { + auto entry = *it; + if ((entry->address <= physical_address && + entry->address + entry->length > physical_address) || + (entry->address >= physical_address && + entry->address < physical_address + length)) { + // This watch lies within the range. + return true; + } } - for (auto it = write_watches_.begin(); it != write_watches_.end();) { + return false; +} + +bool MMIOHandler::CheckAccessWatch(uint32_t physical_address) { + auto lock = global_critical_region_.Acquire(); + + bool hit = false; + for (auto it = access_watches_.begin(); it != access_watches_.end();) { auto entry = *it; if (entry->address <= physical_address && entry->address + entry->length > physical_address) { - // Hit! Remove the writewatch. - pending_invalidates.push_back(entry); + // Hit! Remove the watch. + hit = true; + ClearAccessWatch(entry); + entry->callback(entry->callback_context, entry->callback_data, + physical_address); - ClearWriteWatch(entry); - it = write_watches_.erase(it); + it = access_watches_.erase(it); + delete entry; continue; } ++it; } - global_critical_region_.mutex().unlock(); - if (pending_invalidates.empty()) { + + if (!hit) { // Rethrow access violation - range was not being watched. return false; } - while (!pending_invalidates.empty()) { - auto entry = pending_invalidates.back(); - pending_invalidates.pop_back(); - entry->callback(entry->callback_context, entry->callback_data, - physical_address); - delete entry; - } + // Range was watched, so lets eat this access violation. return true; } @@ -414,9 +427,33 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) { } } if (!range) { + auto fault_address = reinterpret_cast(ex->fault_address()); + uint32_t guest_address = 0; + if (fault_address >= virtual_membase_ && + fault_address < physical_membase_) { + // Faulting on a virtual address. + guest_address = static_cast(ex->fault_address()) & 0x1FFFFFFF; + } else { + // Faulting on a physical address. + guest_address = static_cast(ex->fault_address()); + } + + // HACK: Recheck if the pages are still protected (race condition - another + // thread clears the writewatch we just hit) + // Do this under the lock so we don't introduce another race condition. + auto lock = global_critical_region_.Acquire(); + memory::PageAccess cur_access; + size_t page_length = memory::page_size(); + memory::QueryProtect((void*)fault_address, page_length, cur_access); + if (cur_access != memory::PageAccess::kReadOnly && + cur_access != memory::PageAccess::kNoAccess) { + // Another thread has cleared this write watch. Abort. + return true; + } + // Access is not found within any range, so fail and let the caller handle // it (likely by aborting). 
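The fault handling added above recovers a guest physical address from the host fault address: faults landing between virtual_membase_ and physical_membase_ come from one of the virtual mirrors and need the 512 MiB mask, while physical-range faults translate directly. The same branch as a standalone sketch (illustrative signature):

    #include <cstdint>

    // The low 29 bits (0x1FFFFFFF) select within the 512 MiB physical space.
    uint32_t GuestAddressFromFault(uint64_t fault_address,
                                   const uint8_t* virtual_membase,
                                   const uint8_t* physical_membase) {
      auto host = reinterpret_cast<const uint8_t*>(fault_address);
      if (host >= virtual_membase && host < physical_membase) {
        // One of the 0xA0000000/0xC0000000/0xE0000000 virtual mirrors.
        return static_cast<uint32_t>(fault_address) & 0x1FFFFFFF;
      }
      // Direct physical mapping.
      return static_cast<uint32_t>(fault_address);
    }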
- return CheckWriteWatch(ex->fault_address()); + return CheckAccessWatch(guest_address); } auto rip = ex->pc(); diff --git a/src/xenia/cpu/mmio_handler.h b/src/xenia/cpu/mmio_handler.h index 70d89ac02..bb8cd665f 100644 --- a/src/xenia/cpu/mmio_handler.h +++ b/src/xenia/cpu/mmio_handler.h @@ -28,9 +28,8 @@ typedef uint32_t (*MMIOReadCallback)(void* ppc_context, void* callback_context, uint32_t addr); typedef void (*MMIOWriteCallback)(void* ppc_context, void* callback_context, uint32_t addr, uint32_t value); - -typedef void (*WriteWatchCallback)(void* context_ptr, void* data_ptr, - uint32_t address); +typedef void (*AccessWatchCallback)(void* context_ptr, void* data_ptr, + uint32_t address); struct MMIORange { uint32_t address; @@ -46,6 +45,12 @@ class MMIOHandler { public: virtual ~MMIOHandler(); + enum WatchType { + kWatchInvalid = 0, + kWatchWrite = 1, + kWatchReadWrite = 2, + }; + static std::unique_ptr Install(uint8_t* virtual_membase, uint8_t* physical_membase, uint8_t* membase_end); @@ -59,17 +64,24 @@ class MMIOHandler { bool CheckLoad(uint32_t virtual_address, uint32_t* out_value); bool CheckStore(uint32_t virtual_address, uint32_t value); - uintptr_t AddPhysicalWriteWatch(uint32_t guest_address, size_t length, - WriteWatchCallback callback, - void* callback_context, void* callback_data); - void CancelWriteWatch(uintptr_t watch_handle); + // Memory watches: These are one-shot alarms that fire a callback (in the + // context of the thread that caused the callback) when a memory range is + // either written to or read from, depending on the watch type. These fire as + // soon as a read/write happens, and only fire once. + // These watches may be spuriously fired if memory is accessed nearby. + uintptr_t AddPhysicalAccessWatch(uint32_t guest_address, size_t length, + WatchType type, AccessWatchCallback callback, + void* callback_context, void* callback_data); + void CancelAccessWatch(uintptr_t watch_handle); void InvalidateRange(uint32_t physical_address, size_t length); + bool IsRangeWatched(uint32_t physical_address, size_t length); protected: - struct WriteWatchEntry { + struct AccessWatchEntry { uint32_t address; uint32_t length; - WriteWatchCallback callback; + WatchType type; + AccessWatchCallback callback; void* callback_context; void* callback_data; }; @@ -83,8 +95,8 @@ class MMIOHandler { static bool ExceptionCallbackThunk(Exception* ex, void* data); bool ExceptionCallback(Exception* ex); - void ClearWriteWatch(WriteWatchEntry* entry); - bool CheckWriteWatch(uint64_t fault_address); + void ClearAccessWatch(AccessWatchEntry* entry); + bool CheckAccessWatch(uint32_t guest_address); uint8_t* virtual_membase_; uint8_t* physical_membase_; @@ -94,7 +106,7 @@ class MMIOHandler { xe::global_critical_region global_critical_region_; // TODO(benvanik): data structure magic. - std::list write_watches_; + std::list access_watches_; static MMIOHandler* global_handler_; }; diff --git a/src/xenia/gpu/gl4/texture_cache.cc b/src/xenia/gpu/gl4/texture_cache.cc index 4a8917e71..72e1c9639 100644 --- a/src/xenia/gpu/gl4/texture_cache.cc +++ b/src/xenia/gpu/gl4/texture_cache.cc @@ -427,7 +427,7 @@ TextureCache::TextureEntry* TextureCache::LookupOrInsertTexture( // Not found, create. auto entry = std::make_unique(); entry->texture_info = texture_info; - entry->write_watch_handle = 0; + entry->access_watch_handle = 0; entry->pending_invalidation = false; entry->handle = 0; @@ -442,6 +442,7 @@ TextureCache::TextureEntry* TextureCache::LookupOrInsertTexture( // Found! 
Acquire the handle and remove the readbuffer entry. read_buffer_textures_.erase(it); entry->handle = read_buffer_entry->handle; + entry->access_watch_handle = read_buffer_entry->access_watch_handle; delete read_buffer_entry; // TODO(benvanik): set more texture properties? swizzle/etc? auto entry_ptr = entry.get(); @@ -495,14 +496,15 @@ TextureCache::TextureEntry* TextureCache::LookupOrInsertTexture( // Add a write watch. If any data in the given range is touched we'll get a // callback and evict the texture. We could reuse the storage, though the // driver is likely in a better position to pool that kind of stuff. - entry->write_watch_handle = memory_->AddPhysicalWriteWatch( + entry->access_watch_handle = memory_->AddPhysicalAccessWatch( texture_info.guest_address, texture_info.input_length, + cpu::MMIOHandler::kWatchWrite, [](void* context_ptr, void* data_ptr, uint32_t address) { auto self = reinterpret_cast(context_ptr); auto touched_entry = reinterpret_cast(data_ptr); // Clear watch handle first so we don't redundantly // remove. - touched_entry->write_watch_handle = 0; + touched_entry->access_watch_handle = 0; touched_entry->pending_invalidation = true; // Add to pending list so Scavenge will clean it up. self->invalidated_textures_mutex_.lock(); @@ -574,14 +576,27 @@ GLuint TextureCache::ConvertTexture(Blitter* blitter, uint32_t guest_address, dest_rect, GL_LINEAR, swap_channels); } - // HACK: remove texture from write watch list so readback won't kill us. - // Not needed now, as readback is disabled. - /* - if (texture_entry->write_watch_handle) { - memory_->CancelWriteWatch(texture_entry->write_watch_handle); - texture_entry->write_watch_handle = 0; + // Setup a read/write access watch. If the game tries to touch the memory + // we were supposed to populate with this texture, then we'll actually + // populate it. + if (texture_entry->access_watch_handle) { + memory_->CancelAccessWatch(texture_entry->access_watch_handle); + texture_entry->access_watch_handle = 0; } - //*/ + + texture_entry->access_watch_handle = memory_->AddPhysicalAccessWatch( + guest_address, texture_entry->texture_info.input_length, + cpu::MMIOHandler::kWatchReadWrite, + [](void* context, void* data, uint32_t address) { + auto touched_entry = reinterpret_cast(data); + touched_entry->access_watch_handle = 0; + + // This happens. RDR resolves to a texture then upsizes it, BF1943 + // writes to a resolved texture. + // TODO (for Vulkan): Copy this texture back into system memory. + // assert_always(); + }, + nullptr, texture_entry); return texture_entry->handle; } @@ -618,6 +633,20 @@ GLuint TextureCache::ConvertTexture(Blitter* blitter, uint32_t guest_address, entry->block_height = block_height; entry->format = format; + entry->access_watch_handle = memory_->AddPhysicalAccessWatch( + guest_address, block_height * block_width * 4, + cpu::MMIOHandler::kWatchReadWrite, + [](void* context, void* data, uint32_t address) { + auto entry = reinterpret_cast(data); + entry->access_watch_handle = 0; + + // This happens. RDR resolves to a texture then upsizes it, BF1943 + // writes to a resolved texture. + // TODO (for Vulkan): Copy this texture back into system memory. 
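The read/write watch installed above is the hook for CPU readback of resolved render targets: after a resolve the real pixels live only in the GPU image, so guest memory stays stale until an access proves the game actually cares. A sketch of the intended lifecycle (the copy-back step itself is still the TODO above, so this is inferred intent, not shipped behavior):

    // Lifecycle of a resolved texture's guest memory, as implied above:
    //   1. Resolve: pixels live in the GPU image; arm a kWatchReadWrite
    //      watch on the guest range.
    //   2. Guest touches the range; the one-shot fault callback fires and
    //      the stored handle is cleared.
    //   3. Callback should copy the image back into guest memory (TODO),
    //      after which the range is coherent again.
    enum class ResolvedTextureState {
      kGpuOnly,      // watch armed; guest memory stale
      kCopyPending,  // watch fired; copy-back queued
      kCoherent,     // guest memory valid again
    };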
+ // assert_always(); + }, + nullptr, entry.get()); + glCreateTextures(GL_TEXTURE_2D, 1, &entry->handle); glTextureParameteri(entry->handle, GL_TEXTURE_BASE_LEVEL, 0); glTextureParameteri(entry->handle, GL_TEXTURE_MAX_LEVEL, 1); @@ -636,9 +665,9 @@ GLuint TextureCache::ConvertTexture(Blitter* blitter, uint32_t guest_address, } void TextureCache::EvictTexture(TextureEntry* entry) { - if (entry->write_watch_handle) { - memory_->CancelWriteWatch(entry->write_watch_handle); - entry->write_watch_handle = 0; + if (entry->access_watch_handle) { + memory_->CancelAccessWatch(entry->access_watch_handle); + entry->access_watch_handle = 0; } for (auto& view : entry->views) { diff --git a/src/xenia/gpu/gl4/texture_cache.h b/src/xenia/gpu/gl4/texture_cache.h index d214dac53..d55aa37a1 100644 --- a/src/xenia/gpu/gl4/texture_cache.h +++ b/src/xenia/gpu/gl4/texture_cache.h @@ -44,7 +44,7 @@ class TextureCache { }; struct TextureEntry { TextureInfo texture_info; - uintptr_t write_watch_handle; + uintptr_t access_watch_handle; GLuint handle; bool pending_invalidation; std::vector> views; @@ -74,8 +74,12 @@ class TextureCache { TextureFormat format, bool swap_channels, GLuint src_texture, Rect2D src_rect, Rect2D dest_rect); + TextureEntry* LookupAddress(uint32_t guest_address, uint32_t width, + uint32_t height, TextureFormat format); + private: struct ReadBufferTexture { + uintptr_t access_watch_handle; uint32_t guest_address; uint32_t logical_width; uint32_t logical_height; @@ -90,8 +94,6 @@ class TextureCache { void EvictSampler(SamplerEntry* entry); TextureEntry* LookupOrInsertTexture(const TextureInfo& texture_info, uint64_t opt_hash = 0); - TextureEntry* LookupAddress(uint32_t guest_address, uint32_t width, - uint32_t height, TextureFormat format); void EvictTexture(TextureEntry* entry); bool UploadTexture2D(GLuint texture, const TextureInfo& texture_info); diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc index d7507df23..5dcf5bfa8 100644 --- a/src/xenia/memory.cc +++ b/src/xenia/memory.cc @@ -376,17 +376,19 @@ cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) { return mmio_handler_->LookupRange(virtual_address); } -uintptr_t Memory::AddPhysicalWriteWatch(uint32_t physical_address, - uint32_t length, - cpu::WriteWatchCallback callback, - void* callback_context, - void* callback_data) { - return mmio_handler_->AddPhysicalWriteWatch( - physical_address, length, callback, callback_context, callback_data); +uintptr_t Memory::AddPhysicalAccessWatch(uint32_t physical_address, + uint32_t length, + cpu::MMIOHandler::WatchType type, + cpu::AccessWatchCallback callback, + void* callback_context, + void* callback_data) { + return mmio_handler_->AddPhysicalAccessWatch(physical_address, length, type, + callback, callback_context, + callback_data); } -void Memory::CancelWriteWatch(uintptr_t watch_handle) { - mmio_handler_->CancelWriteWatch(watch_handle); +void Memory::CancelAccessWatch(uintptr_t watch_handle) { + mmio_handler_->CancelAccessWatch(watch_handle); } uint32_t Memory::SystemHeapAlloc(uint32_t size, uint32_t alignment, @@ -453,6 +455,7 @@ bool Memory::Save(ByteStream* stream) { } bool Memory::Restore(ByteStream* stream) { + XELOGD("Restoring memory..."); heaps_.v00000000.Restore(stream); heaps_.v40000000.Restore(stream); heaps_.v80000000.Restore(stream); @@ -577,6 +580,8 @@ bool BaseHeap::Save(ByteStream* stream) { } bool BaseHeap::Restore(ByteStream* stream) { + XELOGD("Heap %.8X-%.8X", heap_base_, heap_base_ + heap_size_); + for (size_t i = 0; i < page_table_.size(); i++) { 
auto& page = page_table_[i]; page.qword = stream->Read(); @@ -897,7 +902,7 @@ bool BaseHeap::Release(uint32_t base_address, uint32_t* out_region_size) { auto base_page_entry = page_table_[base_page_number]; if (base_page_entry.base_address != base_page_number) { XELOGE("BaseHeap::Release failed because address is not a region start"); - // return false; + return false; } if (out_region_size) { diff --git a/src/xenia/memory.h b/src/xenia/memory.h index 6a0fc9c5d..e27976de2 100644 --- a/src/xenia/memory.h +++ b/src/xenia/memory.h @@ -303,12 +303,13 @@ class Memory { // // This has a significant performance penalty for writes in in the range or // nearby (sharing 64KiB pages). - uintptr_t AddPhysicalWriteWatch(uint32_t physical_address, uint32_t length, - cpu::WriteWatchCallback callback, - void* callback_context, void* callback_data); + uintptr_t AddPhysicalAccessWatch(uint32_t physical_address, uint32_t length, + cpu::MMIOHandler::WatchType type, + cpu::AccessWatchCallback callback, + void* callback_context, void* callback_data); - // Cancels a write watch requested with AddPhysicalWriteWatch. - void CancelWriteWatch(uintptr_t watch_handle); + // Cancels a write watch requested with AddPhysicalAccessWatch. + void CancelAccessWatch(uintptr_t watch_handle); // Allocates virtual memory from the 'system' heap. // System memory is kept separate from game memory but is still accessible From 2512a6360eec4770874e994a0292b980bbf9c61a Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Thu, 17 Mar 2016 21:55:47 -0500 Subject: [PATCH 19/77] Pass the physical frontbuffer address into the CP --- src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc index e979cb62a..208473cf2 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc @@ -366,7 +366,7 @@ void VdSwap(lpvoid_t buffer_ptr, // ptr into primary ringbuffer auto dwords = buffer_ptr.as_array(); dwords[0] = xenos::MakePacketType3(); dwords[1] = 'SWAP'; - dwords[2] = *frontbuffer_ptr; + dwords[2] = (*frontbuffer_ptr) & 0x1FFFFFFF; // Set by VdCallGraphicsNotificationRoutines. dwords[3] = last_frontbuffer_width_; From 38b94dd9e2ae355971ae2881ff065ffca0ed6b2b Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Thu, 17 Mar 2016 21:58:23 -0500 Subject: [PATCH 20/77] Add in Xenos events --- src/xenia/gpu/xenos.h | 45 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 65c1f0bad..32c33cae8 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -49,6 +49,7 @@ enum class PrimitiveType : uint32_t { kLineLoop = 0x0C, kQuadList = 0x0D, kQuadStrip = 0x0E, + kUnknown0x11 = 0x11, }; enum class Dimension : uint32_t { @@ -382,7 +383,7 @@ XEPACKEDUNION(xe_gpu_vertex_fetch_t, { uint32_t type : 2; uint32_t address : 30; uint32_t endian : 2; - uint32_t size : 24; + uint32_t size : 24; // size in words uint32_t unk1 : 6; }); XEPACKEDSTRUCTANONYMOUS({ @@ -486,6 +487,46 @@ XEPACKEDUNION(xe_gpu_fetch_group_t, { }); }); +enum Event { + SAMPLE_STREAMOUTSTATS1 = (1 << 0), + SAMPLE_STREAMOUTSTATS2 = (2 << 0), + SAMPLE_STREAMOUTSTATS3 = (3 << 0), + CACHE_FLUSH_TS = (4 << 0), + CACHE_FLUSH = (6 << 0), + CS_PARTIAL_FLUSH = (7 << 0), + VGT_STREAMOUT_RESET = (10 << 0), + END_OF_PIPE_INCR_DE = (11 << 0), + END_OF_PIPE_IB_END = (12 << 0), + RST_PIX_CNT = (13 << 0), + VS_PARTIAL_FLUSH = (15 << 0), + PS_PARTIAL_FLUSH = (16 << 0), + CACHE_FLUSH_AND_INV_TS_EVENT = (20 << 0), + ZPASS_DONE = (21 << 0), + CACHE_FLUSH_AND_INV_EVENT = (22 << 0), + PERFCOUNTER_START = (23 << 0), + PERFCOUNTER_STOP = (24 << 0), + PIPELINESTAT_START = (25 << 0), + PIPELINESTAT_STOP = (26 << 0), + PERFCOUNTER_SAMPLE = (27 << 0), + SAMPLE_PIPELINESTAT = (30 << 0), + SAMPLE_STREAMOUTSTATS = (32 << 0), + RESET_VTX_CNT = (33 << 0), + VGT_FLUSH = (36 << 0), + BOTTOM_OF_PIPE_TS = (40 << 0), + DB_CACHE_FLUSH_AND_INV = (42 << 0), + FLUSH_AND_INV_DB_DATA_TS = (43 << 0), + FLUSH_AND_INV_DB_META = (44 << 0), + FLUSH_AND_INV_CB_DATA_TS = (45 << 0), + FLUSH_AND_INV_CB_META = (46 << 0), + CS_DONE = (47 << 0), + PS_DONE = (48 << 0), + FLUSH_AND_INV_CB_PIXEL_DATA = (49 << 0), + THREAD_TRACE_START = (51 << 0), + THREAD_TRACE_STOP = (52 << 0), + THREAD_TRACE_FLUSH = (54 << 0), + THREAD_TRACE_FINISH = (55 << 0), +}; + // Opcodes (IT_OPCODE) for Type-3 commands in the ringbuffer. // https://github.com/freedreno/amd-gpu/blob/master/include/api/gsl_pm4types.h // Not sure if all of these are used. @@ -501,7 +542,7 @@ enum Type3Opcode { PM4_WAIT_FOR_IDLE = 0x26, // wait for the IDLE state of the engine PM4_WAIT_REG_MEM = 0x3c, // wait until a register or memory location is a specific value PM4_WAIT_REG_EQ = 0x52, // wait until a register location is equal to a specific value - PM4_WAT_REG_GTE = 0x53, // wait until a register location is >= a specific value + PM4_WAIT_REG_GTE = 0x53, // wait until a register location is >= a specific value PM4_WAIT_UNTIL_READ = 0x5c, // wait until a read completes PM4_WAIT_IB_PFD_COMPLETE = 0x5d, // wait until all base/size writes from an IB_PFD packet have completed From 7b962e59a4f3ef0d41e89d1a6cf216f1e61d86ea Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Sun, 20 Mar 2016 14:21:55 -0500 Subject: [PATCH 21/77] SPIR-V Dst Fix a few bugs in the translator --- src/xenia/gpu/spirv_shader_translator.cc | 73 +++++++++++++++++------- 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 28158ed20..f7a1660fb 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -164,6 +164,7 @@ void SpirvShaderTranslator::StartTranslation() { push_constants_type, "push_consts"); // Texture bindings + Id sampler_t = b.makeSamplerType(); Id tex_t[] = {b.makeSampledImageType(b.makeImageType( float_type_, spv::Dim::Dim1D, false, false, false, 1, spv::ImageFormat::ImageFormatUnknown)), @@ -177,18 +178,17 @@ void SpirvShaderTranslator::StartTranslation() { float_type_, spv::Dim::DimCube, false, false, false, 1, spv::ImageFormat::ImageFormatUnknown))}; - // Id samplers_a = b.makeArrayType(sampler_t, b.makeUintConstant(32), 0); + Id samplers_a = b.makeArrayType(sampler_t, b.makeUintConstant(32), 0); Id tex_a_t[] = {b.makeArrayType(tex_t[0], b.makeUintConstant(32), 0), b.makeArrayType(tex_t[1], b.makeUintConstant(32), 0), b.makeArrayType(tex_t[2], b.makeUintConstant(32), 0), b.makeArrayType(tex_t[3], b.makeUintConstant(32), 0)}; // TODO(DrChat): See texture_cache.cc - do we need separate samplers here? - // samplers_ = - // b.createVariable(spv::StorageClass::StorageClassUniformConstant, - // samplers_a, "samplers"); - // b.addDecoration(samplers_, spv::Decoration::DecorationDescriptorSet, 1); - // b.addDecoration(samplers_, spv::Decoration::DecorationBinding, 0); + samplers_ = b.createVariable(spv::StorageClass::StorageClassUniformConstant, + samplers_a, "samplers"); + b.addDecoration(samplers_, spv::Decoration::DecorationDescriptorSet, 1); + b.addDecoration(samplers_, spv::Decoration::DecorationBinding, 0); for (int i = 0; i < 4; i++) { tex_[i] = b.createVariable(spv::StorageClass::StorageClassUniformConstant, tex_a_t[i], @@ -481,16 +481,17 @@ void SpirvShaderTranslator::ProcessExecInstructionBegin( // Conditional branch assert_true(cf_blocks_.size() > instr.dword_index + 1); body = &b.makeNewBlock(); - auto cond = b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, v, - b.makeBoolConstant(instr.condition)); + auto cond = b.createBinOp(spv::Op::OpIEqual, bool_type_, v, + b.makeUintConstant(uint32_t(instr.condition))); b.createConditionalBranch(cond, body, cf_blocks_[instr.dword_index + 1]); } break; case ParsedExecInstruction::Type::kPredicated: { // Branch based on p0. assert_true(cf_blocks_.size() > instr.dword_index + 1); body = &b.makeNewBlock(); - auto cond = b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, p0_, - b.makeBoolConstant(instr.condition)); + auto cond = + b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_), + b.makeBoolConstant(instr.condition)); b.createConditionalBranch(cond, body, cf_blocks_[instr.dword_index + 1]); } break; } @@ -545,6 +546,8 @@ void SpirvShaderTranslator::ProcessCallInstruction( auto head = cf_blocks_[instr.dword_index]; b.setBuildPoint(head); + // Unused instruction(?) + assert_always(); EmitUnimplementedTranslationError(); assert_true(cf_blocks_.size() > instr.dword_index + 1); @@ -558,6 +561,8 @@ void SpirvShaderTranslator::ProcessReturnInstruction( auto head = cf_blocks_[instr.dword_index]; b.setBuildPoint(head); + // Unused instruction(?) 
+  assert_always();
   EmitUnimplementedTranslationError();
 
   assert_true(cf_blocks_.size() > instr.dword_index + 1);
@@ -576,6 +581,8 @@ void SpirvShaderTranslator::ProcessJumpInstruction(
       b.createBranch(cf_blocks_[instr.target_address]);
     } break;
     case ParsedJumpInstruction::Type::kConditional: {
+      assert_true(cf_blocks_.size() > instr.dword_index + 1);
+
       // Based off of bool_consts
       std::vector<Id> offsets;
       offsets.push_back(b.makeUintConstant(2));  // bool_consts
@@ -590,17 +597,19 @@ void SpirvShaderTranslator::ProcessJumpInstruction(
                            b.makeUintConstant(1));
 
       // Conditional branch
-      auto cond = b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, v,
-                                b.makeBoolConstant(instr.condition));
+      auto cond = b.createBinOp(spv::Op::OpIEqual, bool_type_, v,
+                                b.makeUintConstant(uint32_t(instr.condition)));
       b.createConditionalBranch(cond, cf_blocks_[instr.target_address],
-                                cf_blocks_[instr.dword_index]);
+                                cf_blocks_[instr.dword_index + 1]);
     } break;
     case ParsedJumpInstruction::Type::kPredicated: {
       assert_true(cf_blocks_.size() > instr.dword_index + 1);
+
+      auto cond =
+          b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_),
+                        b.makeBoolConstant(instr.condition));
-      auto cond = b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, p0_,
-                                b.makeBoolConstant(instr.condition));
       b.createConditionalBranch(cond, cf_blocks_[instr.target_address],
-                                cf_blocks_[instr.dword_index]);
+                                cf_blocks_[instr.dword_index + 1]);
     } break;
   }
 }
@@ -770,7 +779,15 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction(
     } break;
 
     case AluVectorOpcode::kDst: {
-      // TODO
+      // dst = (1, src0.y * src1.y, src0.z, src1.w)
+      auto src0_y = b.createCompositeExtract(sources[0], float_type_, 1);
+      auto src1_y = b.createCompositeExtract(sources[1], float_type_, 1);
+      auto dst_y = b.createBinOp(spv::Op::OpFMul, float_type_, src0_y, src1_y);
+
+      auto src0_z = b.createCompositeExtract(sources[0], float_type_, 2);
+      auto src1_w = b.createCompositeExtract(sources[1], float_type_, 3);
+      dest = b.createCompositeConstruct(
+          vec4_float_type_,
+          std::vector<Id>({b.makeFloatConstant(1.f), dst_y, src0_z, src1_w}));
     } break;
 
     case AluVectorOpcode::kDp2Add: {
@@ -1175,7 +1192,10 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
       auto kill_block = &b.makeNewBlock();
       auto cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, sources[0],
                                 b.makeFloatConstant(0.f));
-      cond = b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond);
+      if (pred_cond) {
+        cond =
+            b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond);
+      }
       b.createConditionalBranch(cond, kill_block, continue_block);
 
       b.setBuildPoint(kill_block);
@@ -1348,6 +1368,12 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
                                 b.makeFloatConstant(0.f), d);
     } break;
 
+    case AluScalarOpcode::kRsqc: {
+    } break;
+
+    case AluScalarOpcode::kRsqf: {
+    } break;
+
     case AluScalarOpcode::kRsq: {
       // dest = src0 != 0.0 ? inversesqrt(src0) : 0.0;
       auto c = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, sources[0],
@@ -1430,12 +1456,10 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
     } break;
 
     case AluScalarOpcode::kSetpInv: {
+      // p0 = src0 == 1.0
      auto cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, sources[0],
                                 b.makeFloatConstant(1.f));
-      auto pred =
-          b.createTriOp(spv::Op::OpSelect, bool_type_, cond,
-                        b.makeBoolConstant(true), b.makeBoolConstant(false));
-      b.createStore(pred, p0_);
+      b.createStore(cond, p0_);
 
       // if (!cond) dest = src0 == 0.0 ?
1.0 : src0; auto dst_cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, @@ -1482,6 +1506,11 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( GLSLstd450::kSin, {sources[0]}); } break; + case AluScalarOpcode::kSqrt: { + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_, + GLSLstd450::kSqrt, {sources[0]}); + } break; + case AluScalarOpcode::kSubs: case AluScalarOpcode::kSubsc0: case AluScalarOpcode::kSubsc1: { From e72e283e79c664ec55055b4ba04f712038a4569e Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Wed, 23 Mar 2016 16:19:18 -0500 Subject: [PATCH 22/77] Primitive type makes rasterization state dirty too! --- src/xenia/gpu/vulkan/pipeline_cache.cc | 8 ++++++++ src/xenia/gpu/vulkan/pipeline_cache.h | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc index ca7c37b46..ee1174a72 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/pipeline_cache.cc @@ -936,6 +936,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( auto& state_info = update_rasterization_state_info_; bool dirty = false; + dirty |= regs.primitive_type != primitive_type; dirty |= SetShadowRegister(®s.pa_su_sc_mode_cntl, XE_GPU_REG_PA_SU_SC_MODE_CNTL); dirty |= SetShadowRegister(®s.pa_sc_screen_scissor_tl, @@ -944,6 +945,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR); dirty |= SetShadowRegister(®s.multi_prim_ib_reset_index, XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX); + regs.primitive_type = primitive_type; XXH64_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; @@ -983,6 +985,10 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( case 2: state_info.cullMode = VK_CULL_MODE_BACK_BIT; break; + case 3: + // Cull both sides? + assert_always(); + break; } if (regs.pa_su_sc_mode_cntl & 0x4) { state_info.frontFace = VK_FRONT_FACE_CLOCKWISE; @@ -1013,6 +1019,8 @@ PipelineCache::UpdateStatus PipelineCache::UpdateMultisampleState() { state_info.pNext = nullptr; state_info.flags = 0; + // PA_SC_AA_CONFIG MSAA_NUM_SAMPLES + // PA_SU_SC_MODE_CNTL MSAA_ENABLE state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; state_info.sampleShadingEnable = VK_FALSE; state_info.minSampleShading = 0; diff --git a/src/xenia/gpu/vulkan/pipeline_cache.h b/src/xenia/gpu/vulkan/pipeline_cache.h index 3e623f14e..b33c030ed 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.h +++ b/src/xenia/gpu/vulkan/pipeline_cache.h @@ -205,11 +205,11 @@ class PipelineCache { VkPipelineViewportStateCreateInfo update_viewport_state_info_; struct UpdateRasterizationStateRegisters { + PrimitiveType primitive_type; uint32_t pa_su_sc_mode_cntl; uint32_t pa_sc_screen_scissor_tl; uint32_t pa_sc_screen_scissor_br; uint32_t multi_prim_ib_reset_index; - PrimitiveType prim_type; UpdateRasterizationStateRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } From 181b2af5a4c0ffa2124e97d6a6e512705ec76fc7 Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Fri, 25 Mar 2016 13:45:44 -0500 Subject: [PATCH 23/77] Vulkan Circular Buffer --- src/xenia/ui/vulkan/circular_buffer.cc | 258 +++++++++++++++++++++++++ src/xenia/ui/vulkan/circular_buffer.h | 85 ++++++++ 2 files changed, 343 insertions(+) create mode 100644 src/xenia/ui/vulkan/circular_buffer.cc create mode 100644 src/xenia/ui/vulkan/circular_buffer.h diff --git a/src/xenia/ui/vulkan/circular_buffer.cc b/src/xenia/ui/vulkan/circular_buffer.cc new file mode 100644 index 000000000..4cc22366f --- /dev/null +++ b/src/xenia/ui/vulkan/circular_buffer.cc @@ -0,0 +1,258 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2015 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include "xenia/base/assert.h" +#include "xenia/base/logging.h" +#include "xenia/base/math.h" + +#include "xenia/ui/vulkan/circular_buffer.h" + +namespace xe { +namespace ui { +namespace vulkan { + +CircularBuffer::CircularBuffer(VulkanDevice* device) : device_(device) {} +CircularBuffer::~CircularBuffer() { Shutdown(); } + +bool CircularBuffer::Initialize(VkDeviceSize capacity, VkBufferUsageFlags usage, + VkDeviceSize alignment) { + VkResult status = VK_SUCCESS; + capacity = xe::round_up(capacity, alignment); + + // Create our internal buffer. + VkBufferCreateInfo buffer_info; + buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + buffer_info.pNext = nullptr; + buffer_info.flags = 0; + buffer_info.size = capacity; + buffer_info.usage = usage; + buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + buffer_info.queueFamilyIndexCount = 0; + buffer_info.pQueueFamilyIndices = nullptr; + status = vkCreateBuffer(*device_, &buffer_info, nullptr, &gpu_buffer_); + CheckResult(status, "vkCreateBuffer"); + if (status != VK_SUCCESS) { + return false; + } + + VkMemoryRequirements reqs; + vkGetBufferMemoryRequirements(*device_, gpu_buffer_, &reqs); + + // Allocate memory from the device to back the buffer. + assert_true(reqs.size == capacity); + reqs.alignment = std::max(alignment, reqs.alignment); + gpu_memory_ = device_->AllocateMemory(reqs); + if (!gpu_memory_) { + XELOGE("CircularBuffer::Initialize - Failed to allocate memory!"); + Shutdown(); + return false; + } + + alignment_ = reqs.alignment; + capacity_ = reqs.size; + gpu_base_ = 0; + + // Bind the buffer to its backing memory. + status = vkBindBufferMemory(*device_, gpu_buffer_, gpu_memory_, gpu_base_); + CheckResult(status, "vkBindBufferMemory"); + if (status != VK_SUCCESS) { + XELOGE("CircularBuffer::Initialize - Failed to bind memory!"); + Shutdown(); + return false; + } + + // Map the memory so we can access it. 
+ status = vkMapMemory(*device_, gpu_memory_, gpu_base_, capacity_, 0, + reinterpret_cast(&host_base_)); + CheckResult(status, "vkMapMemory"); + if (status != VK_SUCCESS) { + XELOGE("CircularBuffer::Initialize - Failed to map memory!"); + Shutdown(); + return false; + } + + return true; +} + +void CircularBuffer::Shutdown() { + Clear(); + if (host_base_) { + vkUnmapMemory(*device_, gpu_memory_); + host_base_ = nullptr; + } + if (gpu_buffer_) { + vkDestroyBuffer(*device_, gpu_buffer_, nullptr); + gpu_buffer_ = nullptr; + } + if (gpu_memory_) { + vkFreeMemory(*device_, gpu_memory_, nullptr); + gpu_memory_ = nullptr; + } +} + +bool CircularBuffer::CanAcquire(VkDeviceSize length) { + // Make sure the length is aligned. + length = xe::round_up(length, alignment_); + if (allocations_.empty()) { + // Read head has caught up to write head (entire buffer available for write) + assert(read_head_ == write_head_); + return capacity_ > length; + } else if (write_head_ < read_head_) { + // Write head wrapped around and is behind read head. + // | write |---- read ----| + return (read_head_ - write_head_) > length; + } else { + // Read head behind write head. + // 1. Check if there's enough room from write -> capacity + // | |---- read ----| write | + if ((capacity_ - write_head_) > length) { + return true; + } + + // 2. Check if there's enough room from 0 -> read + // | write |---- read ----| | + if ((read_head_) > length) { + return true; + } + } + + return false; +} + +CircularBuffer::Allocation* CircularBuffer::Acquire( + VkDeviceSize length, std::shared_ptr fence) { + if (!CanAcquire(length)) { + return nullptr; + } + + VkDeviceSize aligned_length = xe::round_up(length, alignment_); + if (allocations_.empty()) { + // Entire buffer available. + assert(read_head_ == write_head_); + assert(capacity_ > aligned_length); + + read_head_ = 0; + write_head_ = length; + + auto alloc = new Allocation(); + alloc->host_ptr = host_base_ + 0; + alloc->gpu_memory = gpu_memory_; + alloc->offset = gpu_base_ + 0; + alloc->length = length; + alloc->aligned_length = aligned_length; + alloc->fence = fence; + allocations_.push_back(alloc); + return alloc; + } else if (write_head_ < read_head_) { + // Write head behind read head. 
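+    // (CanAcquire() above has already verified that the gap between the heads
+    // can hold the aligned length, so the assert below should always hold.)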
+ assert_true(read_head_ - write_head_ >= aligned_length); + + auto alloc = new Allocation(); + alloc->host_ptr = host_base_ + write_head_; + alloc->gpu_memory = gpu_memory_; + alloc->offset = gpu_base_ + write_head_; + alloc->length = length; + alloc->aligned_length = aligned_length; + alloc->fence = fence; + write_head_ += aligned_length; + allocations_.push_back(alloc); + + return alloc; + } else { + // Write head after read head + if (capacity_ - write_head_ >= aligned_length) { + // Free space from write -> capacity + auto alloc = new Allocation(); + alloc->host_ptr = host_base_ + write_head_; + alloc->gpu_memory = gpu_memory_; + alloc->offset = gpu_base_ + write_head_; + alloc->length = length; + alloc->aligned_length = aligned_length; + alloc->fence = fence; + write_head_ += aligned_length; + allocations_.push_back(alloc); + + return alloc; + } else if ((read_head_ - 0) > aligned_length) { + // Free space from begin -> read + auto alloc = new Allocation(); + alloc->host_ptr = host_base_ + write_head_; + alloc->gpu_memory = gpu_memory_; + alloc->offset = gpu_base_ + 0; + alloc->length = length; + alloc->aligned_length = aligned_length; + alloc->fence = fence; + write_head_ = aligned_length; + allocations_.push_back(alloc); + + return alloc; + } + } + + return nullptr; +} + +void CircularBuffer::Discard(Allocation* allocation) { + // TODO: Revert write_head_ (only if this is the last alloc though) + // Or maybe just disallow discards. + for (auto it = allocations_.begin(); it != allocations_.end(); ++it) { + if (*it == allocation) { + allocations_.erase(it); + break; + } + } + + delete allocation; +} + +void CircularBuffer::Flush(Allocation* allocation) { + VkMappedMemoryRange range; + range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + range.pNext = nullptr; + range.memory = gpu_memory_; + range.offset = gpu_base_ + allocation->offset; + range.size = allocation->length; + vkFlushMappedMemoryRanges(*device_, 1, &range); +} + +void CircularBuffer::Clear() { + for (auto it = allocations_.begin(); it != allocations_.end();) { + delete *it; + it = allocations_.erase(it); + } + + write_head_ = read_head_ = 0; +} + +void CircularBuffer::Scavenge() { + for (auto it = allocations_.begin(); it != allocations_.end();) { + if ((*it)->fence->status() != VK_SUCCESS) { + // Don't bother freeing following allocations to ensure proper ordering. + break; + } + + read_head_ = (read_head_ + (*it)->aligned_length) % capacity_; + delete *it; + it = allocations_.erase(it); + } + + if (allocations_.empty()) { + // Reset R/W heads. + read_head_ = write_head_ = 0; + } else { + // FIXME: Haven't verified this works correctly when actually rotating :P + assert_always(); + } +} + +} // namespace vulkan +} // namespace ui +} // namespace xe \ No newline at end of file diff --git a/src/xenia/ui/vulkan/circular_buffer.h b/src/xenia/ui/vulkan/circular_buffer.h new file mode 100644 index 000000000..2c036c685 --- /dev/null +++ b/src/xenia/ui/vulkan/circular_buffer.h @@ -0,0 +1,85 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2015 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_UI_VULKAN_CIRCULAR_BUFFER_H_ +#define XENIA_UI_VULKAN_CIRCULAR_BUFFER_H_ + +#include + +#include "xenia/ui/vulkan/vulkan.h" +#include "xenia/ui/vulkan/vulkan_device.h" + +namespace xe { +namespace ui { +namespace vulkan { + +// A circular buffer, intended to hold (fairly) temporary memory that will be +// released when a fence is signaled. Best used when allocations are taken +// in-order with command buffer submission. +// +// Allocations loop around the buffer in circles (but are not fragmented at the +// ends of the buffer), where trailing older allocations are freed after use. +class CircularBuffer { + public: + CircularBuffer(VulkanDevice* device); + ~CircularBuffer(); + + struct Allocation { + void* host_ptr; + VkDeviceMemory gpu_memory; + VkDeviceSize offset; + VkDeviceSize length; + VkDeviceSize aligned_length; + + // Allocation usage fence. This allocation will be deleted when the fence + // becomes signaled. + std::shared_ptr fence; + }; + + bool Initialize(VkDeviceSize capacity, VkBufferUsageFlags usage, + VkDeviceSize alignment = 256); + void Shutdown(); + + VkDeviceSize capacity() const { return capacity_; } + VkBuffer gpu_buffer() const { return gpu_buffer_; } + VkDeviceMemory gpu_memory() const { return gpu_memory_; } + uint8_t* host_base() const { return host_base_; } + + bool CanAcquire(VkDeviceSize length); + Allocation* Acquire(VkDeviceSize length, std::shared_ptr fence); + void Discard(Allocation* allocation); + void Flush(Allocation* allocation); + + // Clears all allocations, regardless of whether they've been consumed or not. + void Clear(); + + // Frees any allocations whose fences have been signaled. + void Scavenge(); + + private: + VkDeviceSize capacity_ = 0; + VkDeviceSize alignment_ = 0; + VkDeviceSize write_head_ = 0; + VkDeviceSize read_head_ = 0; + + VulkanDevice* device_; + VkBuffer gpu_buffer_ = nullptr; + VkDeviceMemory gpu_memory_ = nullptr; + VkDeviceSize gpu_base_ = 0; + uint8_t* host_base_ = nullptr; + + std::unordered_map allocation_cache_; + std::vector allocations_; +}; + +} // namespace vulkan +} // namespace ui +} // namespace xe + +#endif // XENIA_UI_GL_CIRCULAR_BUFFER_H_ From 0e41774e36539baae4a76cc4b0c0d3d4efcf3eb8 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 25 Mar 2016 16:31:12 -0500 Subject: [PATCH 24/77] RenderCache::dirty() - used to tell if we need to begin a new pass Round all pixel pitch/heights up before dividing. --- src/xenia/gpu/vulkan/render_cache.cc | 87 +++++++++++++++++----------- src/xenia/gpu/vulkan/render_cache.h | 16 ++++- 2 files changed, 66 insertions(+), 37 deletions(-) diff --git a/src/xenia/gpu/vulkan/render_cache.cc b/src/xenia/gpu/vulkan/render_cache.cc index 5047bff21..334a1215f 100644 --- a/src/xenia/gpu/vulkan/render_cache.cc +++ b/src/xenia/gpu/vulkan/render_cache.cc @@ -39,7 +39,7 @@ VkFormat ColorRenderTargetFormatToVkFormat(ColorRenderTargetFormat format) { case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_unknown: // WARNING: this is wrong, most likely - no float form in vulkan? 
XELOGW("Unsupported EDRAM format k_2_10_10_10_FLOAT used"); - return VK_FORMAT_A2R10G10B10_SSCALED_PACK32; + return VK_FORMAT_A2R10G10B10_UNORM_PACK32; case ColorRenderTargetFormat::k_16_16: return VK_FORMAT_R16G16_UNORM; case ColorRenderTargetFormat::k_16_16_16_16: @@ -451,10 +451,7 @@ RenderCache::RenderCache(RegisterFile* register_file, CheckResult(status, "vkBindBufferMemory"); if (status == VK_SUCCESS) { - status = vkBindBufferMemory(*device_, edram_buffer_, edram_memory_, 0); - CheckResult(status, "vkBindBufferMemory"); - - // Upload a grid into the EDRAM buffer. + // For debugging, upload a grid into the EDRAM buffer. uint32_t* gpu_data = nullptr; status = vkMapMemory(*device_, edram_memory_, 0, buffer_requirements.size, 0, reinterpret_cast(&gpu_data)); @@ -490,6 +487,25 @@ RenderCache::~RenderCache() { vkFreeMemory(*device_, edram_memory_, nullptr); } +bool RenderCache::dirty() const { + auto& regs = *register_file_; + auto& cur_regs = shadow_registers_; + + bool dirty = false; + dirty |= cur_regs.rb_modecontrol != regs[XE_GPU_REG_RB_MODECONTROL].u32; + dirty |= cur_regs.rb_surface_info != regs[XE_GPU_REG_RB_SURFACE_INFO].u32; + dirty |= cur_regs.rb_color_info != regs[XE_GPU_REG_RB_COLOR_INFO].u32; + dirty |= cur_regs.rb_color1_info != regs[XE_GPU_REG_RB_COLOR1_INFO].u32; + dirty |= cur_regs.rb_color2_info != regs[XE_GPU_REG_RB_COLOR2_INFO].u32; + dirty |= cur_regs.rb_color3_info != regs[XE_GPU_REG_RB_COLOR3_INFO].u32; + dirty |= cur_regs.rb_depth_info != regs[XE_GPU_REG_RB_DEPTH_INFO].u32; + dirty |= cur_regs.pa_sc_window_scissor_tl != + regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; + dirty |= cur_regs.pa_sc_window_scissor_br != + regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; + return dirty; +} + const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, VulkanShader* vertex_shader, VulkanShader* pixel_shader) { @@ -739,8 +755,8 @@ bool RenderCache::ConfigureRenderPass(VkCommandBuffer command_buffer, for (int i = 0; i < 4; ++i) { TileViewKey color_key; color_key.tile_offset = config->color[i].edram_base; - color_key.tile_width = config->surface_pitch_px / 80; - color_key.tile_height = config->surface_height_px / 16; + color_key.tile_width = xe::round_up(config->surface_pitch_px, 80) / 80; + color_key.tile_height = xe::round_up(config->surface_height_px, 16) / 16; color_key.color_or_depth = 1; color_key.edram_format = static_cast(config->color[i].format); target_color_attachments[i] = @@ -753,8 +769,10 @@ bool RenderCache::ConfigureRenderPass(VkCommandBuffer command_buffer, TileViewKey depth_stencil_key; depth_stencil_key.tile_offset = config->depth_stencil.edram_base; - depth_stencil_key.tile_width = config->surface_pitch_px / 80; - depth_stencil_key.tile_height = config->surface_height_px / 16; + depth_stencil_key.tile_width = + xe::round_up(config->surface_pitch_px, 80) / 80; + depth_stencil_key.tile_height = + xe::round_up(config->surface_height_px, 16) / 16; depth_stencil_key.color_or_depth = 0; depth_stencil_key.edram_format = static_cast(config->depth_stencil.format); @@ -960,6 +978,7 @@ void RenderCache::BlitToImage(VkCommandBuffer command_buffer, &buffer_barrier, 0, nullptr); // Update the tile view with current EDRAM contents. + // TODO: Heuristics to determine if this copy is avoidable. 
VkBufferImageCopy buffer_copy; buffer_copy.bufferOffset = edram_base * 5120; buffer_copy.bufferImageHeight = 0; @@ -980,29 +999,26 @@ void RenderCache::BlitToImage(VkCommandBuffer command_buffer, image_barrier.pNext = nullptr; image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - if (image_layout != VK_IMAGE_LAYOUT_GENERAL && - image_layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { - image_barrier.srcAccessMask = 0; - image_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - image_barrier.oldLayout = image_layout; - image_barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; - image_barrier.image = image; - image_barrier.subresourceRange = {0, 0, 1, 0, 1}; - image_barrier.subresourceRange.aspectMask = - color_or_depth - ? VK_IMAGE_ASPECT_COLOR_BIT - : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + image_barrier.srcAccessMask = 0; + image_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + image_barrier.oldLayout = image_layout; + image_barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + image_barrier.image = image; + image_barrier.subresourceRange = {0, 0, 1, 0, 1}; + image_barrier.subresourceRange.aspectMask = + color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; - vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, - nullptr, 1, &image_barrier); - } + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &image_barrier); // If we overflow we'll lose the device here. assert_true(extents.width <= key.tile_width * 80u); assert_true(extents.height <= key.tile_height * 16u); // Now issue the blit to the destination. + // TODO: Resolve to destination if necessary. VkImageBlit image_blit; image_blit.srcSubresource = {0, 0, 0, 1}; image_blit.srcSubresource.aspectMask = @@ -1024,15 +1040,12 @@ void RenderCache::BlitToImage(VkCommandBuffer command_buffer, image, image_layout, 1, &image_blit, filter); // Transition the image back into its previous layout. - if (image_layout != VK_IMAGE_LAYOUT_GENERAL && - image_layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { - image_barrier.srcAccessMask = image_barrier.dstAccessMask; - image_barrier.dstAccessMask = 0; - std::swap(image_barrier.oldLayout, image_barrier.newLayout); - vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, - nullptr, 1, &image_barrier); - } + image_barrier.srcAccessMask = image_barrier.dstAccessMask; + image_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + std::swap(image_barrier.oldLayout, image_barrier.newLayout); + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &image_barrier); } void RenderCache::ClearEDRAMColor(VkCommandBuffer command_buffer, @@ -1040,6 +1053,9 @@ void RenderCache::ClearEDRAMColor(VkCommandBuffer command_buffer, ColorRenderTargetFormat format, uint32_t pitch, uint32_t height, float* color) { + // TODO: For formats <= 4 bpp, we can directly fill the EDRAM buffer. Just + // need to detect this and calculate a value. 
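+  // A sketch of that direct-fill path (untested; PackClearColor is a
+  // hypothetical helper that packs *color into the format's bit layout):
+  //   uint32_t value = PackClearColor(format, color);
+  //   vkCmdFillBuffer(command_buffer, edram_buffer_, edram_base * 5120,
+  //                   (pitch / 80) * (height / 16) * 5120, value);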
+ // Grab a tile view (as we need to clear an image first) TileViewKey key; key.color_or_depth = 1; @@ -1076,6 +1092,9 @@ void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, DepthRenderTargetFormat format, uint32_t pitch, uint32_t height, float depth, uint32_t stencil) { + // TODO: For formats <= 4 bpp, we can directly fill the EDRAM buffer. Just + // need to detect this and calculate a value. + // Grab a tile view (as we need to clear an image first) TileViewKey key; key.color_or_depth = 0; diff --git a/src/xenia/gpu/vulkan/render_cache.h b/src/xenia/gpu/vulkan/render_cache.h index 97816c365..2e8d1c5fe 100644 --- a/src/xenia/gpu/vulkan/render_cache.h +++ b/src/xenia/gpu/vulkan/render_cache.h @@ -37,8 +37,10 @@ struct TileViewKey { uint16_t tile_height; // 1 if format is ColorRenderTargetFormat, else DepthRenderTargetFormat. uint16_t color_or_depth : 1; + // Surface MSAA samples + // uint16_t msaa_samples : 2; // Either ColorRenderTargetFormat or DepthRenderTargetFormat. - uint16_t edram_format : 15; + uint16_t edram_format : 15; // 13; }; static_assert(sizeof(TileViewKey) == 8, "Key must be tightly packed"); @@ -249,6 +251,10 @@ class RenderCache { RenderCache(RegisterFile* register_file, ui::vulkan::VulkanDevice* device); ~RenderCache(); + // Call this to determine if you should start a new render pass or continue + // with an already open pass. + bool dirty() const; + // Begins a render pass targeting the state-specified framebuffer formats. // The command buffer will be transitioned into the render pass phase. const RenderState* BeginRenderPass(VkCommandBuffer command_buffer, @@ -263,23 +269,27 @@ class RenderCache { void ClearCache(); // Queues commands to copy EDRAM contents into an image. + // The command buffer must not be inside of a render pass when calling this. void RawCopyToImage(VkCommandBuffer command_buffer, uint32_t edram_base, VkImage image, VkImageLayout image_layout, bool color_or_depth, VkOffset3D offset, VkExtent3D extents); // Queues commands to blit EDRAM contents into an image. + // The command buffer must not be inside of a render pass when calling this. void BlitToImage(VkCommandBuffer command_buffer, uint32_t edram_base, uint32_t pitch, uint32_t height, VkImage image, VkImageLayout image_layout, bool color_or_depth, uint32_t format, VkFilter filter, VkOffset3D offset, VkExtent3D extents); - // Queues commands to clear EDRAM contents with a solid color + // Queues commands to clear EDRAM contents with a solid color. + // The command buffer must not be inside of a render pass when calling this. void ClearEDRAMColor(VkCommandBuffer command_buffer, uint32_t edram_base, ColorRenderTargetFormat format, uint32_t pitch, uint32_t height, float* color); // Queues commands to clear EDRAM contents with depth/stencil values. + // The command buffer must not be inside of a render pass when calling this. void ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, uint32_t edram_base, DepthRenderTargetFormat format, uint32_t pitch, @@ -307,7 +317,7 @@ class RenderCache { RegisterFile* register_file_ = nullptr; ui::vulkan::VulkanDevice* device_ = nullptr; - // Entire 10MiB of EDRAM, aliased to hell by various VkImages. + // Entire 10MiB of EDRAM. VkDeviceMemory edram_memory_ = nullptr; // Buffer overlayed 1:1 with edram_memory_ to allow raw access. VkBuffer edram_buffer_ = nullptr; From b2457d7e724645678f40721682c4135f96697aec Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Fri, 25 Mar 2016 16:32:29 -0500 Subject: [PATCH 25/77] Basic texture uploads/address lookups/etc Freeing of descriptor sets when the GPU is finished with them. --- src/xenia/gpu/vulkan/texture_cache.cc | 403 +++++++++++++++++++------- src/xenia/gpu/vulkan/texture_cache.h | 84 ++++-- 2 files changed, 363 insertions(+), 124 deletions(-) diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index 5c6e42b8b..500d6ac25 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -26,19 +26,26 @@ using xe::ui::vulkan::CheckResult; constexpr uint32_t kMaxTextureSamplers = 32; -TextureCache::TextureCache(RegisterFile* register_file, +struct TextureConfig { + TextureFormat guest_format; + VkFormat host_format; +}; + +TextureCache::TextureCache(Memory* memory, RegisterFile* register_file, TraceWriter* trace_writer, ui::vulkan::VulkanDevice* device) - : register_file_(register_file), + : memory_(memory), + register_file_(register_file), trace_writer_(trace_writer), - device_(device) { + device_(device), + staging_buffer_(device) { // Descriptor pool used for all of our cached descriptors. VkDescriptorPoolCreateInfo descriptor_pool_info; descriptor_pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; descriptor_pool_info.pNext = nullptr; descriptor_pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; - descriptor_pool_info.maxSets = 256; + descriptor_pool_info.maxSets = 4096; VkDescriptorPoolSize pool_sizes[2]; pool_sizes[0].type = VK_DESCRIPTOR_TYPE_SAMPLER; pool_sizes[0].descriptorCount = 32; @@ -81,50 +88,21 @@ TextureCache::TextureCache(RegisterFile* register_file, nullptr, &texture_descriptor_set_layout_); CheckResult(err, "vkCreateDescriptorSetLayout"); - // Allocate memory for a staging buffer. - VkBufferCreateInfo staging_buffer_info; - staging_buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - staging_buffer_info.pNext = nullptr; - staging_buffer_info.flags = 0; - staging_buffer_info.size = 2048 * 2048 * 4; // 16MB buffer - staging_buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT; - staging_buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - staging_buffer_info.queueFamilyIndexCount = 0; - staging_buffer_info.pQueueFamilyIndices = nullptr; - err = - vkCreateBuffer(*device_, &staging_buffer_info, nullptr, &staging_buffer_); - CheckResult(err, "vkCreateBuffer"); - if (err != VK_SUCCESS) { - // This isn't good. + int width = 4096; + int height = 4096; + if (!staging_buffer_.Initialize(width * height * 4, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT)) { assert_always(); - return; } - VkMemoryRequirements staging_buffer_reqs; - vkGetBufferMemoryRequirements(*device_, staging_buffer_, - &staging_buffer_reqs); - staging_buffer_mem_ = device_->AllocateMemory(staging_buffer_reqs); - assert_not_null(staging_buffer_mem_); - - err = vkBindBufferMemory(*device_, staging_buffer_, staging_buffer_mem_, 0); - CheckResult(err, "vkBindBufferMemory"); - // Upload a grid into the staging buffer. - uint32_t* gpu_data = nullptr; - err = vkMapMemory(*device_, staging_buffer_mem_, 0, staging_buffer_info.size, - 0, reinterpret_cast(&gpu_data)); - CheckResult(err, "vkMapMemory"); - - int width = 2048; - int height = 2048; + auto gpu_data = reinterpret_cast(staging_buffer_.host_base()); for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { gpu_data[y * width + x] = ((y % 32 < 16) ^ (x % 32 >= 16)) ? 
0xFF0000FF : 0xFFFFFFFF; } } - - vkUnmapMemory(*device_, staging_buffer_mem_); } TextureCache::~TextureCache() { @@ -223,6 +201,10 @@ TextureCache::Texture* TextureCache::AllocateTexture( auto texture_view = std::make_unique(); texture_view->texture = texture; texture_view->view = view; + texture_view->swiz_x = 0; + texture_view->swiz_y = 1; + texture_view->swiz_z = 2; + texture_view->swiz_w = 3; texture->views.push_back(std::move(texture_view)); } @@ -245,28 +227,16 @@ TextureCache::Texture* TextureCache::DemandResolveTexture( return texture; } - // Check resolve textures. - for (auto it = resolve_textures_.begin(); it != resolve_textures_.end(); - ++it) { - texture = (*it).get(); - if (texture_info.guest_address == texture->texture_info.guest_address && - texture_info.size_2d.logical_width == - texture->texture_info.size_2d.logical_width && - texture_info.size_2d.logical_height == - texture->texture_info.size_2d.logical_height) { - // Exact match. - return texture; - } - } - // No texture at this location. Make a new one. texture = AllocateTexture(texture_info); + texture->is_full_texture = false; resolve_textures_.push_back(std::unique_ptr(texture)); return texture; } -TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info, - VkCommandBuffer command_buffer) { +TextureCache::Texture* TextureCache::Demand( + const TextureInfo& texture_info, VkCommandBuffer command_buffer, + std::shared_ptr completion_fence) { // Run a tight loop to scan for an exact match existing texture. auto texture_hash = texture_info.hash(); for (auto it = textures_.find(texture_hash); it != textures_.end(); ++it) { @@ -285,9 +255,13 @@ TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info, texture_info.size_2d.logical_height == texture->texture_info.size_2d.logical_height) { // Exact match. - // TODO: Lazy match + // TODO: Lazy match (at an offset) + // Upgrade this texture to a full texture. + texture->is_full_texture = true; texture->texture_info = texture_info; textures_[texture_hash] = std::move(*it); + it = resolve_textures_.erase(it); + return textures_[texture_hash].get(); } } @@ -305,7 +279,21 @@ TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info, return nullptr; } - if (!UploadTexture2D(command_buffer, texture, texture_info)) { + bool uploaded = false; + switch (texture_info.dimension) { + case Dimension::k2D: { + uploaded = UploadTexture2D(command_buffer, completion_fence, texture, + texture_info); + } break; + default: + assert_unhandled_case(texture_info.dimension); + break; + } + + // Okay. Now that the texture is uploaded from system memory, put a writewatch + // on it to tell us if it's been modified from the guest. + + if (!uploaded) { // TODO: Destroy the texture. assert_always(); return nullptr; @@ -314,6 +302,7 @@ TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info, // Though we didn't find an exact match, that doesn't mean we're out of the // woods yet. This texture could either be a portion of another texture or // vice versa. Copy any overlapping textures into this texture. 
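+  // (The overlap test itself is a plain interval intersection on the guest
+  // address ranges: a.guest_address < b.guest_address + b.input_length &&
+  // b.guest_address < a.guest_address + a.input_length.)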
+ // TODO: Byte count -> pixel count (on x and y axes) for (auto it = textures_.begin(); it != textures_.end(); ++it) { } @@ -322,6 +311,67 @@ TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info, return texture; } +TextureCache::TextureView* TextureCache::DemandView(Texture* texture, + uint16_t swizzle) { + for (auto it = texture->views.begin(); it != texture->views.end(); ++it) { + if ((*it)->swizzle == swizzle) { + return (*it).get(); + } + } + + VkImageViewCreateInfo view_info; + view_info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + view_info.pNext = nullptr; + view_info.flags = 0; + view_info.image = texture->image; + view_info.format = texture->format; + + switch (texture->texture_info.dimension) { + case Dimension::k1D: + view_info.viewType = VK_IMAGE_VIEW_TYPE_1D; + break; + case Dimension::k2D: + view_info.viewType = VK_IMAGE_VIEW_TYPE_2D; + break; + case Dimension::k3D: + view_info.viewType = VK_IMAGE_VIEW_TYPE_3D; + break; + case Dimension::kCube: + view_info.viewType = VK_IMAGE_VIEW_TYPE_CUBE; + break; + default: + assert_always(); + } + + VkComponentSwizzle swiz_component_map[] = { + VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, + VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A, + VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ZERO, + VK_COMPONENT_SWIZZLE_IDENTITY, + }; + + view_info.components = { + swiz_component_map[(swizzle >> 0) & 0x7], + swiz_component_map[(swizzle >> 3) & 0x7], + swiz_component_map[(swizzle >> 6) & 0x7], + swiz_component_map[(swizzle >> 9) & 0x7], + }; + view_info.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + VkImageView view; + auto status = vkCreateImageView(*device_, &view_info, nullptr, &view); + CheckResult(status, "vkCreateImageView"); + if (status == VK_SUCCESS) { + auto texture_view = new TextureView(); + texture_view->texture = texture; + texture_view->view = view; + texture_view->swizzle = swizzle; + texture->views.push_back(std::unique_ptr(texture_view)); + return texture_view; + } + + return nullptr; +} + TextureCache::Sampler* TextureCache::Demand(const SamplerInfo& sampler_info) { auto sampler_hash = sampler_info.hash(); for (auto it = samplers_.find(sampler_hash); it != samplers_.end(); ++it) { @@ -339,12 +389,28 @@ TextureCache::Sampler* TextureCache::Demand(const SamplerInfo& sampler_info) { sampler_create_info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; sampler_create_info.pNext = nullptr; sampler_create_info.flags = 0; - sampler_create_info.magFilter = VK_FILTER_NEAREST; sampler_create_info.minFilter = VK_FILTER_NEAREST; + sampler_create_info.magFilter = VK_FILTER_NEAREST; sampler_create_info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; - sampler_create_info.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT; - sampler_create_info.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT; - sampler_create_info.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT; + + // FIXME: Both halfway / mirror clamp to border aren't mapped properly. 
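+  // Note that VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE is optional in
+  // Vulkan 1.0; the entries below that use it assume the device exposes the
+  // VK_KHR_sampler_mirror_clamp_to_edge extension.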
+ VkSamplerAddressMode address_mode_map[] = { + /* kRepeat */ VK_SAMPLER_ADDRESS_MODE_REPEAT, + /* kMirroredRepeat */ VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT, + /* kClampToEdge */ VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, + /* kMirrorClampToEdge */ VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE, + /* kClampToHalfway */ VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, + /* kMirrorClampToHalfway */ VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE, + /* kClampToBorder */ VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, + /* kMirrorClampToBorder */ VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE, + }; + sampler_create_info.addressModeU = + address_mode_map[static_cast(sampler_info.clamp_u)]; + sampler_create_info.addressModeV = + address_mode_map[static_cast(sampler_info.clamp_v)]; + sampler_create_info.addressModeW = + address_mode_map[static_cast(sampler_info.clamp_w)]; + sampler_create_info.mipLodBias = 0.0f; sampler_create_info.anisotropyEnable = VK_FALSE; sampler_create_info.maxAnisotropy = 1.0f; @@ -375,6 +441,22 @@ TextureCache::Texture* TextureCache::LookupAddress( TextureFormat format, uint32_t* offset_x, uint32_t* offset_y) { for (auto it = textures_.begin(); it != textures_.end(); ++it) { const auto& texture_info = it->second->texture_info; + if (guest_address >= texture_info.guest_address && + guest_address < + texture_info.guest_address + texture_info.input_length && + offset_x && offset_y) { + auto offset_bytes = guest_address - texture_info.guest_address; + + if (texture_info.dimension == Dimension::k2D) { + *offset_y = offset_bytes / texture_info.size_2d.input_pitch; + if (offset_bytes % texture_info.size_2d.input_pitch != 0) { + // TODO: offset_x + } + } + + return it->second.get(); + } + if (texture_info.guest_address == guest_address && texture_info.dimension == Dimension::k2D && texture_info.size_2d.input_width == width && @@ -383,20 +465,86 @@ TextureCache::Texture* TextureCache::LookupAddress( } } - // TODO: Try to match at an offset. + // Check resolve textures + for (auto it = resolve_textures_.begin(); it != resolve_textures_.end(); + ++it) { + const auto& texture_info = (*it)->texture_info; + if (guest_address >= texture_info.guest_address && + guest_address < + texture_info.guest_address + texture_info.input_length && + offset_x && offset_y) { + auto offset_bytes = guest_address - texture_info.guest_address; + + if (texture_info.dimension == Dimension::k2D) { + *offset_y = offset_bytes / texture_info.size_2d.input_pitch; + if (offset_bytes % texture_info.size_2d.input_pitch != 0) { + // TODO: offset_x + } + } + + return (*it).get(); + } + + if (texture_info.guest_address == guest_address && + texture_info.dimension == Dimension::k2D && + texture_info.size_2d.input_width == width && + texture_info.size_2d.input_height == height) { + return (*it).get(); + } + } + return nullptr; } -bool TextureCache::UploadTexture2D(VkCommandBuffer command_buffer, - Texture* dest, TextureInfo src) { - // TODO: We need to allocate memory to use as a staging buffer. We can then - // raw copy the texture from system memory into the staging buffer and use a - // shader to convert the texture into a format consumable by the host GPU. 
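+// Swaps a linear buffer of texel data from guest byte ordering to host byte
+// ordering. For example, Endian::k16in32 swaps the two 16-bit halves of each
+// 32-bit word, so 0xAAAABBBB becomes 0xBBBBAAAA.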
+void TextureSwap(Endian endianness, void* dest, const void* src,
+                 size_t length) {
+  switch (endianness) {
+    case Endian::k8in16:
+      xe::copy_and_swap_16_aligned(dest, src, length / 2);
+      break;
+    case Endian::k8in32:
+      xe::copy_and_swap_32_aligned(dest, src, length / 4);
+      break;
+    case Endian::k16in32:  // Swap high and low 16 bits within a 32 bit word
+      xe::copy_and_swap_16_in_32_aligned(dest, src, length);
+      break;
+    default:
+    case Endian::kUnspecified:
+      std::memcpy(dest, src, length);
+      break;
+  }
+}
 
-  // Need to have unique memory for every upload for at least one frame. If we
-  // run out of memory, we need to flush all queued upload commands to the GPU.
+bool TextureCache::UploadTexture2D(
+    VkCommandBuffer command_buffer,
+    std::shared_ptr<ui::vulkan::Fence> completion_fence, Texture* dest,
+    TextureInfo src) {
+  SCOPE_profile_cpu_f("gpu");
+  assert_true(src.dimension == Dimension::k2D);
 
-  // TODO: Upload memory here.
+  if (!staging_buffer_.CanAcquire(src.input_length)) {
+    // Need to have unique memory for every upload for at least one frame. If we
+    // run out of memory, we need to flush all queued upload commands to the
+    // GPU.
+    // TODO: Actually flush commands.
+    assert_always();
+  }
+
+  // Grab some temporary memory for staging.
+  auto alloc = staging_buffer_.Acquire(src.input_length, completion_fence);
+  assert_not_null(alloc);
+
+  // TODO: Support these cases.
+  // assert_false(src.is_tiled);
+  // assert_false(src.is_compressed());
+
+  // Upload texture into GPU memory.
+  // TODO: If the GPU supports it, we can submit a compute batch to convert the
+  // texture and copy it to its destination. Otherwise, fallback to conversion
+  // on the CPU.
+  auto guest_ptr = memory_->TranslatePhysical(src.guest_address);
+  TextureSwap(src.endianness, alloc->host_ptr, guest_ptr, src.input_length);
+  staging_buffer_.Flush(alloc);
 
   // Insert a memory barrier into the command buffer to ensure the upload has
   // finished before we copy it into the destination texture.
   VkBufferMemoryBarrier barrier = {
       VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
       nullptr,
       VK_ACCESS_HOST_WRITE_BIT,
       VK_ACCESS_TRANSFER_READ_BIT,
       VK_QUEUE_FAMILY_IGNORED,
       VK_QUEUE_FAMILY_IGNORED,
-      staging_buffer_,
-      0,
-      2048 * 2048 * 4,
+      staging_buffer_.gpu_buffer(),
+      alloc->offset,
+      alloc->aligned_length,
   };
   vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                        VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1,
                        &barrier);
@@ -432,18 +580,24 @@ bool TextureCache::UploadTexture2D(VkCommandBuffer command_buffer,
                        VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
                        nullptr, 1, &barrier);
 
+  assert_true(src.size_2d.input_width >=
+              dest->texture_info.size_2d.output_width);
+  assert_true(src.size_2d.input_height >=
+              dest->texture_info.size_2d.output_height);
+
   // Copy the swapped staging data into the destination texture.
   VkBufferImageCopy copy_region;
-  copy_region.bufferOffset = 0;
-  copy_region.bufferRowLength = 2048;
-  copy_region.bufferImageHeight = 2048;
+  copy_region.bufferOffset = alloc->offset;
+  copy_region.bufferRowLength = src.size_2d.input_width;
+  copy_region.bufferImageHeight = src.size_2d.input_height;
   copy_region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
   copy_region.imageOffset = {0, 0, 0};
-  copy_region.imageExtent = {dest->texture_info.width + 1,
-                             dest->texture_info.height + 1,
+  copy_region.imageExtent = {dest->texture_info.size_2d.output_width + 1,
+                             dest->texture_info.size_2d.output_height + 1,
                              dest->texture_info.depth + 1};
-  vkCmdCopyBufferToImage(command_buffer, staging_buffer_, dest->image,
-                         VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy_region);
+  vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(),
+                         dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1,
+                         &copy_region);
 
   // Now transition the texture into a shader readonly source.
   barrier.srcAccessMask = barrier.dstAccessMask;
@@ -460,6 +614,7 @@ bool TextureCache::UploadTexture2D(VkCommandBuffer command_buffer,
 
 VkDescriptorSet TextureCache::PrepareTextureSet(
     VkCommandBuffer command_buffer,
+    std::shared_ptr<ui::vulkan::Fence> completion_fence,
     const std::vector<Shader::TextureBinding>& vertex_bindings,
     const std::vector<Shader::TextureBinding>& pixel_bindings) {
   // Clear state.
@@ -476,12 +631,12 @@ VkDescriptorSet TextureCache::PrepareTextureSet(
   // This does things lazily and de-dupes fetch constants reused in both
   // shaders.
   bool any_failed = false;
-  any_failed =
-      !SetupTextureBindings(update_set_info, vertex_bindings, command_buffer) ||
-      any_failed;
-  any_failed =
-      !SetupTextureBindings(update_set_info, pixel_bindings, command_buffer) ||
-      any_failed;
+  any_failed = !SetupTextureBindings(command_buffer, completion_fence,
+                                     update_set_info, vertex_bindings) ||
+               any_failed;
+  any_failed = !SetupTextureBindings(command_buffer, completion_fence,
+                                     update_set_info, pixel_bindings) ||
+               any_failed;
   if (any_failed) {
     XELOGW("Failed to setup one or more texture bindings");
     // TODO(benvanik): actually bail out here?
@@ -518,6 +673,7 @@ VkDescriptorSet TextureCache::PrepareTextureSet(
     sampler_write.pImageInfo = update_set_info->sampler_infos;
   }
   */
+  // FIXME: These may not be lined up properly with the TF binding points!
   if (update_set_info->image_1d_write_count) {
     auto& image_write = descriptor_writes[descriptor_write_count++];
     image_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
@@ -567,30 +723,33 @@ VkDescriptorSet TextureCache::PrepareTextureSet(
                            0, nullptr);
   }
 
+  in_flight_sets_.push_back({descriptor_set, completion_fence});
   return descriptor_set;
 }
 
 bool TextureCache::SetupTextureBindings(
+    VkCommandBuffer command_buffer,
+    std::shared_ptr<ui::vulkan::Fence> completion_fence,
     UpdateSetInfo* update_set_info,
-    const std::vector<Shader::TextureBinding>& bindings,
-    VkCommandBuffer command_buffer) {
+    const std::vector<Shader::TextureBinding>& bindings) {
   bool any_failed = false;
   for (auto& binding : bindings) {
     uint32_t fetch_bit = 1 << binding.fetch_constant;
     if ((update_set_info->has_setup_fetch_mask & fetch_bit) == 0) {
       // Needs setup.
- any_failed = - !SetupTextureBinding(update_set_info, binding, command_buffer) || - any_failed; + any_failed = !SetupTextureBinding(command_buffer, completion_fence, + update_set_info, binding) || + any_failed; update_set_info->has_setup_fetch_mask |= fetch_bit; } } return !any_failed; } -bool TextureCache::SetupTextureBinding(UpdateSetInfo* update_set_info, - const Shader::TextureBinding& binding, - VkCommandBuffer command_buffer) { +bool TextureCache::SetupTextureBinding( + VkCommandBuffer command_buffer, + std::shared_ptr completion_fence, + UpdateSetInfo* update_set_info, const Shader::TextureBinding& binding) { auto& regs = *register_file_; int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + binding.fetch_constant * 6; auto group = @@ -615,18 +774,48 @@ bool TextureCache::SetupTextureBinding(UpdateSetInfo* update_set_info, return false; // invalid texture used } - auto texture = Demand(texture_info, command_buffer); + auto texture = Demand(texture_info, command_buffer, completion_fence); auto sampler = Demand(sampler_info); assert_true(texture != nullptr && sampler != nullptr); + if (texture == nullptr || sampler == nullptr) { + return false; + } + + uint16_t swizzle = static_cast(fetch.swizzle); + auto view = DemandView(texture, swizzle); trace_writer_->WriteMemoryRead(texture_info.guest_address, texture_info.input_length); - auto& image_write = - update_set_info->image_2d_infos[update_set_info->image_2d_write_count++]; - image_write.imageView = texture->views[0]->view; - image_write.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - image_write.sampler = sampler->sampler; + VkDescriptorImageInfo* image_write = nullptr; + switch (texture_info.dimension) { + case Dimension::k1D: + image_write = + &update_set_info + ->image_1d_infos[update_set_info->image_1d_write_count++]; + break; + case Dimension::k2D: + image_write = + &update_set_info + ->image_2d_infos[update_set_info->image_2d_write_count++]; + break; + case Dimension::k3D: + image_write = + &update_set_info + ->image_3d_infos[update_set_info->image_3d_write_count++]; + break; + case Dimension::kCube: + image_write = + &update_set_info + ->image_cube_infos[update_set_info->image_cube_write_count++]; + break; + default: + assert_unhandled_case(texture_info.dimension); + return false; + } + image_write->imageView = view->view; + image_write->imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + image_write->sampler = sampler->sampler; return true; } @@ -635,6 +824,22 @@ void TextureCache::ClearCache() { // TODO(benvanik): caching. } +void TextureCache::Scavenge() { + // Free unused descriptor sets + for (auto it = in_flight_sets_.begin(); it != in_flight_sets_.end();) { + if (vkGetFenceStatus(*device_, *it->second) == VK_SUCCESS) { + // We can free this one. 
+ vkFreeDescriptorSets(*device_, descriptor_pool_, 1, &it->first); + it = in_flight_sets_.erase(it); + continue; + } + + ++it; + } + + staging_buffer_.Scavenge(); +} + } // namespace vulkan } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h index 6264a4a98..dfc993763 100644 --- a/src/xenia/gpu/vulkan/texture_cache.h +++ b/src/xenia/gpu/vulkan/texture_cache.h @@ -17,7 +17,9 @@ #include "xenia/gpu/shader.h" #include "xenia/gpu/texture_info.h" #include "xenia/gpu/trace_writer.h" +#include "xenia/gpu/vulkan/vulkan_command_processor.h" #include "xenia/gpu/xenos.h" +#include "xenia/ui/vulkan/circular_buffer.h" #include "xenia/ui/vulkan/vulkan.h" #include "xenia/ui/vulkan/vulkan_device.h" @@ -38,22 +40,38 @@ class TextureCache { // True if we know all info about this texture, false otherwise. // (e.g. we resolve to system memory and may not know the full details about // this texture) - bool full_texture; + bool is_full_texture; VkFormat format; VkImage image; VkImageLayout image_layout; VkDeviceMemory image_memory; VkDeviceSize memory_offset; VkDeviceSize memory_size; + + uintptr_t access_watch_handle; + bool pending_invalidation; }; struct TextureView { Texture* texture; VkImageView view; + + union { + struct { + // FIXME: This only applies on little-endian platforms! + uint16_t swiz_x : 3; + uint16_t swiz_y : 3; + uint16_t swiz_z : 3; + uint16_t swiz_w : 3; + uint16_t : 4; + }; + + uint16_t swizzle; + }; }; - TextureCache(RegisterFile* register_file, TraceWriter* trace_writer, - ui::vulkan::VulkanDevice* device); + TextureCache(Memory* memory, RegisterFile* register_file, + TraceWriter* trace_writer, ui::vulkan::VulkanDevice* device); ~TextureCache(); // Descriptor set layout containing all possible texture bindings. @@ -64,8 +82,11 @@ class TextureCache { // Prepares a descriptor set containing the samplers and images for all // bindings. The textures will be uploaded/converted/etc as needed. + // Requires a fence to be provided that will be signaled when finished + // using the returned descriptor set. VkDescriptorSet PrepareTextureSet( - VkCommandBuffer command_buffer, + VkCommandBuffer setup_command_buffer, + std::shared_ptr completion_fence, const std::vector& vertex_bindings, const std::vector& pixel_bindings); @@ -73,6 +94,16 @@ class TextureCache { // TODO(benvanik): Resolve. // TODO(benvanik): ReadTexture. + // Looks for a texture either containing or matching these parameters. + // Caller is responsible for checking if the texture returned is an exact + // match or just contains the texture given by the parameters. + // If offset_x and offset_y are not null, this may return a texture that + // contains this address at an offset. + Texture* LookupAddress(uint32_t guest_address, uint32_t width, + uint32_t height, TextureFormat format, + uint32_t* offset_x = nullptr, + uint32_t* offset_y = nullptr); + // Demands a texture for the purpose of resolving from EDRAM. This either // creates a new texture or returns a previously created texture. texture_info // is not required to be completely filled out, just guest_address and size. @@ -89,6 +120,9 @@ class TextureCache { // Clears all cached content. void ClearCache(); + // Frees any unused resources + void Scavenge(); + private: struct UpdateSetInfo; @@ -104,31 +138,30 @@ class TextureCache { // Demands a texture. If command_buffer is null and the texture hasn't been // uploaded to graphics memory already, we will return null and bail. 
- Texture* Demand(const TextureInfo& texture_info, - VkCommandBuffer command_buffer = nullptr); + Texture* Demand( + const TextureInfo& texture_info, VkCommandBuffer command_buffer = nullptr, + std::shared_ptr completion_fence = nullptr); + TextureView* DemandView(Texture* texture, uint16_t swizzle); Sampler* Demand(const SamplerInfo& sampler_info); - // Looks for a texture either containing or matching these parameters. - // Caller is responsible for checking if the texture returned is an exact - // match or just contains the texture given by the parameters. - // If offset_x and offset_y are not null, this may return a texture that - // contains this image at an offset. - Texture* LookupAddress(uint32_t guest_address, uint32_t width, - uint32_t height, TextureFormat format, - uint32_t* offset_x, uint32_t* offset_y); - // Queues commands to upload a texture from system memory, applying any // conversions necessary. This may flush the command buffer to the GPU if we // run out of staging memory. - bool UploadTexture2D(VkCommandBuffer command_buffer, Texture* dest, - TextureInfo src); + bool UploadTexture2D(VkCommandBuffer command_buffer, + std::shared_ptr completion_fence, + Texture* dest, TextureInfo src); - bool SetupTextureBindings(UpdateSetInfo* update_set_info, - const std::vector& bindings, - VkCommandBuffer command_buffer = nullptr); - bool SetupTextureBinding(UpdateSetInfo* update_set_info, - const Shader::TextureBinding& binding, - VkCommandBuffer command_buffer = nullptr); + bool SetupTextureBindings( + VkCommandBuffer command_buffer, + std::shared_ptr completion_fence, + UpdateSetInfo* update_set_info, + const std::vector& bindings); + bool SetupTextureBinding(VkCommandBuffer command_buffer, + std::shared_ptr completion_fence, + UpdateSetInfo* update_set_info, + const Shader::TextureBinding& binding); + + Memory* memory_ = nullptr; RegisterFile* register_file_ = nullptr; TraceWriter* trace_writer_ = nullptr; @@ -136,10 +169,11 @@ class TextureCache { VkDescriptorPool descriptor_pool_ = nullptr; VkDescriptorSetLayout texture_descriptor_set_layout_ = nullptr; + std::vector>> + in_flight_sets_; // Temporary until we have circular buffers. - VkBuffer staging_buffer_ = nullptr; - VkDeviceMemory staging_buffer_mem_ = nullptr; + ui::vulkan::CircularBuffer staging_buffer_; std::unordered_map> textures_; std::unordered_map> samplers_; std::vector> resolve_textures_; From 1e1da1eb6c78a70f18d188228a79a99fe3f9072f Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 25 Mar 2016 16:34:14 -0500 Subject: [PATCH 26/77] PipelineCache::ConfigurePipeline - Inform the caller if the pipeline is dirty or they can reuse the previously bound pipeline. Make SetDynamicState public. 
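ConfigurePipeline no longer binds the pipeline or issues dynamic state on its
own; it reports an UpdateStatus and hands the pipeline back through the new
out parameter so the caller decides whether a rebind is needed. A minimal
sketch of the expected call site, assuming the caller tracks whether it just
began a fresh command buffer (started_new_buffer below is an illustrative
name for that flag, not an actual member):

    VkPipeline pipeline = nullptr;
    auto status = pipeline_cache_->ConfigurePipeline(
        command_buffer, render_state, vertex_shader, pixel_shader,
        primitive_type, &pipeline);
    if (status == PipelineCache::UpdateStatus::kError) {
      return false;  // State update failed; skip this draw.
    }
    if (status == PipelineCache::UpdateStatus::kMismatch ||
        started_new_buffer) {
      // Rebind only when the pipeline actually changed or the fresh command
      // buffer has nothing bound yet; kCompatible reuses the current binding.
      vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS,
                        pipeline);
    }
    // Dynamic state lives in the command buffer, so request a full update on
    // a fresh buffer and a delta update otherwise.
    pipeline_cache_->SetDynamicState(command_buffer, started_new_buffer);
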
--- src/xenia/gpu/vulkan/pipeline_cache.cc | 31 ++++++++---------------- src/xenia/gpu/vulkan/pipeline_cache.h | 33 +++++++++++++------------- 2 files changed, 27 insertions(+), 37 deletions(-) diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc index ee1174a72..efcaf5b46 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/pipeline_cache.cc @@ -183,11 +183,12 @@ VulkanShader* PipelineCache::LoadShader(ShaderType shader_type, return shader; } -bool PipelineCache::ConfigurePipeline(VkCommandBuffer command_buffer, - const RenderState* render_state, - VulkanShader* vertex_shader, - VulkanShader* pixel_shader, - PrimitiveType primitive_type) { +PipelineCache::UpdateStatus PipelineCache::ConfigurePipeline( + VkCommandBuffer command_buffer, const RenderState* render_state, + VulkanShader* vertex_shader, VulkanShader* pixel_shader, + PrimitiveType primitive_type, VkPipeline* pipeline_out) { + assert_not_null(pipeline_out); + // Perform a pass over all registers and state updating our cached structures. // This will tell us if anything has changed that requires us to either build // a new pipeline or use an existing one. @@ -208,7 +209,7 @@ bool PipelineCache::ConfigurePipeline(VkCommandBuffer command_buffer, // Error updating state - bail out. // We are in an indeterminate state, so reset things for the next attempt. current_pipeline_ = nullptr; - return false; + return update_status; } if (!pipeline) { // Should have a hash key produced by the UpdateState pass. @@ -217,24 +218,12 @@ bool PipelineCache::ConfigurePipeline(VkCommandBuffer command_buffer, current_pipeline_ = pipeline; if (!pipeline) { // Unable to create pipeline. - return false; + return UpdateStatus::kError; } } - // Bind the pipeline. - vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); - - // Issue all changed dynamic state information commands. - // TODO(benvanik): dynamic state is kept in the command buffer, so if we - // have issued it before (regardless of pipeline) we don't need to do it now. - // TODO(benvanik): track whether we have issued on the given command buffer. - bool full_dynamic_state = true; - if (!SetDynamicState(command_buffer, full_dynamic_state)) { - // Failed to update state. - return false; - } - - return true; + *pipeline_out = pipeline; + return update_status; } void PipelineCache::ClearCache() { diff --git a/src/xenia/gpu/vulkan/pipeline_cache.h b/src/xenia/gpu/vulkan/pipeline_cache.h index b33c030ed..66b2e87ef 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.h +++ b/src/xenia/gpu/vulkan/pipeline_cache.h @@ -32,6 +32,12 @@ namespace vulkan { // including shaders, various blend/etc options, and input configuration. class PipelineCache { public: + enum class UpdateStatus { + kCompatible, + kMismatch, + kError, + }; + PipelineCache(RegisterFile* register_file, ui::vulkan::VulkanDevice* device, VkDescriptorSetLayout uniform_descriptor_set_layout, VkDescriptorSetLayout texture_descriptor_set_layout); @@ -46,11 +52,17 @@ class PipelineCache { // otherwise a new one may be created. Any state that can be set dynamically // in the command buffer is issued at this time. // Returns whether the pipeline could be successfully created. 
- bool ConfigurePipeline(VkCommandBuffer command_buffer, - const RenderState* render_state, - VulkanShader* vertex_shader, - VulkanShader* pixel_shader, - PrimitiveType primitive_type); + UpdateStatus ConfigurePipeline(VkCommandBuffer command_buffer, + const RenderState* render_state, + VulkanShader* vertex_shader, + VulkanShader* pixel_shader, + PrimitiveType primitive_type, + VkPipeline* pipeline_out); + + // Sets required dynamic state on the command buffer. + // Only state that has changed since the last call will be set unless + // full_update is true. + bool SetDynamicState(VkCommandBuffer command_buffer, bool full_update); // Pipeline layout shared by all pipelines. VkPipelineLayout pipeline_layout() const { return pipeline_layout_; } @@ -68,11 +80,6 @@ class PipelineCache { VkShaderModule GetGeometryShader(PrimitiveType primitive_type, bool is_line_mode); - // Sets required dynamic state on the command buffer. - // Only state that has changed since the last call will be set unless - // full_update is true. - bool SetDynamicState(VkCommandBuffer command_buffer, bool full_update); - RegisterFile* register_file_ = nullptr; VkDevice device_ = nullptr; @@ -111,12 +118,6 @@ class PipelineCache { VkPipeline current_pipeline_ = nullptr; private: - enum class UpdateStatus { - kCompatible, - kMismatch, - kError, - }; - UpdateStatus UpdateState(VulkanShader* vertex_shader, VulkanShader* pixel_shader, PrimitiveType primitive_type); From f75e5fec2463fffc54cd71d02652ac59291f07fb Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 25 Mar 2016 16:35:34 -0500 Subject: [PATCH 27/77] CP: Use a single command buffer for every frame, reuse render passes/pipelines if not dirty Hook up resolves and swaps --- .../gpu/vulkan/vulkan_command_processor.cc | 488 ++++++++++++++---- .../gpu/vulkan/vulkan_command_processor.h | 10 + 2 files changed, 407 insertions(+), 91 deletions(-) diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 48c7d681d..1d559d896 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -69,8 +69,8 @@ bool VulkanCommandProcessor::SetupContext() { // Initialize the state machine caches. buffer_cache_ = std::make_unique(register_file_, device_, kDefaultBufferCacheCapacity); - texture_cache_ = - std::make_unique(register_file_, &trace_writer_, device_); + texture_cache_ = std::make_unique(memory_, register_file_, + &trace_writer_, device_); pipeline_cache_ = std::make_unique( register_file_, device_, buffer_cache_->constant_descriptor_set_layout(), texture_cache_->texture_descriptor_set_layout()); @@ -134,21 +134,127 @@ void VulkanCommandProcessor::ReturnFromWait() { void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr, uint32_t frontbuffer_width, uint32_t frontbuffer_height) { - // Ensure we issue any pending draws. - // draw_batcher_.Flush(DrawBatcher::FlushMode::kMakeCoherent); + SCOPE_profile_cpu_f("gpu"); - // Need to finish to be sure the other context sees the right data. - // TODO(benvanik): prevent this? fences? - // glFinish(); + // Queue up current command buffers. + // TODO(benvanik): bigger batches. + if (current_command_buffer_) { + if (current_render_state_) { + render_cache_->EndRenderPass(); + current_render_state_ = nullptr; + } - if (context_->WasLost()) { - // We've lost the context due to a TDR. - // TODO: Dump the current commands to a tracefile. 
- assert_always(); + auto status = vkEndCommandBuffer(current_command_buffer_); + CheckResult(status, "vkEndCommandBuffer"); + status = vkEndCommandBuffer(current_setup_buffer_); + CheckResult(status, "vkEndCommandBuffer"); + command_buffer_pool_->EndBatch(*current_batch_fence_); + + // TODO(benvanik): move to CP or to host (trace dump, etc). + // This only needs to surround a vkQueueSubmit. + static uint32_t frame = 0; + if (device_->is_renderdoc_attached() && + (FLAGS_vulkan_renderdoc_capture_all || + trace_state_ == TraceState::kSingleFrame)) { + if (queue_mutex_) { + queue_mutex_->lock(); + } + + device_->BeginRenderDocFrameCapture(); + + if (queue_mutex_) { + queue_mutex_->unlock(); + } + } + + // TODO(DrChat): If setup buffer is empty, don't bother queueing it up. + VkCommandBuffer command_buffers[] = { + current_setup_buffer_, current_command_buffer_, + }; + + VkSubmitInfo submit_info; + submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit_info.pNext = nullptr; + submit_info.waitSemaphoreCount = 0; + submit_info.pWaitSemaphores = nullptr; + submit_info.commandBufferCount = 2; + submit_info.pCommandBuffers = command_buffers; + submit_info.signalSemaphoreCount = 0; + submit_info.pSignalSemaphores = nullptr; + if (queue_mutex_) { + queue_mutex_->lock(); + } + status = vkQueueSubmit(queue_, 1, &submit_info, *current_batch_fence_); + if (queue_mutex_) { + queue_mutex_->unlock(); + } + CheckResult(status, "vkQueueSubmit"); + + VkFence fences[] = {*current_batch_fence_}; + status = vkWaitForFences(*device_, 1, fences, true, -1); + CheckResult(status, "vkWaitForFences"); + + if (device_->is_renderdoc_attached() && + (FLAGS_vulkan_renderdoc_capture_all || + trace_state_ == TraceState::kSingleFrame)) { + if (queue_mutex_) { + queue_mutex_->lock(); + } + + device_->EndRenderDocFrameCapture(); + + // HACK(DrChat): Used b/c I disabled trace saving code in the CP. + // Remove later. + if (!trace_writer_.is_open()) { + trace_state_ = TraceState::kDisabled; + } + + if (queue_mutex_) { + queue_mutex_->unlock(); + } + } + + // Scavenging. + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + while (command_buffer_pool_->has_pending()) { + command_buffer_pool_->Scavenge(); + xe::threading::MaybeYield(); + } + + texture_cache_->Scavenge(); + current_batch_fence_ = nullptr; + + // TODO: Remove this when we stop waiting on the queue. + buffer_cache_->ClearCache(); + } + + if (!frontbuffer_ptr) { + if (!last_copy_base_) { + // Nothing to draw. + return; + } + + // Trace viewer does this. + frontbuffer_ptr = last_copy_base_; + } + + auto texture = texture_cache_->LookupAddress( + frontbuffer_ptr, xe::round_up(frontbuffer_width, 32), + xe::round_up(frontbuffer_height, 32), TextureFormat::k_8_8_8_8); + // There shouldn't be a case where the texture is null. + assert_not_null(texture); + + if (texture) { + std::lock_guard lock(swap_state_.mutex); + swap_state_.width = frontbuffer_width; + swap_state_.height = frontbuffer_height; + swap_state_.back_buffer_texture = + reinterpret_cast(texture->image); } // Remove any dead textures, etc. - // texture_cache_.Scavenge(); + texture_cache_->Scavenge(); } Shader* VulkanCommandProcessor::LoadShader(ShaderType shader_type, @@ -183,13 +289,8 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, return true; } - // TODO(benvanik): move to CP or to host (trace dump, etc). 
- if (FLAGS_vulkan_renderdoc_capture_all && device_->is_renderdoc_attached()) { - device_->BeginRenderDocFrameCapture(); - } - // Shaders will have already been defined by previous loads. - // We need the to do just about anything so validate here. + // We need them to do just about anything so validate here. auto vertex_shader = static_cast(active_vertex_shader()); auto pixel_shader = static_cast(active_pixel_shader()); if (!vertex_shader || !vertex_shader->is_valid()) { @@ -206,42 +307,73 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, return true; } - // TODO(benvanik): bigger batches. - command_buffer_pool_->BeginBatch(); - VkCommandBuffer command_buffer = command_buffer_pool_->AcquireEntry(); - VkCommandBufferBeginInfo command_buffer_begin_info; - command_buffer_begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - command_buffer_begin_info.pNext = nullptr; - command_buffer_begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - command_buffer_begin_info.pInheritanceInfo = nullptr; - auto err = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); - CheckResult(err, "vkBeginCommandBuffer"); + bool started_command_buffer = false; + if (!current_command_buffer_) { + // TODO(benvanik): bigger batches. + // TODO(DrChat): Decouple setup buffer from current batch. + command_buffer_pool_->BeginBatch(); + current_command_buffer_ = command_buffer_pool_->AcquireEntry(); + current_setup_buffer_ = command_buffer_pool_->AcquireEntry(); + current_batch_fence_.reset(new ui::vulkan::Fence(*device_)); + + VkCommandBufferBeginInfo command_buffer_begin_info; + command_buffer_begin_info.sType = + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + command_buffer_begin_info.pNext = nullptr; + command_buffer_begin_info.flags = + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + command_buffer_begin_info.pInheritanceInfo = nullptr; + auto status = vkBeginCommandBuffer(current_command_buffer_, + &command_buffer_begin_info); + CheckResult(status, "vkBeginCommandBuffer"); + + status = + vkBeginCommandBuffer(current_setup_buffer_, &command_buffer_begin_info); + CheckResult(status, "vkBeginCommandBuffer"); + + started_command_buffer = true; + } + auto command_buffer = current_command_buffer_; // Upload and set descriptors for all textures. // We do this outside of the render pass so the texture cache can upload and // convert textures. - auto samplers = PopulateSamplers(command_buffer, vertex_shader, pixel_shader); + // Setup buffer may be flushed to GPU if the texture cache needs it. + auto samplers = + PopulateSamplers(current_setup_buffer_, vertex_shader, pixel_shader); if (!samplers) { return false; } // Begin the render pass. // This will setup our framebuffer and begin the pass in the command buffer. - auto render_state = render_cache_->BeginRenderPass( - command_buffer, vertex_shader, pixel_shader); - if (!render_state) { - return false; + // This reuses a previous render pass if one is already open. + if (render_cache_->dirty() || !current_render_state_) { + if (current_render_state_) { + render_cache_->EndRenderPass(); + current_render_state_ = nullptr; + } + + current_render_state_ = render_cache_->BeginRenderPass( + command_buffer, vertex_shader, pixel_shader); + if (!current_render_state_) { + return false; + } } // Configure the pipeline for drawing. // This encodes all render state (blend, depth, etc), our shader stages, // and our vertex input layout. 
- if (!pipeline_cache_->ConfigurePipeline(command_buffer, render_state, - vertex_shader, pixel_shader, - primitive_type)) { - render_cache_->EndRenderPass(); - return false; + VkPipeline pipeline = nullptr; + auto pipeline_status = pipeline_cache_->ConfigurePipeline( + command_buffer, current_render_state_, vertex_shader, pixel_shader, + primitive_type, &pipeline); + if (pipeline_status == PipelineCache::UpdateStatus::kMismatch || + started_command_buffer) { + vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, + pipeline); } + pipeline_cache_->SetDynamicState(command_buffer, started_command_buffer); // Pass registers to the shaders. if (!PopulateConstants(command_buffer, vertex_shader, pixel_shader)) { @@ -285,57 +417,6 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, vertex_offset, first_instance); } - // End the rendering pass. - render_cache_->EndRenderPass(); - - // TODO(benvanik): bigger batches. - err = vkEndCommandBuffer(command_buffer); - CheckResult(err, "vkEndCommandBuffer"); - VkFence fence; - VkFenceCreateInfo fence_info; - fence_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fence_info.pNext = nullptr; - fence_info.flags = 0; - vkCreateFence(*device_, &fence_info, nullptr, &fence); - command_buffer_pool_->EndBatch(fence); - VkSubmitInfo submit_info; - submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submit_info.pNext = nullptr; - submit_info.waitSemaphoreCount = 0; - submit_info.pWaitSemaphores = nullptr; - submit_info.commandBufferCount = 1; - submit_info.pCommandBuffers = &command_buffer; - submit_info.signalSemaphoreCount = 0; - submit_info.pSignalSemaphores = nullptr; - if (queue_mutex_) { - queue_mutex_->lock(); - } - err = vkQueueSubmit(queue_, 1, &submit_info, fence); - if (queue_mutex_) { - queue_mutex_->unlock(); - } - CheckResult(err, "vkQueueSubmit"); - if (queue_mutex_) { - queue_mutex_->lock(); - } - err = vkQueueWaitIdle(queue_); - CheckResult(err, "vkQueueWaitIdle"); - err = vkDeviceWaitIdle(*device_); - CheckResult(err, "vkDeviceWaitIdle"); - if (queue_mutex_) { - queue_mutex_->unlock(); - } - while (command_buffer_pool_->has_pending()) { - command_buffer_pool_->Scavenge(); - xe::threading::MaybeYield(); - } - vkDestroyFence(*device_, fence, nullptr); - - // TODO(benvanik): move to CP or to host (trace dump, etc). - if (FLAGS_vulkan_renderdoc_capture_all && device_->is_renderdoc_attached()) { - device_->EndRenderDocFrameCapture(); - } - return true; } @@ -486,7 +567,7 @@ VkDescriptorSet VulkanCommandProcessor::PopulateSamplers( #endif // FINE_GRAINED_DRAW_SCOPES auto descriptor_set = texture_cache_->PrepareTextureSet( - command_buffer, vertex_shader->texture_bindings(), + command_buffer, current_batch_fence_, vertex_shader->texture_bindings(), pixel_shader->texture_bindings()); if (!descriptor_set) { // Unable to bind set. @@ -519,7 +600,7 @@ bool VulkanCommandProcessor::IssueCopy() { uint32_t copy_dest_slice = (copy_dest_info >> 4) & 0x7; assert_true(copy_dest_slice == 0); auto copy_dest_format = - static_cast((copy_dest_info >> 7) & 0x3F); + static_cast((copy_dest_info >> 7) & 0x3F); uint32_t copy_dest_number = (copy_dest_info >> 13) & 0x7; // assert_true(copy_dest_number == 0); // ? uint32_t copy_dest_bias = (copy_dest_info >> 16) & 0x3F; @@ -541,12 +622,237 @@ bool VulkanCommandProcessor::IssueCopy() { uint32_t copy_mask = regs[XE_GPU_REG_RB_COPY_MASK].u32; assert_true(copy_mask == 0); + // Supported in GL4, not supported here yet. 
+ assert_zero(copy_dest_swap); + // RB_SURFACE_INFO // http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html uint32_t surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32; uint32_t surface_pitch = surface_info & 0x3FFF; auto surface_msaa = static_cast((surface_info >> 16) & 0x3); + // TODO(benvanik): any way to scissor this? a200 has: + // REG_A2XX_RB_COPY_DEST_OFFSET = A2XX_RB_COPY_DEST_OFFSET_X(tile->xoff) | + // A2XX_RB_COPY_DEST_OFFSET_Y(tile->yoff); + // but I can't seem to find something similar. + uint32_t dest_logical_width = copy_dest_pitch; + uint32_t dest_logical_height = copy_dest_height; + uint32_t dest_block_width = xe::round_up(dest_logical_width, 32); + uint32_t dest_block_height = xe::round_up(dest_logical_height, 32); + + uint32_t window_offset = regs[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32; + int16_t window_offset_x = window_offset & 0x7FFF; + int16_t window_offset_y = (window_offset >> 16) & 0x7FFF; + // Sign-extension + if (window_offset_x & 0x4000) { + window_offset_x |= 0x8000; + } + if (window_offset_y & 0x4000) { + window_offset_y |= 0x8000; + } + + // Adjust the copy base offset to point to the beginning of the texture, so + // we don't run into hiccups down the road (e.g. resolving the last part going + // backwards). + int32_t dest_offset = window_offset_y * copy_dest_pitch * 4; + dest_offset += window_offset_x * 32 * 4; + copy_dest_base += dest_offset; + + // HACK: vertices to use are always in vf0. + int copy_vertex_fetch_slot = 0; + int r = + XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (copy_vertex_fetch_slot / 3) * 6; + const auto group = reinterpret_cast(®s.values[r]); + const xe_gpu_vertex_fetch_t* fetch = nullptr; + switch (copy_vertex_fetch_slot % 3) { + case 0: + fetch = &group->vertex_fetch_0; + break; + case 1: + fetch = &group->vertex_fetch_1; + break; + case 2: + fetch = &group->vertex_fetch_2; + break; + } + assert_true(fetch->type == 3); + assert_true(fetch->endian == 2); + assert_true(fetch->size == 6); + const uint8_t* vertex_addr = memory_->TranslatePhysical(fetch->address << 2); + trace_writer_.WriteMemoryRead(fetch->address << 2, fetch->size * 4); + int32_t dest_min_x = int32_t((std::min( + std::min( + GpuSwap(xe::load(vertex_addr + 0), Endian(fetch->endian)), + GpuSwap(xe::load(vertex_addr + 8), Endian(fetch->endian))), + GpuSwap(xe::load(vertex_addr + 16), Endian(fetch->endian))))); + int32_t dest_max_x = int32_t((std::max( + std::max( + GpuSwap(xe::load(vertex_addr + 0), Endian(fetch->endian)), + GpuSwap(xe::load(vertex_addr + 8), Endian(fetch->endian))), + GpuSwap(xe::load(vertex_addr + 16), Endian(fetch->endian))))); + int32_t dest_min_y = int32_t((std::min( + std::min( + GpuSwap(xe::load(vertex_addr + 4), Endian(fetch->endian)), + GpuSwap(xe::load(vertex_addr + 12), Endian(fetch->endian))), + GpuSwap(xe::load(vertex_addr + 20), Endian(fetch->endian))))); + int32_t dest_max_y = int32_t((std::max( + std::max( + GpuSwap(xe::load(vertex_addr + 4), Endian(fetch->endian)), + GpuSwap(xe::load(vertex_addr + 12), Endian(fetch->endian))), + GpuSwap(xe::load(vertex_addr + 20), Endian(fetch->endian))))); + + uint32_t color_edram_base = 0; + uint32_t depth_edram_base = 0; + ColorRenderTargetFormat color_format; + DepthRenderTargetFormat depth_format; + if (copy_src_select <= 3) { + // Source from a color target. 
+ uint32_t color_info[4] = { + regs[XE_GPU_REG_RB_COLOR_INFO].u32, regs[XE_GPU_REG_RB_COLOR1_INFO].u32, + regs[XE_GPU_REG_RB_COLOR2_INFO].u32, + regs[XE_GPU_REG_RB_COLOR3_INFO].u32, + }; + color_edram_base = color_info[copy_src_select] & 0xFFF; + + color_format = static_cast( + (color_info[copy_src_select] >> 16) & 0xF); + } + + if (copy_src_select > 3 || depth_clear_enabled) { + // Source from a depth target. + uint32_t depth_info = regs[XE_GPU_REG_RB_DEPTH_INFO].u32; + depth_edram_base = depth_info & 0xFFF; + + depth_format = + static_cast((depth_info >> 16) & 0x1); + } + + // Demand a resolve texture from the texture cache. + TextureInfo tex_info = {}; + tex_info.guest_address = copy_dest_base; + tex_info.width = dest_logical_width - 1; + tex_info.height = dest_logical_height - 1; + tex_info.dimension = gpu::Dimension::k2D; + tex_info.input_length = copy_dest_pitch * copy_dest_height * 4; + tex_info.size_2d.logical_width = dest_logical_width; + tex_info.size_2d.logical_height = dest_logical_height; + tex_info.size_2d.block_width = dest_block_width; + tex_info.size_2d.block_height = dest_block_height; + tex_info.size_2d.input_width = dest_block_width; + tex_info.size_2d.input_height = dest_block_height; + tex_info.size_2d.input_pitch = copy_dest_pitch * 4; + auto texture = texture_cache_->DemandResolveTexture( + tex_info, ColorFormatToTextureFormat(copy_dest_format), nullptr, nullptr); + if (texture->image_layout == VK_IMAGE_LAYOUT_UNDEFINED) { + // Transition the image to a general layout. + VkImageMemoryBarrier image_barrier; + image_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + image_barrier.pNext = nullptr; + image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.srcAccessMask = 0; + image_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + image_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; + image_barrier.image = texture->image; + image_barrier.subresourceRange = {0, 0, 1, 0, 1}; + image_barrier.subresourceRange.aspectMask = + copy_src_select <= 3 + ? 
VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + texture->image_layout = VK_IMAGE_LAYOUT_GENERAL; + } + + // For debugging purposes only (trace viewer) + last_copy_base_ = texture->texture_info.guest_address; + + if (!current_command_buffer_) { + command_buffer_pool_->BeginBatch(); + current_command_buffer_ = command_buffer_pool_->AcquireEntry(); + current_setup_buffer_ = command_buffer_pool_->AcquireEntry(); + current_batch_fence_.reset(new ui::vulkan::Fence(*device_)); + + VkCommandBufferBeginInfo command_buffer_begin_info; + command_buffer_begin_info.sType = + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + command_buffer_begin_info.pNext = nullptr; + command_buffer_begin_info.flags = + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + command_buffer_begin_info.pInheritanceInfo = nullptr; + auto status = vkBeginCommandBuffer(current_command_buffer_, + &command_buffer_begin_info); + CheckResult(status, "vkBeginCommandBuffer"); + + status = + vkBeginCommandBuffer(current_setup_buffer_, &command_buffer_begin_info); + CheckResult(status, "vkBeginCommandBuffer"); + } else if (current_render_state_) { + render_cache_->EndRenderPass(); + current_render_state_ = nullptr; + } + auto command_buffer = current_command_buffer_; + + VkOffset3D resolve_offset = {dest_min_x, dest_min_y, 0}; + VkExtent3D resolve_extent = {uint32_t(dest_max_x - dest_min_x), + uint32_t(dest_max_y - dest_min_y), 1}; + + // Ask the render cache to copy to the resolve texture. + auto edram_base = copy_src_select <= 3 ? color_edram_base : depth_edram_base; + uint32_t src_format = copy_src_select <= 3 + ? static_cast(color_format) + : static_cast(depth_format); + switch (copy_command) { + case CopyCommand::kRaw: + render_cache_->RawCopyToImage(command_buffer, edram_base, texture->image, + texture->image_layout, copy_src_select <= 3, + resolve_offset, resolve_extent); + break; + case CopyCommand::kConvert: + render_cache_->BlitToImage( + command_buffer, edram_base, surface_pitch, resolve_extent.height, + texture->image, texture->image_layout, copy_src_select <= 3, + src_format, VK_FILTER_LINEAR, resolve_offset, resolve_extent); + break; + + case CopyCommand::kConstantOne: + case CopyCommand::kNull: + assert_always(); + break; + } + + // Perform any requested clears. + uint32_t copy_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32; + uint32_t copy_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32; + uint32_t copy_color_clear_low = regs[XE_GPU_REG_RB_COLOR_CLEAR_LOW].u32; + assert_true(copy_color_clear == copy_color_clear_low); + + if (color_clear_enabled) { + // If color clear is enabled, we can only clear a selected color target! + assert_true(copy_src_select <= 3); + + // TODO(benvanik): verify color order. + float color[] = {((copy_color_clear >> 0) & 0xFF) / 255.0f, + ((copy_color_clear >> 8) & 0xFF) / 255.0f, + ((copy_color_clear >> 16) & 0xFF) / 255.0f, + ((copy_color_clear >> 24) & 0xFF) / 255.0f}; + + // TODO(DrChat): Do we know the surface height at this point? + render_cache_->ClearEDRAMColor(command_buffer, color_edram_base, + color_format, surface_pitch, + resolve_extent.height, color); + } + + if (depth_clear_enabled) { + float depth = + (copy_depth_clear & 0xFFFFFF00) / static_cast(0xFFFFFF00); + uint8_t stencil = copy_depth_clear & 0xFF; + + // TODO(DrChat): Do we know the surface height at this point? 
+ render_cache_->ClearEDRAMDepthStencil( + command_buffer, depth_edram_base, depth_format, surface_pitch, + resolve_extent.height, depth, stencil); + } + return true; } diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index b45be07fb..c87c515c0 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -34,12 +34,14 @@ #include "xenia/ui/vulkan/fenced_pools.h" #include "xenia/ui/vulkan/vulkan_context.h" #include "xenia/ui/vulkan/vulkan_device.h" +#include "xenia/ui/vulkan/vulkan_util.h" namespace xe { namespace gpu { namespace vulkan { class VulkanGraphicsSystem; +class TextureCache; class VulkanCommandProcessor : public CommandProcessor { public: @@ -90,12 +92,20 @@ class VulkanCommandProcessor : public CommandProcessor { VkQueue queue_ = nullptr; std::mutex* queue_mutex_ = nullptr; + // Last copy base address, for debugging only. + uint32_t last_copy_base_ = 0; + std::unique_ptr buffer_cache_; std::unique_ptr pipeline_cache_; std::unique_ptr render_cache_; std::unique_ptr texture_cache_; std::unique_ptr command_buffer_pool_; + + const RenderState* current_render_state_ = nullptr; + VkCommandBuffer current_command_buffer_ = nullptr; + VkCommandBuffer current_setup_buffer_ = nullptr; + std::shared_ptr current_batch_fence_; }; } // namespace vulkan From 2bb40c122db784ce4fcedf47bdf13cd4cc7ef32f Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 25 Mar 2016 16:36:21 -0500 Subject: [PATCH 28/77] Vulkan util Fence class --- src/xenia/ui/vulkan/vulkan_util.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/xenia/ui/vulkan/vulkan_util.h b/src/xenia/ui/vulkan/vulkan_util.h index fcf9e4f8f..ca93c4c2d 100644 --- a/src/xenia/ui/vulkan/vulkan_util.h +++ b/src/xenia/ui/vulkan/vulkan_util.h @@ -25,6 +25,32 @@ namespace xe { namespace ui { namespace vulkan { +class Fence { + public: + Fence(VkDevice device) : device_(device) { + VkFenceCreateInfo fence_info; + fence_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fence_info.pNext = nullptr; + fence_info.flags = 0; + vkCreateFence(device, &fence_info, nullptr, &fence_); + } + ~Fence() { + vkDestroyFence(device_, fence_, nullptr); + fence_ = nullptr; + } + + VkResult status() const { + return vkGetFenceStatus(device_, fence_); + } + + VkFence fence() const { return fence_; } + operator VkFence() const { return fence_; } + + private: + VkDevice device_; + VkFence fence_ = nullptr; +}; + struct Version { uint32_t major; uint32_t minor; From a5a31cf12371cdcae9456fe4b04e370509439708 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 25 Mar 2016 16:37:24 -0500 Subject: [PATCH 29/77] VulkanShader::Prepare - return false if vkCreateShaderModule failed. 
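Previously a vkCreateShaderModule failure was only logged through CheckResult
and Prepare still reported success; now the VkResult is propagated to the
caller. A minimal sketch of the guard a call site can now rely on (the
surrounding names are illustrative, not the actual loader code):

    auto shader = static_cast<VulkanShader*>(LoadShader(...));
    if (!shader->Prepare()) {
      // vkCreateShaderModule failed; treat the shader as unusable.
      return false;
    }
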
--- src/xenia/gpu/vulkan/vulkan_shader.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/xenia/gpu/vulkan/vulkan_shader.cc b/src/xenia/gpu/vulkan/vulkan_shader.cc index b3c72abf3..c18341a71 100644 --- a/src/xenia/gpu/vulkan/vulkan_shader.cc +++ b/src/xenia/gpu/vulkan/vulkan_shader.cc @@ -44,11 +44,11 @@ bool VulkanShader::Prepare() { shader_info.codeSize = translated_binary_.size(); shader_info.pCode = reinterpret_cast(translated_binary_.data()); - auto err = + auto status = vkCreateShaderModule(device_, &shader_info, nullptr, &shader_module_); - CheckResult(err, "vkCreateShaderModule"); + CheckResult(status, "vkCreateShaderModule"); - return true; + return status == VK_SUCCESS; } } // namespace vulkan From d7599c817f4453652206ab799d9eefc1260d0679 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 25 Mar 2016 16:44:25 -0500 Subject: [PATCH 30/77] Formatting. --- src/xenia/ui/vulkan/vulkan_util.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/xenia/ui/vulkan/vulkan_util.h b/src/xenia/ui/vulkan/vulkan_util.h index ca93c4c2d..f5475edd8 100644 --- a/src/xenia/ui/vulkan/vulkan_util.h +++ b/src/xenia/ui/vulkan/vulkan_util.h @@ -39,9 +39,7 @@ class Fence { fence_ = nullptr; } - VkResult status() const { - return vkGetFenceStatus(device_, fence_); - } + VkResult status() const { return vkGetFenceStatus(device_, fence_); } VkFence fence() const { return fence_; } operator VkFence() const { return fence_; } From 0e44cda961d76d5cb004ccd1ccff59e9850d4386 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 25 Mar 2016 16:49:41 -0500 Subject: [PATCH 31/77] Update the rectangle list shader --- .../gpu/vulkan/shaders/bin/rect_list_geom.h | 587 +++++++++--------- .../gpu/vulkan/shaders/bin/rect_list_geom.txt | 429 +++++++------ src/xenia/gpu/vulkan/shaders/rect_list.geom | 31 +- 3 files changed, 513 insertions(+), 534 deletions(-) diff --git a/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.h b/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.h index b9598cfa9..730f9f12e 100644 --- a/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.h +++ b/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.h @@ -2,7 +2,7 @@ // source: rect_list.geom const uint8_t rect_list_geom[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x08, 0x00, - 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x18, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x20, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x36, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, @@ -10,8 +10,8 @@ const uint8_t rect_list_geom[] = { 0x00, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x09, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6D, 0x61, 0x69, 0x6E, 0x00, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x10, 0x00, 0x03, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x33, 0x00, 0x00, 0x00, 0x10, 0x00, 0x03, 0x00, 0x04, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x10, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x03, 0x00, 0x04, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x10, 0x00, 0x04, 0x00, @@ -40,17 +40,13 @@ const uint8_t rect_list_geom[] = { 0x20, 0x00, 0x00, 0x00, 0x02, 0x00, 
0x00, 0x00, 0x67, 0x6C, 0x5F, 0x43, 0x6C, 0x69, 0x70, 0x44, 0x69, 0x73, 0x74, 0x61, 0x6E, 0x63, 0x65, 0x00, 0x05, 0x00, 0x03, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x05, 0x00, 0x2F, 0x00, 0x00, 0x00, 0x56, 0x65, 0x72, 0x74, - 0x65, 0x78, 0x44, 0x61, 0x74, 0x61, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, - 0x2F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x04, 0x00, 0x31, 0x00, 0x00, 0x00, 0x6F, 0x75, 0x74, 0x5F, - 0x76, 0x74, 0x78, 0x00, 0x05, 0x00, 0x05, 0x00, 0x32, 0x00, 0x00, 0x00, - 0x56, 0x65, 0x72, 0x74, 0x65, 0x78, 0x44, 0x61, 0x74, 0x61, 0x00, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x6F, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x69, 0x6E, 0x5F, 0x76, 0x74, 0x78, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00, - 0x66, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00, - 0xB4, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x05, 0x00, 0x07, 0x00, 0x30, 0x00, 0x00, 0x00, 0x6F, 0x75, 0x74, 0x5F, + 0x69, 0x6E, 0x74, 0x65, 0x72, 0x70, 0x6F, 0x6C, 0x61, 0x74, 0x6F, 0x72, + 0x73, 0x00, 0x00, 0x00, 0x05, 0x00, 0x07, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x69, 0x6E, 0x5F, 0x69, 0x6E, 0x74, 0x65, 0x72, 0x70, 0x6F, 0x6C, 0x61, + 0x74, 0x6F, 0x72, 0x73, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00, + 0x64, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00, + 0xB2, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, @@ -65,12 +61,10 @@ const uint8_t rect_list_geom[] = { 0x20, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x22, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x2F, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x2F, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x31, 0x00, 0x00, 0x00, - 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x33, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, @@ -107,25 +101,23 @@ const uint8_t rect_list_geom[] = { 0x03, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x04, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x2D, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x1C, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0x2D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x03, 0x00, 0x2F, 0x00, 0x00, 0x00, - 0x2E, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x30, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x2F, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, - 0x30, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x1E, 0x00, 0x03, 0x00, 
0x32, 0x00, 0x00, 0x00, 0x2E, 0x00, 0x00, 0x00, - 0x1C, 0x00, 0x04, 0x00, 0x33, 0x00, 0x00, 0x00, 0x32, 0x00, 0x00, 0x00, - 0x0F, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, - 0x34, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x65, 0x00, 0x00, 0x00, + 0x2D, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x2F, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, + 0x2F, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x1C, 0x00, 0x04, 0x00, 0x31, 0x00, 0x00, 0x00, 0x2E, 0x00, 0x00, 0x00, + 0x0F, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, + 0x32, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2E, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x63, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x04, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x6B, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, - 0x65, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x3B, 0x00, 0x04, 0x00, 0x65, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x3B, 0x00, 0x04, 0x00, 0x63, 0x00, 0x00, 0x00, 0xB2, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x41, 0x00, 0x07, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, @@ -139,7 +131,7 @@ const uint8_t rect_list_geom[] = { 0x1C, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xF7, 0x00, 0x03, 0x00, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0x00, 0x04, 0x00, - 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x7F, 0x00, 0x00, 0x00, + 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x7D, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, @@ -153,286 +145,283 @@ const uint8_t rect_list_geom[] = { 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x2C, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x2C, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x36, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, - 0x38, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x31, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x39, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 
0x00, 0x3D, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x00, 0x00, - 0x39, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, - 0x3B, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x3B, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, + 0x37, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x39, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x39, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x00, 0x00, - 0x3C, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x3E, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x00, 0x00, + 0x3A, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x34, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x00, 0x00, + 0x33, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x2E, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x30, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x3F, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0x42, 0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x43, 0x00, 0x00, 0x00, - 0x42, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x44, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x45, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x2B, 0x00, 0x00, 0x00, 0x46, 
0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x46, 0x00, 0x00, 0x00, - 0x45, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0xDB, 0x00, 0x01, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x2B, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x46, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0xDB, 0x00, 0x01, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x4A, 0x00, 0x00, 0x00, - 0x49, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, - 0x4B, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x4B, 0x00, 0x00, 0x00, 0x4A, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0x4C, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x49, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0x4A, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x4D, 0x00, 0x00, 0x00, - 0x4C, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, - 0x4E, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x4E, 0x00, 0x00, 0x00, 0x4D, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x4B, 0x00, 0x00, 0x00, + 0x4A, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, + 0x4C, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x4C, 0x00, 0x00, 0x00, 0x4B, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x34, 0x00, 0x00, 0x00, 0x4D, 0x00, 0x00, 0x00, + 0x33, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x2E, 0x00, 0x00, 0x00, 0x4E, 0x00, 0x00, 0x00, 0x4D, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x30, 0x00, 0x00, 0x00, 0x4E, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x4F, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 
0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0x52, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x53, 0x00, 0x00, 0x00, - 0x52, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x54, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x51, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x2B, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x56, 0x00, 0x00, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, - 0x58, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x53, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x2B, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00, + 0x53, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x56, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0x5A, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x5B, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x5C, 0x00, 0x00, 0x00, - 0x5B, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0x5D, 0x00, 0x00, 0x00, 0x5A, 0x00, 0x00, 0x00, 0x5C, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x5E, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x5A, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0x5B, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, 0x5A, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x5C, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x00, - 0x5E, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0x60, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 
0x00, 0x61, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00, 0x00, + 0x5C, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0x5E, 0x00, 0x00, 0x00, 0x5B, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x61, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x16, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x5F, 0x00, 0x00, 0x00, 0x5E, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x64, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x66, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, - 0x67, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x67, 0x00, 0x00, 0x00, - 0xF6, 0x00, 0x04, 0x00, 0x69, 0x00, 0x00, 0x00, 0x6A, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, 0x6B, 0x00, 0x00, 0x00, - 0xF8, 0x00, 0x02, 0x00, 0x6B, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, - 0xB1, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6E, 0x00, 0x00, 0x00, - 0x6C, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00, 0xFA, 0x00, 0x04, 0x00, - 0x6E, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, + 0x62, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0x64, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, + 0x65, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x65, 0x00, 0x00, 0x00, + 0xF6, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, 0x69, 0x00, 0x00, 0x00, + 0xF8, 0x00, 0x02, 0x00, 0x69, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x6A, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, + 0xB1, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, + 0x6A, 0x00, 0x00, 0x00, 0x6B, 0x00, 0x00, 0x00, 0xFA, 0x00, 0x04, 0x00, + 0x6C, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0xF8, 0x00, 0x02, 0x00, 0x66, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x6E, 0x00, 0x00, 0x00, + 0x64, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x6F, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x6E, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x7F, 0x00, 0x04, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, + 0x64, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x72, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, 
0x81, 0x00, 0x05, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x75, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x7A, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x6D, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x7A, 0x00, 0x00, 0x00, + 0x79, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, - 0x66, 0x00, 0x00, 0x00, 0x41, 0x00, 0x07, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x71, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, - 0x7F, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, - 0x72, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, - 0x74, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x41, 0x00, 0x07, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, - 0x75, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, - 0x66, 0x00, 0x00, 0x00, 0x41, 0x00, 0x07, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x79, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0x7A, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, - 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x7B, 0x00, 0x00, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x7A, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x7C, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x7C, 0x00, 0x00, 0x00, 0x7B, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, - 0x6A, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x6A, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x7D, 0x00, 0x00, 0x00, - 0x66, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x13, 0x00, 0x00, 0x00, - 0x7E, 0x00, 0x00, 0x00, 0x7D, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x66, 0x00, 0x00, 0x00, 0x7E, 0x00, 0x00, 0x00, - 0xF9, 0x00, 0x02, 0x00, 0x67, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, - 0x69, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0xDB, 0x00, 0x01, 0x00, - 0xF9, 0x00, 0x02, 0x00, 0x1F, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, - 0x7F, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x80, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0x81, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 
0x41, 0x00, 0x05, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x82, 0x00, 0x00, 0x00, - 0x81, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x83, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x2B, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x85, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, - 0x86, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, - 0x86, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, - 0x87, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x8A, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x16, 0x00, 0x00, 0x00, 0x8B, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x8C, 0x00, 0x00, 0x00, 0x8B, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x8D, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x8D, 0x00, 0x00, 0x00, 0x8C, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x36, 0x00, 0x00, 0x00, 0x8E, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, - 0x8F, 0x00, 0x00, 0x00, 0x8E, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x31, 0x00, 0x00, 0x00, 0x8F, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x91, 0x00, 0x00, 0x00, - 0x90, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, - 0x92, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x92, 0x00, 0x00, 0x00, 0x91, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, - 0x93, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, - 0x95, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x95, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, - 0xDA, 0x00, 0x01, 0x00, 0xDB, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 
0x3D, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, 0x9A, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x9A, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x16, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x9C, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x9D, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x9D, 0x00, 0x00, 0x00, 0x9C, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x36, 0x00, 0x00, 0x00, 0x9E, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, - 0x9F, 0x00, 0x00, 0x00, 0x9E, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x31, 0x00, 0x00, 0x00, 0x9F, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0xA0, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x00, - 0xA0, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, - 0xA2, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0xA2, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0xA3, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0xA4, 0x00, 0x00, 0x00, - 0xA3, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, - 0xA5, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0xA5, 0x00, 0x00, 0x00, 0xA4, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x32, 0x00, 0x00, 0x00, 0xA7, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, 0xA7, 0x00, 0x00, 0x00, - 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, - 0xA8, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0xA9, 0x00, 0x00, 0x00, 0xA8, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x23, 0x00, 0x00, 0x00, 0xAA, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0xAB, 0x00, 0x00, 0x00, 0xAA, 0x00, 0x00, 0x00, - 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, 0xAC, 0x00, 0x00, 0x00, - 0xA9, 0x00, 0x00, 0x00, 0xAB, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x23, 0x00, 0x00, 0x00, 0xAD, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0xAE, 0x00, 0x00, 0x00, 0xAD, 0x00, 0x00, 0x00, - 0x83, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, 0xAF, 0x00, 0x00, 0x00, - 0xAC, 0x00, 0x00, 0x00, 0xAE, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x26, 0x00, 0x00, 0x00, 0xB0, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0xB0, 0x00, 0x00, 0x00, - 0xAF, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, - 0xB1, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 
0x19, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, - 0xB2, 0x00, 0x00, 0x00, 0xB1, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x2B, 0x00, 0x00, 0x00, 0xB3, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0xB3, 0x00, 0x00, 0x00, - 0xB2, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0xB4, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, 0xB5, 0x00, 0x00, 0x00, - 0xF8, 0x00, 0x02, 0x00, 0xB5, 0x00, 0x00, 0x00, 0xF6, 0x00, 0x04, 0x00, - 0xB7, 0x00, 0x00, 0x00, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xF9, 0x00, 0x02, 0x00, 0xB9, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, - 0xB9, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, - 0xBA, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, 0xB1, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0xBB, 0x00, 0x00, 0x00, 0xBA, 0x00, 0x00, 0x00, - 0x6D, 0x00, 0x00, 0x00, 0xFA, 0x00, 0x04, 0x00, 0xBB, 0x00, 0x00, 0x00, - 0xB6, 0x00, 0x00, 0x00, 0xB7, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, - 0xB6, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, - 0xBC, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x13, 0x00, 0x00, 0x00, 0xBD, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x07, 0x00, 0x23, 0x00, 0x00, 0x00, 0xBE, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0xBD, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0xBF, 0x00, 0x00, 0x00, 0xBE, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x13, 0x00, 0x00, 0x00, 0xC0, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x07, 0x00, 0x23, 0x00, 0x00, 0x00, 0xC1, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0xC0, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0xC2, 0x00, 0x00, 0x00, 0xC1, 0x00, 0x00, 0x00, 0x7F, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0xC3, 0x00, 0x00, 0x00, 0xC2, 0x00, 0x00, 0x00, - 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, 0xC4, 0x00, 0x00, 0x00, - 0xBF, 0x00, 0x00, 0x00, 0xC3, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x13, 0x00, 0x00, 0x00, 0xC5, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x07, 0x00, 0x23, 0x00, 0x00, 0x00, 0xC6, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0xC5, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0xC7, 0x00, 0x00, 0x00, 0xC6, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC4, 0x00, 0x00, 0x00, - 0xC7, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x26, 0x00, 0x00, 0x00, - 0xC9, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0xBC, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0xC9, 0x00, 0x00, 0x00, - 0xC8, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, 0xB8, 0x00, 0x00, 0x00, - 0xF8, 0x00, 0x02, 0x00, 0xB8, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x13, 0x00, 0x00, 0x00, 0xCA, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, - 0x80, 0x00, 0x05, 0x00, 0x13, 0x00, 0x00, 0x00, 0xCB, 0x00, 0x00, 0x00, - 0xCA, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0xB4, 0x00, 0x00, 0x00, 0xCB, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, - 0xB5, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0xB7, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x7B, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x13, 0x00, 0x00, 0x00, 0x7C, 0x00, 0x00, 0x00, + 0x7B, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 
0x3E, 0x00, 0x03, 0x00, + 0x64, 0x00, 0x00, 0x00, 0x7C, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, + 0x65, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x67, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0xDB, 0x00, 0x01, 0x00, 0xF9, 0x00, 0x02, 0x00, - 0x1F, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x1F, 0x00, 0x00, 0x00, - 0xFD, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, + 0x1F, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x7D, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x7E, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x7F, 0x00, 0x00, 0x00, + 0x7E, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x80, 0x00, 0x00, 0x00, 0x7F, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x83, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x34, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, + 0x33, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x2E, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x30, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, + 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x86, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0x87, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x87, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x8A, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x2B, 0x00, 0x00, 0x00, 0x8B, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x8B, 0x00, 0x00, 0x00, + 0x8A, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x8C, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x8D, 0x00, 0x00, 0x00, + 0x8C, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x8D, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x8E, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x8F, 0x00, 0x00, 0x00, 0x8E, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0x90, 0x00, 0x00, 0x00, 0x8F, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x91, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x92, 0x00, 0x00, 0x00, 0x91, 0x00, 0x00, 0x00, + 
0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0x93, 0x00, 0x00, 0x00, 0x92, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, + 0x95, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, + 0xDB, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x98, 0x00, 0x00, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x9A, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x2B, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x9B, 0x00, 0x00, 0x00, + 0x9A, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x9C, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x9D, 0x00, 0x00, 0x00, + 0x9C, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x9D, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x9E, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x9F, 0x00, 0x00, 0x00, 0x9E, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, 0xA0, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0xA0, 0x00, 0x00, 0x00, 0x9F, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x16, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x09, 0x00, 0x00, 0x00, 0xA2, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, 0xA3, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0xA3, 0x00, 0x00, 0x00, 0xA2, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x34, 0x00, 0x00, 0x00, 0xA4, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, + 0xA5, 0x00, 0x00, 0x00, 0xA4, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0x30, 0x00, 0x00, 0x00, 0xA5, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0xA7, 0x00, 0x00, 0x00, + 0xA6, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xA8, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0xA9, 0x00, 0x00, 0x00, 0xA8, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0xAA, 0x00, 0x00, 0x00, 0xA7, 0x00, 0x00, 0x00, + 
0xA9, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xAB, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0xAC, 0x00, 0x00, 0x00, 0xAB, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0xAD, 0x00, 0x00, 0x00, 0xAA, 0x00, 0x00, 0x00, + 0xAC, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, + 0xAE, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0xAE, 0x00, 0x00, 0x00, 0xAD, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0xAF, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0xB0, 0x00, 0x00, 0x00, + 0xAF, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, + 0xB1, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0xB1, 0x00, 0x00, 0x00, 0xB0, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0xB2, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0xF9, 0x00, 0x02, 0x00, 0xB3, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, + 0xB3, 0x00, 0x00, 0x00, 0xF6, 0x00, 0x04, 0x00, 0xB5, 0x00, 0x00, 0x00, + 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, + 0xB7, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0xB7, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0xB8, 0x00, 0x00, 0x00, + 0xB2, 0x00, 0x00, 0x00, 0xB1, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xB9, 0x00, 0x00, 0x00, 0xB8, 0x00, 0x00, 0x00, 0x6B, 0x00, 0x00, 0x00, + 0xFA, 0x00, 0x04, 0x00, 0xB9, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, + 0xB5, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0xB4, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0xBA, 0x00, 0x00, 0x00, + 0xB2, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, + 0xBB, 0x00, 0x00, 0x00, 0xB2, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x23, 0x00, 0x00, 0x00, 0xBC, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0xBB, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0xBD, 0x00, 0x00, 0x00, 0xBC, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0xBE, 0x00, 0x00, 0x00, + 0xB2, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xBF, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0xBE, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0xC0, 0x00, 0x00, 0x00, 0xBF, 0x00, 0x00, 0x00, 0x7F, 0x00, 0x04, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0xC1, 0x00, 0x00, 0x00, 0xC0, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, 0xC2, 0x00, 0x00, 0x00, + 0xBD, 0x00, 0x00, 0x00, 0xC1, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x13, 0x00, 0x00, 0x00, 0xC3, 0x00, 0x00, 0x00, 0xB2, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0xC4, 0x00, 0x00, 0x00, + 0x33, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0xC3, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0xC5, 0x00, 0x00, 0x00, + 0xC4, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0xC6, 0x00, 0x00, 0x00, 0xC2, 0x00, 0x00, 0x00, 0xC5, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, 0xC7, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0xBA, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0xC7, 0x00, 0x00, 0x00, 0xC6, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, + 0xB6, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0xB6, 0x00, 0x00, 0x00, + 
0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, + 0xB2, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x13, 0x00, 0x00, 0x00, + 0xC9, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0xB2, 0x00, 0x00, 0x00, 0xC9, 0x00, 0x00, 0x00, + 0xF9, 0x00, 0x02, 0x00, 0xB3, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, + 0xB5, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0xDB, 0x00, 0x01, 0x00, + 0xF9, 0x00, 0x02, 0x00, 0x1F, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, + 0x1F, 0x00, 0x00, 0x00, 0xFD, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, }; diff --git a/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.txt b/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.txt index b047926f5..94fb6a700 100644 --- a/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.txt +++ b/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.txt @@ -1,7 +1,7 @@ ; SPIR-V ; Version: 1.0 ; Generator: Khronos Glslang Reference Front End; 1 -; Bound: 204 +; Bound: 202 ; Schema: 0 OpCapability Geometry OpCapability GeometryPointSize @@ -9,7 +9,7 @@ OpCapability GeometryStreams %1 = OpExtInstImport "GLSL.std.450" OpMemoryModel Logical GLSL450 - OpEntryPoint Geometry %4 "main" %18 %34 %49 %53 + OpEntryPoint Geometry %4 "main" %18 %34 %48 %51 OpExecutionMode %4 Triangles OpExecutionMode %4 Invocations 1 OpExecutionMode %4 OutputTriangleStrip @@ -27,14 +27,10 @@ OpMemberName %32 1 "gl_PointSize" OpMemberName %32 2 "gl_ClipDistance" OpName %34 "" - OpName %47 "VertexData" - OpMemberName %47 0 "o" - OpName %49 "out_vtx" - OpName %50 "VertexData" - OpMemberName %50 0 "o" - OpName %53 "in_vtx" - OpName %102 "i" - OpName %180 "i" + OpName %48 "out_interpolators" + OpName %51 "in_interpolators" + OpName %100 "i" + OpName %178 "i" OpMemberDecorate %14 0 BuiltIn Position OpMemberDecorate %14 1 BuiltIn PointSize OpMemberDecorate %14 2 BuiltIn ClipDistance @@ -45,10 +41,9 @@ OpDecorate %32 Block OpDecorate %32 Stream 0 OpDecorate %34 Stream 0 - OpMemberDecorate %47 0 Location 0 - OpDecorate %47 Stream 0 - OpDecorate %49 Stream 0 - OpMemberDecorate %50 0 Location 0 + OpDecorate %48 Location 0 + OpDecorate %48 Stream 0 + OpDecorate %51 Location 0 %2 = OpTypeVoid %3 = OpTypeFunction %2 %6 = OpTypeBool @@ -77,21 +72,19 @@ %43 = OpTypePointer Output %9 %45 = OpConstant %11 16 %46 = OpTypeArray %10 %45 - %47 = OpTypeStruct %46 - %48 = OpTypePointer Output %47 - %49 = OpVariable %48 Output - %50 = OpTypeStruct %46 - %51 = OpTypeArray %50 %15 - %52 = OpTypePointer Input %51 - %53 = OpVariable %52 Input - %54 = OpTypePointer Input %50 - %101 = OpTypePointer Function %19 - %109 = OpConstant %19 16 + %47 = OpTypePointer Output %46 + %48 = OpVariable %47 Output + %49 = OpTypeArray %46 %15 + %50 = OpTypePointer Input %49 + %51 = OpVariable %50 Input + %52 = OpTypePointer Input %46 + %99 = OpTypePointer Function %19 + %107 = OpConstant %19 16 %4 = OpFunction %2 None %3 %5 = OpLabel %8 = OpVariable %7 Function - %102 = OpVariable %101 Function - %180 = OpVariable %101 Function + %100 = OpVariable %99 Function + %178 = OpVariable %99 Function %23 = OpAccessChain %22 %18 %20 %20 %21 %24 = OpLoad %9 %23 %26 = OpAccessChain %22 %18 %25 %20 %21 @@ -100,7 +93,7 @@ OpStore %8 %28 %29 = OpLoad %6 %8 OpSelectionMerge %31 None - OpBranchConditional %29 %30 %127 + OpBranchConditional %29 %30 %125 %30 = OpLabel %36 = OpAccessChain %35 %18 %20 %20 %37 = OpLoad %10 %36 @@ -110,216 +103,216 @@ %42 = OpLoad %9 %41 %44 = OpAccessChain %43 %34 %40 OpStore %44 %42 - %55 = OpAccessChain %54 %53 %20 - %56 = OpLoad %50 %55 - OpStore %49 %56 
+ %53 = OpAccessChain %52 %51 %20 + %54 = OpLoad %46 %53 + OpStore %48 %54 OpEmitVertex - %57 = OpAccessChain %35 %18 %40 %20 - %58 = OpLoad %10 %57 - %59 = OpAccessChain %38 %34 %20 - OpStore %59 %58 - %60 = OpAccessChain %22 %18 %40 %40 - %61 = OpLoad %9 %60 - %62 = OpAccessChain %43 %34 %40 - OpStore %62 %61 - %63 = OpAccessChain %54 %53 %40 - %64 = OpLoad %50 %63 - OpStore %49 %64 + %55 = OpAccessChain %35 %18 %40 %20 + %56 = OpLoad %10 %55 + %57 = OpAccessChain %38 %34 %20 + OpStore %57 %56 + %58 = OpAccessChain %22 %18 %40 %40 + %59 = OpLoad %9 %58 + %60 = OpAccessChain %43 %34 %40 + OpStore %60 %59 + %61 = OpAccessChain %52 %51 %40 + %62 = OpLoad %46 %61 + OpStore %48 %62 OpEmitVertex - %65 = OpAccessChain %35 %18 %25 %20 - %66 = OpLoad %10 %65 - %67 = OpAccessChain %38 %34 %20 - OpStore %67 %66 - %68 = OpAccessChain %22 %18 %25 %40 - %69 = OpLoad %9 %68 - %70 = OpAccessChain %43 %34 %40 - OpStore %70 %69 - %71 = OpAccessChain %54 %53 %25 - %72 = OpLoad %50 %71 - OpStore %49 %72 + %63 = OpAccessChain %35 %18 %25 %20 + %64 = OpLoad %10 %63 + %65 = OpAccessChain %38 %34 %20 + OpStore %65 %64 + %66 = OpAccessChain %22 %18 %25 %40 + %67 = OpLoad %9 %66 + %68 = OpAccessChain %43 %34 %40 + OpStore %68 %67 + %69 = OpAccessChain %52 %51 %25 + %70 = OpLoad %46 %69 + OpStore %48 %70 OpEmitVertex OpEndPrimitive - %73 = OpAccessChain %35 %18 %25 %20 - %74 = OpLoad %10 %73 - %75 = OpAccessChain %38 %34 %20 - OpStore %75 %74 - %76 = OpAccessChain %22 %18 %25 %40 - %77 = OpLoad %9 %76 - %78 = OpAccessChain %43 %34 %40 - OpStore %78 %77 - %79 = OpAccessChain %54 %53 %25 - %80 = OpLoad %50 %79 - OpStore %49 %80 + %71 = OpAccessChain %35 %18 %25 %20 + %72 = OpLoad %10 %71 + %73 = OpAccessChain %38 %34 %20 + OpStore %73 %72 + %74 = OpAccessChain %22 %18 %25 %40 + %75 = OpLoad %9 %74 + %76 = OpAccessChain %43 %34 %40 + OpStore %76 %75 + %77 = OpAccessChain %52 %51 %25 + %78 = OpLoad %46 %77 + OpStore %48 %78 OpEmitVertex - %81 = OpAccessChain %35 %18 %40 %20 - %82 = OpLoad %10 %81 - %83 = OpAccessChain %38 %34 %20 - OpStore %83 %82 - %84 = OpAccessChain %22 %18 %40 %40 - %85 = OpLoad %9 %84 - %86 = OpAccessChain %43 %34 %40 - OpStore %86 %85 - %87 = OpAccessChain %54 %53 %40 - %88 = OpLoad %50 %87 - OpStore %49 %88 + %79 = OpAccessChain %35 %18 %40 %20 + %80 = OpLoad %10 %79 + %81 = OpAccessChain %38 %34 %20 + OpStore %81 %80 + %82 = OpAccessChain %22 %18 %40 %40 + %83 = OpLoad %9 %82 + %84 = OpAccessChain %43 %34 %40 + OpStore %84 %83 + %85 = OpAccessChain %52 %51 %40 + %86 = OpLoad %46 %85 + OpStore %48 %86 OpEmitVertex - %89 = OpAccessChain %35 %18 %40 %20 + %87 = OpAccessChain %35 %18 %40 %20 + %88 = OpLoad %10 %87 + %89 = OpAccessChain %35 %18 %25 %20 %90 = OpLoad %10 %89 - %91 = OpAccessChain %35 %18 %25 %20 - %92 = OpLoad %10 %91 - %93 = OpFAdd %10 %90 %92 - %94 = OpAccessChain %35 %18 %20 %20 - %95 = OpLoad %10 %94 - %96 = OpFSub %10 %93 %95 - %97 = OpAccessChain %38 %34 %20 - OpStore %97 %96 - %98 = OpAccessChain %22 %18 %25 %40 - %99 = OpLoad %9 %98 - %100 = OpAccessChain %43 %34 %40 - OpStore %100 %99 - OpStore %102 %20 - OpBranch %103 - %103 = OpLabel - OpLoopMerge %105 %106 None - OpBranch %107 - %107 = OpLabel - %108 = OpLoad %19 %102 - %110 = OpSLessThan %6 %108 %109 - OpBranchConditional %110 %104 %105 - %104 = OpLabel - %111 = OpLoad %19 %102 - %112 = OpLoad %19 %102 - %113 = OpAccessChain %35 %53 %20 %20 %112 - %114 = OpLoad %10 %113 - %115 = OpFNegate %10 %114 - %116 = OpLoad %19 %102 - %117 = OpAccessChain %35 %53 %40 %20 %116 - %118 = OpLoad %10 %117 - %119 = OpFAdd %10 %115 %118 - 
%120 = OpLoad %19 %102 - %121 = OpAccessChain %35 %53 %25 %20 %120 - %122 = OpLoad %10 %121 - %123 = OpFAdd %10 %119 %122 - %124 = OpAccessChain %38 %49 %20 %111 - OpStore %124 %123 - OpBranch %106 - %106 = OpLabel - %125 = OpLoad %19 %102 - %126 = OpIAdd %19 %125 %40 - OpStore %102 %126 - OpBranch %103 + %91 = OpFAdd %10 %88 %90 + %92 = OpAccessChain %35 %18 %20 %20 + %93 = OpLoad %10 %92 + %94 = OpFSub %10 %91 %93 + %95 = OpAccessChain %38 %34 %20 + OpStore %95 %94 + %96 = OpAccessChain %22 %18 %25 %40 + %97 = OpLoad %9 %96 + %98 = OpAccessChain %43 %34 %40 + OpStore %98 %97 + OpStore %100 %20 + OpBranch %101 + %101 = OpLabel + OpLoopMerge %103 %104 None + OpBranch %105 %105 = OpLabel + %106 = OpLoad %19 %100 + %108 = OpSLessThan %6 %106 %107 + OpBranchConditional %108 %102 %103 + %102 = OpLabel + %109 = OpLoad %19 %100 + %110 = OpLoad %19 %100 + %111 = OpAccessChain %35 %51 %20 %110 + %112 = OpLoad %10 %111 + %113 = OpFNegate %10 %112 + %114 = OpLoad %19 %100 + %115 = OpAccessChain %35 %51 %40 %114 + %116 = OpLoad %10 %115 + %117 = OpFAdd %10 %113 %116 + %118 = OpLoad %19 %100 + %119 = OpAccessChain %35 %51 %25 %118 + %120 = OpLoad %10 %119 + %121 = OpFAdd %10 %117 %120 + %122 = OpAccessChain %38 %48 %109 + OpStore %122 %121 + OpBranch %104 + %104 = OpLabel + %123 = OpLoad %19 %100 + %124 = OpIAdd %19 %123 %40 + OpStore %100 %124 + OpBranch %101 + %103 = OpLabel OpEmitVertex OpEndPrimitive OpBranch %31 - %127 = OpLabel - %128 = OpAccessChain %35 %18 %20 %20 - %129 = OpLoad %10 %128 - %130 = OpAccessChain %38 %34 %20 - OpStore %130 %129 - %131 = OpAccessChain %22 %18 %20 %40 - %132 = OpLoad %9 %131 - %133 = OpAccessChain %43 %34 %40 - OpStore %133 %132 - %134 = OpAccessChain %54 %53 %20 - %135 = OpLoad %50 %134 - OpStore %49 %135 + %125 = OpLabel + %126 = OpAccessChain %35 %18 %20 %20 + %127 = OpLoad %10 %126 + %128 = OpAccessChain %38 %34 %20 + OpStore %128 %127 + %129 = OpAccessChain %22 %18 %20 %40 + %130 = OpLoad %9 %129 + %131 = OpAccessChain %43 %34 %40 + OpStore %131 %130 + %132 = OpAccessChain %52 %51 %20 + %133 = OpLoad %46 %132 + OpStore %48 %133 OpEmitVertex - %136 = OpAccessChain %35 %18 %40 %20 - %137 = OpLoad %10 %136 - %138 = OpAccessChain %38 %34 %20 - OpStore %138 %137 - %139 = OpAccessChain %22 %18 %40 %40 - %140 = OpLoad %9 %139 - %141 = OpAccessChain %43 %34 %40 - OpStore %141 %140 - %142 = OpAccessChain %54 %53 %40 - %143 = OpLoad %50 %142 - OpStore %49 %143 + %134 = OpAccessChain %35 %18 %40 %20 + %135 = OpLoad %10 %134 + %136 = OpAccessChain %38 %34 %20 + OpStore %136 %135 + %137 = OpAccessChain %22 %18 %40 %40 + %138 = OpLoad %9 %137 + %139 = OpAccessChain %43 %34 %40 + OpStore %139 %138 + %140 = OpAccessChain %52 %51 %40 + %141 = OpLoad %46 %140 + OpStore %48 %141 OpEmitVertex - %144 = OpAccessChain %35 %18 %25 %20 - %145 = OpLoad %10 %144 - %146 = OpAccessChain %38 %34 %20 - OpStore %146 %145 - %147 = OpAccessChain %22 %18 %25 %40 - %148 = OpLoad %9 %147 - %149 = OpAccessChain %43 %34 %40 - OpStore %149 %148 - %150 = OpAccessChain %54 %53 %25 - %151 = OpLoad %50 %150 - OpStore %49 %151 + %142 = OpAccessChain %35 %18 %25 %20 + %143 = OpLoad %10 %142 + %144 = OpAccessChain %38 %34 %20 + OpStore %144 %143 + %145 = OpAccessChain %22 %18 %25 %40 + %146 = OpLoad %9 %145 + %147 = OpAccessChain %43 %34 %40 + OpStore %147 %146 + %148 = OpAccessChain %52 %51 %25 + %149 = OpLoad %46 %148 + OpStore %48 %149 OpEmitVertex OpEndPrimitive - %152 = OpAccessChain %35 %18 %20 %20 - %153 = OpLoad %10 %152 - %154 = OpAccessChain %38 %34 %20 - OpStore %154 %153 - %155 = OpAccessChain 
%22 %18 %20 %40 - %156 = OpLoad %9 %155 - %157 = OpAccessChain %43 %34 %40 - OpStore %157 %156 - %158 = OpAccessChain %54 %53 %20 - %159 = OpLoad %50 %158 - OpStore %49 %159 + %150 = OpAccessChain %35 %18 %20 %20 + %151 = OpLoad %10 %150 + %152 = OpAccessChain %38 %34 %20 + OpStore %152 %151 + %153 = OpAccessChain %22 %18 %20 %40 + %154 = OpLoad %9 %153 + %155 = OpAccessChain %43 %34 %40 + OpStore %155 %154 + %156 = OpAccessChain %52 %51 %20 + %157 = OpLoad %46 %156 + OpStore %48 %157 OpEmitVertex - %160 = OpAccessChain %35 %18 %25 %20 - %161 = OpLoad %10 %160 - %162 = OpAccessChain %38 %34 %20 - OpStore %162 %161 - %163 = OpAccessChain %22 %18 %25 %40 - %164 = OpLoad %9 %163 - %165 = OpAccessChain %43 %34 %40 - OpStore %165 %164 - %166 = OpAccessChain %54 %53 %25 - %167 = OpLoad %50 %166 - OpStore %49 %167 + %158 = OpAccessChain %35 %18 %25 %20 + %159 = OpLoad %10 %158 + %160 = OpAccessChain %38 %34 %20 + OpStore %160 %159 + %161 = OpAccessChain %22 %18 %25 %40 + %162 = OpLoad %9 %161 + %163 = OpAccessChain %43 %34 %40 + OpStore %163 %162 + %164 = OpAccessChain %52 %51 %25 + %165 = OpLoad %46 %164 + OpStore %48 %165 OpEmitVertex - %168 = OpAccessChain %35 %18 %20 %20 + %166 = OpAccessChain %35 %18 %20 %20 + %167 = OpLoad %10 %166 + %168 = OpAccessChain %35 %18 %25 %20 %169 = OpLoad %10 %168 - %170 = OpAccessChain %35 %18 %25 %20 - %171 = OpLoad %10 %170 - %172 = OpFAdd %10 %169 %171 - %173 = OpAccessChain %35 %18 %40 %20 - %174 = OpLoad %10 %173 - %175 = OpFSub %10 %172 %174 - %176 = OpAccessChain %38 %34 %20 - OpStore %176 %175 - %177 = OpAccessChain %22 %18 %25 %40 - %178 = OpLoad %9 %177 - %179 = OpAccessChain %43 %34 %40 - OpStore %179 %178 - OpStore %180 %20 - OpBranch %181 - %181 = OpLabel - OpLoopMerge %183 %184 None - OpBranch %185 - %185 = OpLabel - %186 = OpLoad %19 %180 - %187 = OpSLessThan %6 %186 %109 - OpBranchConditional %187 %182 %183 - %182 = OpLabel - %188 = OpLoad %19 %180 - %189 = OpLoad %19 %180 - %190 = OpAccessChain %35 %53 %20 %20 %189 - %191 = OpLoad %10 %190 - %192 = OpLoad %19 %180 - %193 = OpAccessChain %35 %53 %40 %20 %192 - %194 = OpLoad %10 %193 - %195 = OpFNegate %10 %194 - %196 = OpFAdd %10 %191 %195 - %197 = OpLoad %19 %180 - %198 = OpAccessChain %35 %53 %25 %20 %197 - %199 = OpLoad %10 %198 - %200 = OpFAdd %10 %196 %199 - %201 = OpAccessChain %38 %49 %20 %188 - OpStore %201 %200 - OpBranch %184 - %184 = OpLabel - %202 = OpLoad %19 %180 - %203 = OpIAdd %19 %202 %40 - OpStore %180 %203 - OpBranch %181 + %170 = OpFAdd %10 %167 %169 + %171 = OpAccessChain %35 %18 %40 %20 + %172 = OpLoad %10 %171 + %173 = OpFSub %10 %170 %172 + %174 = OpAccessChain %38 %34 %20 + OpStore %174 %173 + %175 = OpAccessChain %22 %18 %25 %40 + %176 = OpLoad %9 %175 + %177 = OpAccessChain %43 %34 %40 + OpStore %177 %176 + OpStore %178 %20 + OpBranch %179 + %179 = OpLabel + OpLoopMerge %181 %182 None + OpBranch %183 %183 = OpLabel + %184 = OpLoad %19 %178 + %185 = OpSLessThan %6 %184 %107 + OpBranchConditional %185 %180 %181 + %180 = OpLabel + %186 = OpLoad %19 %178 + %187 = OpLoad %19 %178 + %188 = OpAccessChain %35 %51 %20 %187 + %189 = OpLoad %10 %188 + %190 = OpLoad %19 %178 + %191 = OpAccessChain %35 %51 %40 %190 + %192 = OpLoad %10 %191 + %193 = OpFNegate %10 %192 + %194 = OpFAdd %10 %189 %193 + %195 = OpLoad %19 %178 + %196 = OpAccessChain %35 %51 %25 %195 + %197 = OpLoad %10 %196 + %198 = OpFAdd %10 %194 %197 + %199 = OpAccessChain %38 %48 %186 + OpStore %199 %198 + OpBranch %182 + %182 = OpLabel + %200 = OpLoad %19 %178 + %201 = OpIAdd %19 %200 %40 + OpStore %178 %201 + 
OpBranch %179 + %181 = OpLabel OpEmitVertex OpEndPrimitive OpBranch %31 diff --git a/src/xenia/gpu/vulkan/shaders/rect_list.geom b/src/xenia/gpu/vulkan/shaders/rect_list.geom index d796919d3..6c7e24c7e 100644 --- a/src/xenia/gpu/vulkan/shaders/rect_list.geom +++ b/src/xenia/gpu/vulkan/shaders/rect_list.geom @@ -16,11 +16,8 @@ out gl_PerVertex { float gl_ClipDistance[]; }; -struct VertexData { - vec4 o[16]; -}; -layout(location = 0) in VertexData in_vtx[]; -layout(location = 0) out VertexData out_vtx; +layout(location = 0) in vec4 in_interpolators[][16]; +layout(location = 0) out vec4 out_interpolators[16]; layout(triangles) in; layout(triangle_strip, max_vertices = 6) out; @@ -35,30 +32,30 @@ void main() { // 2 ----- [3] gl_Position = gl_in[0].gl_Position; gl_PointSize = gl_in[0].gl_PointSize; - out_vtx = in_vtx[0]; + out_interpolators = in_interpolators[0]; EmitVertex(); gl_Position = gl_in[1].gl_Position; gl_PointSize = gl_in[1].gl_PointSize; - out_vtx = in_vtx[1]; + out_interpolators = in_interpolators[1]; EmitVertex(); gl_Position = gl_in[2].gl_Position; gl_PointSize = gl_in[2].gl_PointSize; - out_vtx = in_vtx[2]; + out_interpolators = in_interpolators[2]; EmitVertex(); EndPrimitive(); gl_Position = gl_in[2].gl_Position; gl_PointSize = gl_in[2].gl_PointSize; - out_vtx = in_vtx[2]; + out_interpolators = in_interpolators[2]; EmitVertex(); gl_Position = gl_in[1].gl_Position; gl_PointSize = gl_in[1].gl_PointSize; - out_vtx = in_vtx[1]; + out_interpolators = in_interpolators[1]; EmitVertex(); gl_Position = (gl_in[1].gl_Position + gl_in[2].gl_Position) - gl_in[0].gl_Position; gl_PointSize = gl_in[2].gl_PointSize; for (int i = 0; i < 16; ++i) { - out_vtx.o[i] = -in_vtx[0].o[i] + in_vtx[1].o[i] + in_vtx[2].o[i]; + out_interpolators[i] = -in_interpolators[0][i] + in_interpolators[1][i] + in_interpolators[2][i]; } EmitVertex(); EndPrimitive(); @@ -70,30 +67,30 @@ void main() { // [3] ----- 2 gl_Position = gl_in[0].gl_Position; gl_PointSize = gl_in[0].gl_PointSize; - out_vtx = in_vtx[0]; + out_interpolators = in_interpolators[0]; EmitVertex(); gl_Position = gl_in[1].gl_Position; gl_PointSize = gl_in[1].gl_PointSize; - out_vtx = in_vtx[1]; + out_interpolators = in_interpolators[1]; EmitVertex(); gl_Position = gl_in[2].gl_Position; gl_PointSize = gl_in[2].gl_PointSize; - out_vtx = in_vtx[2]; + out_interpolators = in_interpolators[2]; EmitVertex(); EndPrimitive(); gl_Position = gl_in[0].gl_Position; gl_PointSize = gl_in[0].gl_PointSize; - out_vtx = in_vtx[0]; + out_interpolators = in_interpolators[0]; EmitVertex(); gl_Position = gl_in[2].gl_Position; gl_PointSize = gl_in[2].gl_PointSize; - out_vtx = in_vtx[2]; + out_interpolators = in_interpolators[2]; EmitVertex(); gl_Position = (gl_in[0].gl_Position + gl_in[2].gl_Position) - gl_in[1].gl_Position; gl_PointSize = gl_in[2].gl_PointSize; for (int i = 0; i < 16; ++i) { - out_vtx.o[i] = in_vtx[0].o[i] + -in_vtx[1].o[i] + in_vtx[2].o[i]; + out_interpolators[i] = in_interpolators[0][i] + -in_interpolators[1][i] + in_interpolators[2][i]; } EmitVertex(); EndPrimitive(); From 692d666d57c0eed47237d56e085062431370b8b8 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 25 Mar 2016 16:50:06 -0500 Subject: [PATCH 32/77] Wipe the buffer cache in ClearCache for now. 
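The transient buffer appears to be managed as a ring: the span from
transient_tail_offset_ to transient_head_offset_ is what is still in
flight, so snapping the tail forward to the head releases everything at
once without touching the underlying allocation. A minimal sketch of that
invariant, assuming a simple wrapping ring allocator (the helper below is
illustrative, not code from this series):

    #include <cstddef>

    // Bytes still allocated in a ring of the given capacity; this becomes
    // zero once tail is advanced to head, as the change below does.
    size_t BytesInUse(size_t head, size_t tail, size_t capacity) {
      return head >= tail ? head - tail : capacity - tail + head;
    }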
--- src/xenia/gpu/vulkan/buffer_cache.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/xenia/gpu/vulkan/buffer_cache.cc b/src/xenia/gpu/vulkan/buffer_cache.cc index 7fd3c4768..90b7c487e 100644 --- a/src/xenia/gpu/vulkan/buffer_cache.cc +++ b/src/xenia/gpu/vulkan/buffer_cache.cc @@ -432,6 +432,8 @@ void BufferCache::InvalidateCache() { void BufferCache::ClearCache() { // TODO(benvanik): caching. + // Temporary clear. + transient_tail_offset_ = transient_head_offset_; } } // namespace vulkan From fc1bd0f3793f61fc264271889a634f8cc5d884a1 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 25 Mar 2016 17:29:39 -0500 Subject: [PATCH 33/77] Fix texture uploads --- src/xenia/gpu/vulkan/texture_cache.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index 500d6ac25..686c5314d 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -588,12 +588,12 @@ bool TextureCache::UploadTexture2D( // For now, just transfer the grid we uploaded earlier into the texture. VkBufferImageCopy copy_region; copy_region.bufferOffset = alloc->offset; - copy_region.bufferRowLength = src.size_2d.input_width; - copy_region.bufferImageHeight = src.size_2d.input_height; + copy_region.bufferRowLength = src.width + 1; + copy_region.bufferImageHeight = src.height + 1; copy_region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; copy_region.imageOffset = {0, 0, 0}; - copy_region.imageExtent = {dest->texture_info.size_2d.output_width + 1, - dest->texture_info.size_2d.output_height + 1, + copy_region.imageExtent = {dest->texture_info.width + 1, + dest->texture_info.height + 1, dest->texture_info.depth + 1}; vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(), dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, From 44cffab389377638c619309dfe30b15c07b24521 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 25 Mar 2016 18:23:45 -0500 Subject: [PATCH 34/77] SPIR-V Max4 --- src/xenia/gpu/spirv_shader_translator.cc | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index f7a1660fb..2749aad01 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -701,6 +701,7 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( } break; default: // TODO: the rest of these + assert_always(); break; } @@ -909,6 +910,23 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( } break; case AluVectorOpcode::kMax4: { + auto src0_x = b.createCompositeExtract(sources[0], float_type_, 0); + auto src0_y = b.createCompositeExtract(sources[0], float_type_, 1); + auto src0_z = b.createCompositeExtract(sources[0], float_type_, 2); + auto src0_w = b.createCompositeExtract(sources[0], float_type_, 3); + + auto max_xy = CreateGlslStd450InstructionCall( + spv::NoPrecision, float_type_, spv::GLSLstd450::kFMax, + {src0_x, src0_y}); + auto max_zw = CreateGlslStd450InstructionCall( + spv::NoPrecision, float_type_, spv::GLSLstd450::kFMax, + {src0_z, src0_w}); + auto max_xyzw = CreateGlslStd450InstructionCall( + spv::NoPrecision, float_type_, spv::GLSLstd450::kFMax, + {max_xy, max_zw}); + + // FIXME: Docs say this only updates pv.x? + dest = b.smearScalar(spv::NoPrecision, max_xyzw, vec4_float_type_); } break; case AluVectorOpcode::kMaxA: { From 1ea72c5e068ee40fb945bd471048be175a4c1338 Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Fri, 1 Apr 2016 21:49:58 -0500 Subject: [PATCH 35/77] FencedPool::CancelBatch --- src/xenia/ui/vulkan/fenced_pools.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/xenia/ui/vulkan/fenced_pools.h b/src/xenia/ui/vulkan/fenced_pools.h index a50f82d08..3a7bb01c4 100644 --- a/src/xenia/ui/vulkan/fenced_pools.h +++ b/src/xenia/ui/vulkan/fenced_pools.h @@ -88,6 +88,24 @@ class BaseFencedPool { open_batch_ = batch; } + // Cancels an open batch, and releases all entries acquired within. + void CancelBatch() { + assert_not_null(open_batch_); + + auto batch = open_batch_; + open_batch_ = nullptr; + + // Relink the batch back into the free batch list. + batch->next = free_batch_list_head_; + free_batch_list_head_ = batch; + + // Relink entries back into free entries list. + batch->entry_list_tail->next = free_entry_list_head_; + free_entry_list_head_ = batch->entry_list_head; + batch->entry_list_head = nullptr; + batch->entry_list_tail = nullptr; + } + // Attempts to acquire an entry from the pool in the current batch. // If none are available a new one will be allocated. HANDLE AcquireEntry() { From 2eca3ce9e6e5b14c8765935e73e3e1b1cecb43aa Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 1 Apr 2016 21:51:17 -0500 Subject: [PATCH 36/77] Texture uploads/basic formats Fixed swizzle one/zero mismatch Sampler setup Remove samplers from the descriptor set layout --- src/xenia/gpu/spirv_shader_translator.cc | 118 +++++-- src/xenia/gpu/spirv_shader_translator.h | 5 +- src/xenia/gpu/vulkan/texture_cache.cc | 424 ++++++++++++++++++----- src/xenia/gpu/vulkan/texture_cache.h | 20 +- 4 files changed, 437 insertions(+), 130 deletions(-) diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 2749aad01..ef242f0bd 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -55,11 +55,11 @@ void SpirvShaderTranslator::StartTranslation() { bool_type_ = b.makeBoolType(); float_type_ = b.makeFloatType(32); int_type_ = b.makeIntType(32); - Id uint_type = b.makeUintType(32); + uint_type_ = b.makeUintType(32); vec2_float_type_ = b.makeVectorType(float_type_, 2); vec3_float_type_ = b.makeVectorType(float_type_, 3); vec4_float_type_ = b.makeVectorType(float_type_, 4); - vec4_uint_type_ = b.makeVectorType(uint_type, 4); + vec4_uint_type_ = b.makeVectorType(uint_type_, 4); vec4_bool_type_ = b.makeVectorType(bool_type_, 4); vec4_float_one_ = b.makeCompositeConstant( @@ -136,7 +136,7 @@ void SpirvShaderTranslator::StartTranslation() { // Push constants, represented by SpirvPushConstants. 
Id push_constants_type = b.makeStructType( - {vec4_float_type_, vec4_float_type_, vec4_float_type_, uint_type}, + {vec4_float_type_, vec4_float_type_, vec4_float_type_, uint_type_}, "push_consts_type"); b.addDecoration(push_constants_type, spv::Decoration::DecorationBlock); @@ -164,7 +164,6 @@ void SpirvShaderTranslator::StartTranslation() { push_constants_type, "push_consts"); // Texture bindings - Id sampler_t = b.makeSamplerType(); Id tex_t[] = {b.makeSampledImageType(b.makeImageType( float_type_, spv::Dim::Dim1D, false, false, false, 1, spv::ImageFormat::ImageFormatUnknown)), @@ -178,23 +177,17 @@ void SpirvShaderTranslator::StartTranslation() { float_type_, spv::Dim::DimCube, false, false, false, 1, spv::ImageFormat::ImageFormatUnknown))}; - Id samplers_a = b.makeArrayType(sampler_t, b.makeUintConstant(32), 0); Id tex_a_t[] = {b.makeArrayType(tex_t[0], b.makeUintConstant(32), 0), b.makeArrayType(tex_t[1], b.makeUintConstant(32), 0), b.makeArrayType(tex_t[2], b.makeUintConstant(32), 0), b.makeArrayType(tex_t[3], b.makeUintConstant(32), 0)}; - // TODO(DrChat): See texture_cache.cc - do we need separate samplers here? - samplers_ = b.createVariable(spv::StorageClass::StorageClassUniformConstant, - samplers_a, "samplers"); - b.addDecoration(samplers_, spv::Decoration::DecorationDescriptorSet, 1); - b.addDecoration(samplers_, spv::Decoration::DecorationBinding, 0); for (int i = 0; i < 4; i++) { tex_[i] = b.createVariable(spv::StorageClass::StorageClassUniformConstant, tex_a_t[i], xe::format_string("textures%dD", i + 1).c_str()); b.addDecoration(tex_[i], spv::Decoration::DecorationDescriptorSet, 1); - b.addDecoration(tex_[i], spv::Decoration::DecorationBinding, i + 1); + b.addDecoration(tex_[i], spv::Decoration::DecorationBinding, i); } // Interpolators. @@ -254,6 +247,20 @@ void SpirvShaderTranslator::StartTranslation() { vec4_float_type_, "gl_Position"); b.addDecoration(pos_, spv::Decoration::DecorationBuiltIn, spv::BuiltIn::BuiltInPosition); + + vertex_id_ = b.createVariable(spv::StorageClass::StorageClassInput, + int_type_, "gl_VertexId"); + b.addDecoration(vertex_id_, spv::Decoration::DecorationBuiltIn, + spv::BuiltIn::BuiltInVertexId); + + auto vertex_id = b.createLoad(vertex_id_); + auto r0_ptr = b.createAccessChain(spv::StorageClass::StorageClassFunction, + registers_ptr_, + std::vector({b.makeUintConstant(0)})); + auto r0 = b.createLoad(r0_ptr); + r0 = b.createCompositeInsert(vertex_id, r0, vec4_float_type_, + std::vector({0})); + b.createStore(r0, r0_ptr); } else { // Pixel inputs from vertex shader. interpolators_ = b.createVariable(spv::StorageClass::StorageClassInput, @@ -267,9 +274,9 @@ void SpirvShaderTranslator::StartTranslation() { frag_outputs_type, "oC"); b.addDecoration(frag_outputs_, spv::Decoration::DecorationLocation, 0); - Id frag_depth = b.createVariable(spv::StorageClass::StorageClassOutput, - vec4_float_type_, "gl_FragDepth"); - b.addDecoration(frag_depth, spv::Decoration::DecorationBuiltIn, + frag_depth_ = b.createVariable(spv::StorageClass::StorageClassOutput, + float_type_, "gl_FragDepth"); + b.addDecoration(frag_depth_, spv::Decoration::DecorationBuiltIn, spv::BuiltIn::BuiltInFragDepth); // TODO(benvanik): frag depth, etc. 
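The hunk below stubs out the alpha test; its TODO table pairs alpha_func
values 0-7 with comparisons of oC[0].a against alpha_ref. As a plain-C++
sketch of the predicate those comments describe (the encoding is taken
from the TODOs; everything else here is assumption, not code from this
series):

    #include <cstdint>

    // Alpha-test predicate per the TODO table in the hunk below
    // (0 = never ... 7 = always).
    bool AlphaTestPasses(uint32_t alpha_func, float alpha, float alpha_ref) {
      switch (alpha_func) {
        case 0: return false;                // never
        case 1: return alpha < alpha_ref;    // less
        case 2: return alpha == alpha_ref;   // equal
        case 3: return alpha <= alpha_ref;   // less-equal
        case 4: return alpha > alpha_ref;    // greater
        case 5: return alpha != alpha_ref;   // not-equal
        case 6: return alpha >= alpha_ref;   // greater-equal
        default: return true;                // always (7)
      }
    }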
@@ -388,6 +395,25 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { {p, p_scaled, 4, 5, 2, 3}); b.createStore(p, pos_); + } else { + // Alpha test + auto alpha_test_x = b.createCompositeExtract( + push_consts_, float_type_, std::vector{2, 0}); + auto cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, alpha_test_x, b.makeFloatConstant(1.f)); + + spv::Builder::If alpha_if(cond, b); + + // TODO(DrChat): Apply alpha test. + // if (alpha_func == 0) passes = false; + // if (alpha_func == 1 && oC[0].a < alpha_ref) passes = true; + // if (alpha_func == 2 && oC[0].a == alpha_ref) passes = true; + // if (alpha_func == 3 && oC[0].a <= alpha_ref) passes = true; + // if (alpha_func == 4 && oC[0].a > alpha_ref) passes = true; + // if (alpha_func == 5 && oC[0].a != alpha_ref) passes = true; + // if (alpha_func == 6 && oC[0].a >= alpha_ref) passes = true; + // if (alpha_func == 7) passes = true; + + alpha_if.makeEndIf(); } b.makeReturn(false); @@ -592,9 +618,9 @@ void SpirvShaderTranslator::ProcessJumpInstruction( v = b.createLoad(v); // Bitfield extract the bool constant. - v = b.createTriOp(spv::Op::OpBitFieldUExtract, b.makeUintType(32), v, - b.makeUintConstant(instr.bool_constant_index % 32), - b.makeUintConstant(1)); + v = b.createTriOp(spv::Op::OpBitFieldUExtract, uint_type_, v, + b.makeIntConstant(instr.bool_constant_index % 32), + b.makeIntConstant(1)); // Conditional branch auto cond = b.createBinOp(spv::Op::OpIEqual, bool_type_, v, @@ -642,17 +668,57 @@ void SpirvShaderTranslator::ProcessAllocInstruction( void SpirvShaderTranslator::ProcessVertexFetchInstruction( const ParsedVertexFetchInstruction& instr) { auto& b = *builder_; + assert_true(is_vertex_shader()); + assert_not_zero(vertex_id_); // TODO: instr.is_predicated // Operand 0 is the index // Operand 1 is the binding // TODO: Indexed fetch + auto vertex_id = LoadFromOperand(instr.operands[0]); + vertex_id = b.createCompositeExtract(vertex_id, float_type_, 0); + vertex_id = b.createUnaryOp(spv::Op::OpConvertFToS, int_type_, vertex_id); + auto shader_vertex_id = b.createLoad(vertex_id_); + auto cond = + b.createBinOp(spv::Op::OpIEqual, bool_type_, vertex_id, shader_vertex_id); + + // Skip loading if it's an indexed fetch. 
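// Note on the guard below: until indexed fetch is implemented, the code
// compares the fetched address with gl_VertexID and, via the OpSelect at
// the end of this function, substitutes a constant (0, ..., 0, 1) vertex
// whenever the two differ, so the fetch only "hits" for the auto-indexed
// case.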
auto vertex_ptr = vertex_binding_map_[instr.operands[1].storage_index] [instr.attributes.offset]; assert_not_zero(vertex_ptr); - auto vertex = b.createLoad(vertex_ptr); + + auto vertex_components = b.getNumComponents(vertex); + Id alt_vertex = 0; + switch (vertex_components) { + case 1: + alt_vertex = b.makeFloatConstant(0.f); + break; + case 2: + alt_vertex = b.makeCompositeConstant( + vec2_float_type_, std::vector({b.makeFloatConstant(0.f), + b.makeFloatConstant(1.f)})); + break; + case 3: + alt_vertex = b.makeCompositeConstant( + vec3_float_type_, + std::vector({b.makeFloatConstant(0.f), b.makeFloatConstant(0.f), + b.makeFloatConstant(1.f)})); + break; + case 4: + alt_vertex = b.makeCompositeConstant( + vec4_float_type_, + std::vector({b.makeFloatConstant(0.f), b.makeFloatConstant(0.f), + b.makeFloatConstant(0.f), + b.makeFloatConstant(1.f)})); + break; + default: + assert_unhandled_case(vertex_components); + } + + vertex = b.createTriOp(spv::Op::OpSelect, b.getTypeId(vertex), cond, vertex, + alt_vertex); StoreToResult(vertex, instr.result); } @@ -1594,15 +1660,15 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) { case InstructionStorageAddressingMode::kAddressAbsolute: { // storage_index + a0 storage_index = - b.createBinOp(spv::Op::OpIAdd, b.makeUintType(32), b.createLoad(a0_), + b.createBinOp(spv::Op::OpIAdd, uint_type_, b.createLoad(a0_), b.makeUintConstant(storage_base + op.storage_index)); } break; case InstructionStorageAddressingMode::kAddressRelative: { // TODO: Based on loop index // storage_index + aL.x - storage_index = b.createBinOp( - spv::Op::OpIAdd, b.makeUintType(32), b.makeUintConstant(0), - b.makeUintConstant(storage_base + op.storage_index)); + storage_index = + b.createBinOp(spv::Op::OpIAdd, uint_type_, b.makeUintConstant(0), + b.makeUintConstant(storage_base + op.storage_index)); } break; default: assert_always(); @@ -1723,7 +1789,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, case InstructionStorageAddressingMode::kAddressAbsolute: { // storage_index + a0 storage_index = - b.createBinOp(spv::Op::OpIAdd, b.makeUintType(32), b.createLoad(a0_), + b.createBinOp(spv::Op::OpIAdd, uint_type_, b.createLoad(a0_), b.makeUintConstant(result.storage_index)); } break; case InstructionStorageAddressingMode::kAddressRelative: { @@ -1776,7 +1842,11 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, break; case InstructionStorageTarget::kDepth: assert_true(is_pixel_shader()); - // TODO(benvanik): result.storage_index + storage_pointer = frag_depth_; + storage_class = spv::StorageClass::StorageClassOutput; + storage_type = float_type_; + storage_offsets.push_back(0); + storage_array = false; break; case InstructionStorageTarget::kNone: assert_unhandled_case(result.storage_target); diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 3327dccbd..1d5dea31b 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -104,7 +104,7 @@ class SpirvShaderTranslator : public ShaderTranslator { spv::Function* translated_main_ = 0; // Types. 
- spv::Id float_type_ = 0, bool_type_ = 0, int_type_ = 0; + spv::Id float_type_ = 0, bool_type_ = 0, int_type_ = 0, uint_type_ = 0; spv::Id vec2_float_type_ = 0, vec3_float_type_ = 0, vec4_float_type_ = 0; spv::Id vec4_uint_type_ = 0; spv::Id vec4_bool_type_ = 0; @@ -120,7 +120,8 @@ class SpirvShaderTranslator : public ShaderTranslator { spv::Id pos_ = 0; spv::Id push_consts_ = 0; spv::Id interpolators_ = 0; - spv::Id frag_outputs_ = 0; + spv::Id vertex_id_ = 0; + spv::Id frag_outputs_ = 0, frag_depth_ = 0; spv::Id samplers_ = 0; spv::Id tex_[4] = {0}; // Images {1D, 2D, 3D, Cube} diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index 686c5314d..0deddf36d 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -31,6 +31,81 @@ struct TextureConfig { VkFormat host_format; }; +static const TextureConfig texture_configs[64] = { + {TextureFormat::k_1_REVERSE, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_1, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_8, VK_FORMAT_R8_UNORM}, + {TextureFormat::k_1_5_5_5, VK_FORMAT_R5G5B5A1_UNORM_PACK16}, + {TextureFormat::k_5_6_5, VK_FORMAT_R5G6B5_UNORM_PACK16}, + {TextureFormat::k_6_5_5, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_8_8_8_8, VK_FORMAT_R8G8B8A8_UNORM}, + {TextureFormat::k_2_10_10_10, VK_FORMAT_A2R10G10B10_UNORM_PACK32}, + {TextureFormat::k_8_A, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_8_B, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_8_8, VK_FORMAT_R8G8_UNORM}, + {TextureFormat::k_Cr_Y1_Cb_Y0, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_Y1_Cr_Y0_Cb, VK_FORMAT_UNDEFINED}, + {TextureFormat::kUnknown, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_8_8_8_8_A, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_4_4_4_4, VK_FORMAT_R4G4B4A4_UNORM_PACK16}, + {TextureFormat::k_10_11_11, VK_FORMAT_B10G11R11_UFLOAT_PACK32}, // ? + {TextureFormat::k_11_11_10, VK_FORMAT_B10G11R11_UFLOAT_PACK32}, // ? + {TextureFormat::k_DXT1, VK_FORMAT_BC1_RGBA_SRGB_BLOCK}, // ? + {TextureFormat::k_DXT2_3, VK_FORMAT_BC3_SRGB_BLOCK}, // ? + {TextureFormat::k_DXT4_5, VK_FORMAT_BC5_UNORM_BLOCK}, // ? + {TextureFormat::kUnknown, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_24_8, VK_FORMAT_D24_UNORM_S8_UINT}, + {TextureFormat::k_24_8_FLOAT, VK_FORMAT_D24_UNORM_S8_UINT}, // ? + {TextureFormat::k_16, VK_FORMAT_R16_UNORM}, + {TextureFormat::k_16_16, VK_FORMAT_R16G16_UNORM}, + {TextureFormat::k_16_16_16_16, VK_FORMAT_R16G16B16A16_UNORM}, + {TextureFormat::k_16_EXPAND, VK_FORMAT_R16_UNORM}, // ? + {TextureFormat::k_16_16_EXPAND, VK_FORMAT_R16G16_UNORM}, // ? + {TextureFormat::k_16_16_16_16_EXPAND, VK_FORMAT_R16G16B16A16_UNORM}, // ? 
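// This table is indexed by the TextureFormat enum value (see the lookup
// in AllocateTexture further down); entries tagged "// ?" look to be
// provisional guesses at a compatible Vulkan format rather than verified
// mappings.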
+ {TextureFormat::k_16_FLOAT, VK_FORMAT_R16_SFLOAT}, + {TextureFormat::k_16_16_FLOAT, VK_FORMAT_R16G16_SFLOAT}, + {TextureFormat::k_16_16_16_16_FLOAT, VK_FORMAT_R16G16B16A16_SFLOAT}, + {TextureFormat::k_32, VK_FORMAT_R32_SINT}, + {TextureFormat::k_32_32, VK_FORMAT_R32G32_SINT}, + {TextureFormat::k_32_32_32_32, VK_FORMAT_R32G32B32A32_SINT}, + {TextureFormat::k_32_FLOAT, VK_FORMAT_R32_SFLOAT}, + {TextureFormat::k_32_32_FLOAT, VK_FORMAT_R32G32_SFLOAT}, + {TextureFormat::k_32_32_32_32_FLOAT, VK_FORMAT_R32G32B32A32_SFLOAT}, + {TextureFormat::k_32_AS_8, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_32_AS_8_8, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_16_MPEG, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_16_16_MPEG, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_8_INTERLACED, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_32_AS_8_INTERLACED, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_32_AS_8_8_INTERLACED, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_16_INTERLACED, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_16_MPEG_INTERLACED, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_16_16_MPEG_INTERLACED, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_DXN, VK_FORMAT_UNDEFINED /* GL_COMPRESSED_RG_RGTC2 */}, + {TextureFormat::k_8_8_8_8_AS_16_16_16_16, VK_FORMAT_R8G8B8A8_UNORM}, + {TextureFormat::k_DXT1_AS_16_16_16_16, + VK_FORMAT_UNDEFINED /* GL_COMPRESSED_RGB_S3TC_DXT1_EXT */}, + {TextureFormat::k_DXT2_3_AS_16_16_16_16, + VK_FORMAT_UNDEFINED /* GL_COMPRESSED_RGBA_S3TC_DXT3_EXT */}, + {TextureFormat::k_DXT4_5_AS_16_16_16_16, + VK_FORMAT_UNDEFINED /* GL_COMPRESSED_RGBA_S3TC_DXT5_EXT */}, + {TextureFormat::k_2_10_10_10_AS_16_16_16_16, + VK_FORMAT_A2R10G10B10_UNORM_PACK32}, + {TextureFormat::k_10_11_11_AS_16_16_16_16, + VK_FORMAT_B10G11R11_UFLOAT_PACK32}, // ? + {TextureFormat::k_11_11_10_AS_16_16_16_16, + VK_FORMAT_B10G11R11_UFLOAT_PACK32}, // ? + {TextureFormat::k_32_32_32_FLOAT, VK_FORMAT_R32G32B32_SFLOAT}, + {TextureFormat::k_DXT3A, + VK_FORMAT_UNDEFINED /* GL_COMPRESSED_RGBA_S3TC_DXT3_EXT */}, + {TextureFormat::k_DXT5A, + VK_FORMAT_UNDEFINED /* GL_COMPRESSED_RGBA_S3TC_DXT5_EXT */}, + {TextureFormat::k_CTX1, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_DXT3A_AS_1_1_1_1, VK_FORMAT_UNDEFINED}, + {TextureFormat::kUnknown, VK_FORMAT_UNDEFINED}, + {TextureFormat::kUnknown, VK_FORMAT_UNDEFINED}, +}; + TextureCache::TextureCache(Memory* memory, RegisterFile* register_file, TraceWriter* trace_writer, ui::vulkan::VulkanDevice* device) @@ -46,12 +121,10 @@ TextureCache::TextureCache(Memory* memory, RegisterFile* register_file, descriptor_pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; descriptor_pool_info.maxSets = 4096; - VkDescriptorPoolSize pool_sizes[2]; - pool_sizes[0].type = VK_DESCRIPTOR_TYPE_SAMPLER; - pool_sizes[0].descriptorCount = 32; - pool_sizes[1].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; - pool_sizes[1].descriptorCount = 32; - descriptor_pool_info.poolSizeCount = 2; + VkDescriptorPoolSize pool_sizes[1]; + pool_sizes[0].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + pool_sizes[0].descriptorCount = 4096; + descriptor_pool_info.poolSizeCount = 1; descriptor_pool_info.pPoolSizes = pool_sizes; auto err = vkCreateDescriptorPool(*device_, &descriptor_pool_info, nullptr, &descriptor_pool_); @@ -59,17 +132,10 @@ TextureCache::TextureCache(Memory* memory, RegisterFile* register_file, // Create the descriptor set layout used for rendering. // We always have the same number of samplers but only some are used. 
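// Matching the commit message, the standalone sampler array is removed
// here: the set becomes four arrays of combined image samplers, one per
// texture dimension, at bindings 0-3, and the pool above now only carries
// VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER descriptors.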
- VkDescriptorSetLayoutBinding bindings[5]; - auto& sampler_binding = bindings[0]; - sampler_binding.binding = 0; - sampler_binding.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER; - sampler_binding.descriptorCount = kMaxTextureSamplers; - sampler_binding.stageFlags = - VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT; - sampler_binding.pImmutableSamplers = nullptr; + VkDescriptorSetLayoutBinding bindings[4]; for (int i = 0; i < 4; ++i) { - auto& texture_binding = bindings[1 + i]; - texture_binding.binding = 1 + i; + auto& texture_binding = bindings[i]; + texture_binding.binding = i; texture_binding.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; texture_binding.descriptorCount = kMaxTextureSamplers; texture_binding.stageFlags = @@ -103,6 +169,10 @@ TextureCache::TextureCache(Memory* memory, RegisterFile* register_file, ((y % 32 < 16) ^ (x % 32 >= 16)) ? 0xFF0000FF : 0xFFFFFFFF; } } + + invalidated_textures_sets_[0].reserve(64); + invalidated_textures_sets_[1].reserve(64); + invalidated_textures_ = &invalidated_textures_sets_[0]; } TextureCache::~TextureCache() { @@ -135,8 +205,27 @@ TextureCache::Texture* TextureCache::AllocateTexture( return nullptr; } - // TODO: Format - image_info.format = VK_FORMAT_R8G8B8A8_UNORM; + VkFormat format = VK_FORMAT_UNDEFINED; + if (texture_info.format_info) { + auto& config = texture_configs[int(texture_info.format_info->format)]; + format = config.host_format != VK_FORMAT_UNDEFINED + ? config.host_format + : VK_FORMAT_R8G8B8A8_UNORM; + } else { + format = VK_FORMAT_R8G8B8A8_UNORM; + } + + VkFormatProperties props; + uint32_t required_flags = VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | + VK_FORMAT_FEATURE_BLIT_DST_BIT | + VK_FORMAT_FEATURE_BLIT_SRC_BIT; + vkGetPhysicalDeviceFormatProperties(*device_, format, &props); + if ((props.optimalTilingFeatures & required_flags) != required_flags) { + // Texture needs conversion on upload to a native format. + // assert_always(); + } + + image_info.format = format; image_info.extent = {texture_info.width + 1, texture_info.height + 1, texture_info.depth + 1}; image_info.mipLevels = 1; @@ -212,17 +301,24 @@ TextureCache::Texture* TextureCache::AllocateTexture( } bool TextureCache::FreeTexture(Texture* texture) { - // TODO(DrChat) - return false; + for (auto it = texture->views.begin(); it != texture->views.end();) { + vkDestroyImageView(*device_, (*it)->view, nullptr); + it = texture->views.erase(it); + } + + vkDestroyImage(*device_, texture->image, nullptr); + vkFreeMemory(*device_, texture->image_memory, nullptr); + delete texture; + return true; } TextureCache::Texture* TextureCache::DemandResolveTexture( const TextureInfo& texture_info, TextureFormat format, - uint32_t* out_offset_x, uint32_t* out_offset_y) { + VkOffset2D* out_offset) { // Check to see if we've already used a texture at this location. auto texture = LookupAddress( texture_info.guest_address, texture_info.size_2d.block_width, - texture_info.size_2d.block_height, format, out_offset_x, out_offset_y); + texture_info.size_2d.block_height, format, out_offset); if (texture) { return texture; } @@ -230,7 +326,7 @@ TextureCache::Texture* TextureCache::DemandResolveTexture( // No texture at this location. Make a new one. 
texture = AllocateTexture(texture_info); texture->is_full_texture = false; - resolve_textures_.push_back(std::unique_ptr(texture)); + resolve_textures_.push_back(texture); return texture; } @@ -241,14 +337,14 @@ TextureCache::Texture* TextureCache::Demand( auto texture_hash = texture_info.hash(); for (auto it = textures_.find(texture_hash); it != textures_.end(); ++it) { if (it->second->texture_info == texture_info) { - return it->second.get(); + return it->second; } } // Check resolve textures. for (auto it = resolve_textures_.begin(); it != resolve_textures_.end(); ++it) { - auto texture = (*it).get(); + auto texture = (*it); if (texture_info.guest_address == texture->texture_info.guest_address && texture_info.size_2d.logical_width == texture->texture_info.size_2d.logical_width && @@ -259,9 +355,9 @@ TextureCache::Texture* TextureCache::Demand( // Upgrade this texture to a full texture. texture->is_full_texture = true; texture->texture_info = texture_info; - textures_[texture_hash] = std::move(*it); + textures_[texture_hash] = *it; it = resolve_textures_.erase(it); - return textures_[texture_hash].get(); + return textures_[texture_hash]; } } @@ -290,15 +386,34 @@ TextureCache::Texture* TextureCache::Demand( break; } - // Okay. Now that the texture is uploaded from system memory, put a writewatch - // on it to tell us if it's been modified from the guest. - if (!uploaded) { // TODO: Destroy the texture. assert_always(); return nullptr; } + // Copy in overlapping resolve textures. + /* + for (auto it = resolve_textures_.begin(); it != resolve_textures_.end(); + ++it) { + auto texture = (*it); + if (texture_info.guest_address == texture->texture_info.guest_address && + texture_info.size_2d.logical_width == + texture->texture_info.size_2d.logical_width && + texture_info.size_2d.logical_height == + texture->texture_info.size_2d.logical_height) { + // Exact match. + // TODO: Lazy match (at an offset) + // Upgrade this texture to a full texture. + texture->is_full_texture = true; + texture->texture_info = texture_info; + textures_[texture_hash] = *it; + it = resolve_textures_.erase(it); + return textures_[texture_hash]; + } + } + */ + // Though we didn't find an exact match, that doesn't mean we're out of the // woods yet. This texture could either be a portion of another texture or // vice versa. Copy any overlapping textures into this texture. @@ -306,8 +421,26 @@ TextureCache::Texture* TextureCache::Demand( for (auto it = textures_.begin(); it != textures_.end(); ++it) { } - textures_[texture_hash] = std::unique_ptr(texture); + // Okay. Now that the texture is uploaded from system memory, put a writewatch + // on it to tell us if it's been modified from the guest. + texture->access_watch_handle = memory_->AddPhysicalAccessWatch( + texture_info.guest_address, texture_info.input_length, + cpu::MMIOHandler::kWatchWrite, + [](void* context_ptr, void* data_ptr, uint32_t address) { + auto self = reinterpret_cast(context_ptr); + auto touched_texture = reinterpret_cast(data_ptr); + // Clear watch handle first so we don't redundantly + // remove. + touched_texture->access_watch_handle = 0; + touched_texture->pending_invalidation = true; + // Add to pending list so Scavenge will clean it up. 
+            self->invalidated_textures_mutex_.lock();
+            self->invalidated_textures_->push_back(touched_texture);
+            self->invalidated_textures_mutex_.unlock();
+          },
+          this, texture);
+
   textures_[texture_hash] = texture;
   return texture;
 }
@@ -346,7 +479,7 @@ TextureCache::TextureView* TextureCache::DemandView(Texture* texture,
   VkComponentSwizzle swiz_component_map[] = {
       VK_COMPONENT_SWIZZLE_R,    VK_COMPONENT_SWIZZLE_G,
       VK_COMPONENT_SWIZZLE_B,    VK_COMPONENT_SWIZZLE_A,
-      VK_COMPONENT_SWIZZLE_ONE,  VK_COMPONENT_SWIZZLE_ZERO,
+      VK_COMPONENT_SWIZZLE_ZERO, VK_COMPONENT_SWIZZLE_ONE,
       VK_COMPONENT_SWIZZLE_IDENTITY,
   };
 
@@ -373,11 +506,15 @@
 }
 
 TextureCache::Sampler* TextureCache::Demand(const SamplerInfo& sampler_info) {
+#if FINE_GRAINED_DRAW_SCOPES
+  SCOPE_profile_cpu_f("gpu");
+#endif  // FINE_GRAINED_DRAW_SCOPES
+
   auto sampler_hash = sampler_info.hash();
   for (auto it = samplers_.find(sampler_hash); it != samplers_.end(); ++it) {
     if (it->second->sampler_info == sampler_info) {
       // Found a compatible sampler.
-      return it->second.get();
+      return it->second;
     }
   }
 
@@ -389,10 +526,55 @@ TextureCache::Sampler* TextureCache::Demand(const SamplerInfo& sampler_info) {
   sampler_create_info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
   sampler_create_info.pNext = nullptr;
   sampler_create_info.flags = 0;
-  sampler_create_info.minFilter = VK_FILTER_NEAREST;
-  sampler_create_info.magFilter = VK_FILTER_NEAREST;
   sampler_create_info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
 
+  // Texture level filtering.
+  VkSamplerMipmapMode mip_filter;
+  switch (sampler_info.mip_filter) {
+    case TextureFilter::kBaseMap:
+      // TODO(DrChat): ?
+      mip_filter = VK_SAMPLER_MIPMAP_MODE_NEAREST;
+      break;
+    case TextureFilter::kPoint:
+      mip_filter = VK_SAMPLER_MIPMAP_MODE_NEAREST;
+      break;
+    case TextureFilter::kLinear:
+      mip_filter = VK_SAMPLER_MIPMAP_MODE_LINEAR;
+      break;
+    default:
+      assert_unhandled_case(sampler_info.mip_filter);
+      return nullptr;
+  }
+
+  VkFilter min_filter;
+  switch (sampler_info.min_filter) {
+    case TextureFilter::kPoint:
+      min_filter = VK_FILTER_NEAREST;
+      break;
+    case TextureFilter::kLinear:
+      min_filter = VK_FILTER_LINEAR;
+      break;
+    default:
+      assert_unhandled_case(sampler_info.min_filter);
+      return nullptr;
+  }
+  VkFilter mag_filter;
+  switch (sampler_info.mag_filter) {
+    case TextureFilter::kPoint:
+      mag_filter = VK_FILTER_NEAREST;
+      break;
+    case TextureFilter::kLinear:
+      mag_filter = VK_FILTER_LINEAR;
+      break;
+    default:
+      assert_unhandled_case(sampler_info.mag_filter);
+      return nullptr;
+  }
+
+  sampler_create_info.minFilter = min_filter;
+  sampler_create_info.magFilter = mag_filter;
+  sampler_create_info.mipmapMode = mip_filter;
+
+  // FIXME: Neither the halfway-clamp nor the mirror-clamp-to-border modes are
+  // mapped properly yet.
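
// On the FIXME above: Vulkan 1.0 simply has no mirror-clamp-to-border
// address mode, and VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE requires
// the VK_KHR_sampler_mirror_clamp_to_edge extension, so those Xenos modes
// cannot be mapped exactly. A fallback sketch (the capability flag is a
// hypothetical query result, and this is not a verified behavioral match):
VkSamplerAddressMode MirrorClampFallback(bool supports_mirror_clamp_to_edge) {
  return supports_mirror_clamp_to_edge
             ? VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE
             : VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT;
}
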
VkSamplerAddressMode address_mode_map[] = { /* kRepeat */ VK_SAMPLER_ADDRESS_MODE_REPEAT, @@ -431,37 +613,46 @@ TextureCache::Sampler* TextureCache::Demand(const SamplerInfo& sampler_info) { auto sampler = new Sampler(); sampler->sampler = vk_sampler; sampler->sampler_info = sampler_info; - samplers_[sampler_hash] = std::unique_ptr(sampler); + samplers_[sampler_hash] = sampler; return sampler; } -TextureCache::Texture* TextureCache::LookupAddress( - uint32_t guest_address, uint32_t width, uint32_t height, - TextureFormat format, uint32_t* offset_x, uint32_t* offset_y) { +TextureCache::Texture* TextureCache::LookupAddress(uint32_t guest_address, + uint32_t width, + uint32_t height, + TextureFormat format, + VkOffset2D* out_offset) { for (auto it = textures_.begin(); it != textures_.end(); ++it) { const auto& texture_info = it->second->texture_info; if (guest_address >= texture_info.guest_address && guest_address < texture_info.guest_address + texture_info.input_length && - offset_x && offset_y) { + texture_info.size_2d.input_width >= width && + texture_info.size_2d.input_height >= height && out_offset) { auto offset_bytes = guest_address - texture_info.guest_address; if (texture_info.dimension == Dimension::k2D) { - *offset_y = offset_bytes / texture_info.size_2d.input_pitch; + out_offset->x = 0; + out_offset->y = offset_bytes / texture_info.size_2d.input_pitch; if (offset_bytes % texture_info.size_2d.input_pitch != 0) { // TODO: offset_x } } - return it->second.get(); + return it->second; } if (texture_info.guest_address == guest_address && texture_info.dimension == Dimension::k2D && texture_info.size_2d.input_width == width && texture_info.size_2d.input_height == height) { - return it->second.get(); + if (out_offset) { + out_offset->x = 0; + out_offset->y = 0; + } + + return it->second; } } @@ -469,27 +660,16 @@ TextureCache::Texture* TextureCache::LookupAddress( for (auto it = resolve_textures_.begin(); it != resolve_textures_.end(); ++it) { const auto& texture_info = (*it)->texture_info; - if (guest_address >= texture_info.guest_address && - guest_address < - texture_info.guest_address + texture_info.input_length && - offset_x && offset_y) { - auto offset_bytes = guest_address - texture_info.guest_address; - - if (texture_info.dimension == Dimension::k2D) { - *offset_y = offset_bytes / texture_info.size_2d.input_pitch; - if (offset_bytes % texture_info.size_2d.input_pitch != 0) { - // TODO: offset_x - } - } - - return (*it).get(); - } - if (texture_info.guest_address == guest_address && texture_info.dimension == Dimension::k2D && texture_info.size_2d.input_width == width && texture_info.size_2d.input_height == height) { - return (*it).get(); + if (out_offset) { + out_offset->x = 0; + out_offset->y = 0; + } + + return (*it); } } @@ -531,19 +711,74 @@ bool TextureCache::UploadTexture2D( } // Grab some temporary memory for staging. - auto alloc = staging_buffer_.Acquire(src.input_length, completion_fence); + size_t unpack_length = src.output_length; + auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence); assert_not_null(alloc); - // TODO: Support these cases. - // assert_false(src.is_tiled); + // TODO: Support compression. // assert_false(src.is_compressed()); // Upload texture into GPU memory. // TODO: If the GPU supports it, we can submit a compute batch to convert the // texture and copy it to its destination. Otherwise, fallback to conversion // on the CPU. 
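
// The CPU path below leans on TextureSwap to reorder bytes from the guest's
// big-endian layout while copying into the staging buffer. As a rough
// illustration of what one 32-bit swap variant does (a hypothetical
// standalone helper, not the real TextureSwap signature):
inline void SwapCopy32(void* dst, const void* src, size_t length) {
  auto in = reinterpret_cast<const uint32_t*>(src);
  auto out = reinterpret_cast<uint32_t*>(dst);
  for (size_t i = 0; i < length / 4; ++i) {
    uint32_t v = in[i];
    out[i] = (v >> 24) | ((v >> 8) & 0x0000FF00u) | ((v << 8) & 0x00FF0000u) |
             (v << 24);
  }
}
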
- auto guest_ptr = memory_->TranslatePhysical(src.guest_address); - TextureSwap(src.endianness, alloc->host_ptr, guest_ptr, src.input_length); + void* host_address = memory_->TranslatePhysical(src.guest_address); + if (!src.is_tiled) { + if (src.size_2d.input_pitch == src.size_2d.output_pitch) { + // Fast path copy entire image. + TextureSwap(src.endianness, alloc->host_ptr, host_address, unpack_length); + } else { + // Slow path copy row-by-row because strides differ. + // UNPACK_ROW_LENGTH only works for uncompressed images, and likely does + // this exact thing under the covers, so we just always do it here. + const uint8_t* src_mem = reinterpret_cast(host_address); + uint8_t* dest = reinterpret_cast(alloc->host_ptr); + uint32_t pitch = + std::min(src.size_2d.input_pitch, src.size_2d.output_pitch); + for (uint32_t y = 0; + y < std::min(src.size_2d.block_height, src.size_2d.logical_height); + y++) { + TextureSwap(src.endianness, dest, src_mem, pitch); + src_mem += src.size_2d.input_pitch; + dest += src.size_2d.output_pitch; + } + } + } else { + // Untile image. + // We could do this in a shader to speed things up, as this is pretty slow. + + // TODO(benvanik): optimize this inner loop (or work by tiles). + const uint8_t* src_mem = reinterpret_cast(host_address); + uint8_t* dest = reinterpret_cast(alloc->host_ptr); + uint32_t bytes_per_block = src.format_info->block_width * + src.format_info->block_height * + src.format_info->bits_per_pixel / 8; + + // Tiled textures can be packed; get the offset into the packed texture. + uint32_t offset_x; + uint32_t offset_y; + TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); + + auto bpp = (bytes_per_block >> 2) + + ((bytes_per_block >> 1) >> (bytes_per_block >> 2)); + for (uint32_t y = 0, output_base_offset = 0; + y < std::min(src.size_2d.block_height, src.size_2d.logical_height); + y++, output_base_offset += src.size_2d.output_pitch) { + auto input_base_offset = TextureInfo::TiledOffset2DOuter( + offset_y + y, + (src.size_2d.input_width / src.format_info->block_width), bpp); + for (uint32_t x = 0, output_offset = output_base_offset; + x < src.size_2d.block_width; x++, output_offset += bytes_per_block) { + auto input_offset = + TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp, + input_base_offset) >> + bpp; + TextureSwap(src.endianness, dest + output_offset, + src_mem + input_offset * bytes_per_block, bytes_per_block); + } + } + } + staging_buffer_.Flush(alloc); // Insert a memory barrier into the command buffer to ensure the upload has @@ -580,21 +815,15 @@ bool TextureCache::UploadTexture2D( VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 1, &barrier); - assert_true(src.size_2d.input_width >= - dest->texture_info.size_2d.output_width); - assert_true(src.size_2d.input_height >= - dest->texture_info.size_2d.output_height); - - // For now, just transfer the grid we uploaded earlier into the texture. + // Now move the converted texture into the destination. 
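
// A subtlety in the untiling loop above: the bpp expression is a branch-free
// log2(bytes_per_block) for the power-of-two block sizes that occur here,
// which matches the shift amount applied to the tiled offsets. Checking it
// for every block size the format table can produce (compile-time sketch):
static_assert((1u >> 2) + ((1u >> 1) >> (1u >> 2)) == 0, "log2(1) == 0");
static_assert((2u >> 2) + ((2u >> 1) >> (2u >> 2)) == 1, "log2(2) == 1");
static_assert((4u >> 2) + ((4u >> 1) >> (4u >> 2)) == 2, "log2(4) == 2");
static_assert((8u >> 2) + ((8u >> 1) >> (8u >> 2)) == 3, "log2(8) == 3");
static_assert((16u >> 2) + ((16u >> 1) >> (16u >> 2)) == 4, "log2(16) == 4");
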
   VkBufferImageCopy copy_region;
   copy_region.bufferOffset = alloc->offset;
-  copy_region.bufferRowLength = src.width + 1;
-  copy_region.bufferImageHeight = src.height + 1;
+  copy_region.bufferRowLength = src.size_2d.output_width;
+  copy_region.bufferImageHeight = src.size_2d.output_height;
   copy_region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
   copy_region.imageOffset = {0, 0, 0};
-  copy_region.imageExtent = {dest->texture_info.width + 1,
-                             dest->texture_info.height + 1,
-                             dest->texture_info.depth + 1};
+  copy_region.imageExtent = {src.size_2d.output_width,
+                             src.size_2d.output_height, 1};
   vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(),
                          dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1,
                          &copy_region);
@@ -659,27 +888,13 @@ VkDescriptorSet TextureCache::PrepareTextureSet(
   VkWriteDescriptorSet descriptor_writes[4];
   std::memset(descriptor_writes, 0, sizeof(descriptor_writes));
   uint32_t descriptor_write_count = 0;
-  /*
-  // TODO(DrChat): Do we really need to separate samplers and images here?
-  if (update_set_info->sampler_write_count) {
-    auto& sampler_write = descriptor_writes[descriptor_write_count++];
-    sampler_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
-    sampler_write.pNext = nullptr;
-    sampler_write.dstSet = descriptor_set;
-    sampler_write.dstBinding = 0;
-    sampler_write.dstArrayElement = 0;
-    sampler_write.descriptorCount = update_set_info->sampler_write_count;
-    sampler_write.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER;
-    sampler_write.pImageInfo = update_set_info->sampler_infos;
-  }
-  */
   // FIXME: These may not be lined up properly with the texture fetch binding
   // points!
   if (update_set_info->image_1d_write_count) {
     auto& image_write = descriptor_writes[descriptor_write_count++];
     image_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
     image_write.pNext = nullptr;
     image_write.dstSet = descriptor_set;
-    image_write.dstBinding = 1;
+    image_write.dstBinding = 0;
     image_write.dstArrayElement = 0;
     image_write.descriptorCount = update_set_info->image_1d_write_count;
     image_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
@@ -690,7 +905,7 @@ VkDescriptorSet TextureCache::PrepareTextureSet(
     image_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
     image_write.pNext = nullptr;
     image_write.dstSet = descriptor_set;
-    image_write.dstBinding = 2;
+    image_write.dstBinding = 1;
     image_write.dstArrayElement = 0;
     image_write.descriptorCount = update_set_info->image_2d_write_count;
     image_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
@@ -701,7 +916,7 @@ VkDescriptorSet TextureCache::PrepareTextureSet(
     image_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
     image_write.pNext = nullptr;
     image_write.dstSet = descriptor_set;
-    image_write.dstBinding = 3;
+    image_write.dstBinding = 2;
     image_write.dstArrayElement = 0;
     image_write.descriptorCount = update_set_info->image_3d_write_count;
     image_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
@@ -712,7 +927,7 @@ VkDescriptorSet TextureCache::PrepareTextureSet(
     image_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
     image_write.pNext = nullptr;
     image_write.dstSet = descriptor_set;
-    image_write.dstBinding = 4;
+    image_write.dstBinding = 3;
     image_write.dstArrayElement = 0;
     image_write.descriptorCount = update_set_info->image_cube_write_count;
     image_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
@@ -814,7 +1029,7 @@ bool TextureCache::SetupTextureBinding(
     return false;
   }
   image_write->imageView = view->view;
-  image_write->imageLayout =
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + image_write->imageLayout = texture->image_layout; image_write->sampler = sampler->sampler; return true; @@ -838,6 +1053,25 @@ void TextureCache::Scavenge() { } staging_buffer_.Scavenge(); + + // Clean up any invalidated textures. + invalidated_textures_mutex_.lock(); + std::vector& invalidated_textures = *invalidated_textures_; + if (invalidated_textures_ == &invalidated_textures_sets_[0]) { + invalidated_textures_ = &invalidated_textures_sets_[1]; + } else { + invalidated_textures_ = &invalidated_textures_sets_[0]; + } + invalidated_textures_mutex_.unlock(); + if (invalidated_textures.empty()) { + return; + } + + for (auto& texture : invalidated_textures) { + textures_.erase(texture->texture_info.hash()); + FreeTexture(texture); + } + invalidated_textures.clear(); } } // namespace vulkan diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h index dfc993763..b564fcc48 100644 --- a/src/xenia/gpu/vulkan/texture_cache.h +++ b/src/xenia/gpu/vulkan/texture_cache.h @@ -101,12 +101,12 @@ class TextureCache { // contains this address at an offset. Texture* LookupAddress(uint32_t guest_address, uint32_t width, uint32_t height, TextureFormat format, - uint32_t* offset_x = nullptr, - uint32_t* offset_y = nullptr); + VkOffset2D* out_offset = nullptr); // Demands a texture for the purpose of resolving from EDRAM. This either // creates a new texture or returns a previously created texture. texture_info - // is not required to be completely filled out, just guest_address and size. + // is not required to be completely filled out, just guest_address and all + // sizes. // // It's possible that this may return an image that is larger than the // requested size (e.g. resolving into a bigger texture) or an image that @@ -114,8 +114,7 @@ class TextureCache { // At the very least, it's guaranteed that the image will be large enough to // hold the requested size. Texture* DemandResolveTexture(const TextureInfo& texture_info, - TextureFormat format, uint32_t* out_offset_x, - uint32_t* out_offset_y); + TextureFormat format, VkOffset2D* out_offset); // Clears all cached content. void ClearCache(); @@ -172,11 +171,14 @@ class TextureCache { std::vector>> in_flight_sets_; - // Temporary until we have circular buffers. ui::vulkan::CircularBuffer staging_buffer_; - std::unordered_map> textures_; - std::unordered_map> samplers_; - std::vector> resolve_textures_; + std::unordered_map textures_; + std::unordered_map samplers_; + std::vector resolve_textures_; + + std::mutex invalidated_textures_mutex_; + std::vector* invalidated_textures_; + std::vector invalidated_textures_sets_[2]; struct UpdateSetInfo { // Bitmap of all 32 fetch constants and whether they have been setup yet. From 50f72b4e42364af54e287dba01de37999e4a9fc1 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 1 Apr 2016 21:52:39 -0500 Subject: [PATCH 37/77] Enable native MSAA Copy back EDRAM buffers in order by base offset. 
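
The MsaaSamples -> VkSampleCountFlagBits mapping this patch needs is written
out as three separate switches (pipeline multisample state, tile view
creation, and render pass creation). A shared helper would keep them from
drifting apart; a sketch of what that could look like (hypothetical, not part
of this patch):

    // Maps a Xenos MSAA sample count onto the equivalent Vulkan flag bit.
    inline VkSampleCountFlagBits ToVkSampleCount(MsaaSamples samples) {
      switch (samples) {
        case MsaaSamples::k1X:
          return VK_SAMPLE_COUNT_1_BIT;
        case MsaaSamples::k2X:
          return VK_SAMPLE_COUNT_2_BIT;
        case MsaaSamples::k4X:
          return VK_SAMPLE_COUNT_4_BIT;
        default:
          assert_unhandled_case(samples);
          return VK_SAMPLE_COUNT_1_BIT;
      }
    }
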
--- src/xenia/gpu/vulkan/pipeline_cache.cc | 94 ++++-- src/xenia/gpu/vulkan/pipeline_cache.h | 5 + src/xenia/gpu/vulkan/render_cache.cc | 284 +++++++++++------- src/xenia/gpu/vulkan/render_cache.h | 28 +- .../gpu/vulkan/vulkan_command_processor.cc | 88 +++--- .../gpu/vulkan/vulkan_command_processor.h | 1 + 6 files changed, 333 insertions(+), 167 deletions(-) diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc index efcaf5b46..19db3cd4f 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/pipeline_cache.cc @@ -187,6 +187,10 @@ PipelineCache::UpdateStatus PipelineCache::ConfigurePipeline( VkCommandBuffer command_buffer, const RenderState* render_state, VulkanShader* vertex_shader, VulkanShader* pixel_shader, PrimitiveType primitive_type, VkPipeline* pipeline_out) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + assert_not_null(pipeline_out); // Perform a pass over all registers and state updating our cached structures. @@ -323,6 +327,10 @@ VkShaderModule PipelineCache::GetGeometryShader(PrimitiveType primitive_type, bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, bool full_update) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + auto& regs = set_dynamic_state_registers_; bool window_offset_dirty = SetShadowRegister(®s.pa_sc_window_offset, @@ -393,20 +401,25 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, auto surface_msaa = static_cast((regs.rb_surface_info >> 16) & 0x3); // TODO(benvanik): ?? + // FIXME: Some games depend on these for proper clears (e.g. only clearing + // half the size they actually want with 4x MSAA), but others don't. + // Figure out how these games are expecting clears to be done. float window_width_scalar = 1; float window_height_scalar = 1; switch (surface_msaa) { case MsaaSamples::k1X: break; case MsaaSamples::k2X: - window_width_scalar = 2; + // ?? + window_width_scalar = window_height_scalar = 1.41421356f; break; case MsaaSamples::k4X: - window_width_scalar = 2; - window_height_scalar = 2; + window_width_scalar = window_height_scalar = 2; break; } + // window_width_scalar = window_height_scalar = 1; + // Whether each of the viewport settings are enabled. // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf bool vport_xscale_enable = (regs.pa_cl_vte_cntl & (1 << 0)) > 0; @@ -434,6 +447,7 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, float voy = vport_yoffset_enable ? regs.pa_cl_vport_yoffset : 0; float vsx = vport_xscale_enable ? regs.pa_cl_vport_xscale : 1; float vsy = vport_yscale_enable ? 
regs.pa_cl_vport_yscale : 1; + window_width_scalar = window_height_scalar = 1; float vpw = 2 * window_width_scalar * vsx; float vph = -2 * window_height_scalar * vsy; @@ -481,25 +495,25 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, vkCmdSetBlendConstants(command_buffer, regs.rb_blend_rgba); } - // VK_DYNAMIC_STATE_LINE_WIDTH - vkCmdSetLineWidth(command_buffer, 1.0f); + if (full_update) { + // VK_DYNAMIC_STATE_LINE_WIDTH + vkCmdSetLineWidth(command_buffer, 1.0f); - // VK_DYNAMIC_STATE_DEPTH_BIAS - vkCmdSetDepthBias(command_buffer, 0.0f, 0.0f, 0.0f); + // VK_DYNAMIC_STATE_DEPTH_BIAS + vkCmdSetDepthBias(command_buffer, 0.0f, 0.0f, 0.0f); - // VK_DYNAMIC_STATE_DEPTH_BOUNDS - vkCmdSetDepthBounds(command_buffer, 0.0f, 1.0f); + // VK_DYNAMIC_STATE_DEPTH_BOUNDS + vkCmdSetDepthBounds(command_buffer, 0.0f, 1.0f); - // VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK - vkCmdSetStencilCompareMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); + // VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK + vkCmdSetStencilCompareMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); - // VK_DYNAMIC_STATE_STENCIL_REFERENCE - vkCmdSetStencilReference(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); + // VK_DYNAMIC_STATE_STENCIL_REFERENCE + vkCmdSetStencilReference(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); - // VK_DYNAMIC_STATE_STENCIL_WRITE_MASK - vkCmdSetStencilWriteMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); - - // TODO(benvanik): push constants. + // VK_DYNAMIC_STATE_STENCIL_WRITE_MASK + vkCmdSetStencilWriteMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); + } bool push_constants_dirty = full_update || viewport_state_dirty; push_constants_dirty |= @@ -530,7 +544,7 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, push_constants.window_scale[1] = -1.0f; } else { push_constants.window_scale[0] = 1.0f / 2560.0f; - push_constants.window_scale[1] = -1.0f / 2560.0f; + push_constants.window_scale[1] = 1.0f / 2560.0f; } // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf @@ -756,7 +770,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateVertexInputState( : VK_FORMAT_A2R10G10B10_UNORM_PACK32; break; case VertexFormat::k_10_11_11: - assert_always("unsupported?"); + // assert_always("unsupported?"); vertex_attrib_descr.format = VK_FORMAT_B10G11R11_UFLOAT_PACK32; break; case VertexFormat::k_11_11_10: @@ -934,6 +948,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR); dirty |= SetShadowRegister(®s.multi_prim_ib_reset_index, XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX); + dirty |= SetShadowRegister(®s.rb_modecontrol, XE_GPU_REG_RB_MODECONTROL); regs.primitive_type = primitive_type; XXH64_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { @@ -947,7 +962,13 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( // TODO(benvanik): right setting? state_info.depthClampEnable = VK_FALSE; - // TODO(benvanik): use in depth-only mode? + // Discard rasterizer output in depth-only mode. + // TODO(DrChat): Figure out how to make this work properly. + /* + auto enable_mode = static_cast(regs.rb_modecontrol & 0x7); + state_info.rasterizerDiscardEnable = + enable_mode == xenos::ModeControl::kColorDepth ? 
VK_FALSE : VK_TRUE; + //*/ state_info.rasterizerDiscardEnable = VK_FALSE; bool poly_mode = ((regs.pa_su_sc_mode_cntl >> 3) & 0x3) != 0; @@ -1004,20 +1025,49 @@ PipelineCache::UpdateStatus PipelineCache::UpdateMultisampleState() { auto& regs = update_multisample_state_regs_; auto& state_info = update_multisample_state_info_; + bool dirty = false; + dirty |= SetShadowRegister(®s.pa_sc_aa_config, XE_GPU_REG_PA_SC_AA_CONFIG); + dirty |= SetShadowRegister(®s.pa_su_sc_mode_cntl, + XE_GPU_REG_PA_SU_SC_MODE_CNTL); + dirty |= SetShadowRegister(®s.rb_surface_info, XE_GPU_REG_RB_SURFACE_INFO); + XXH64_update(&hash_state_, ®s, sizeof(regs)); + if (!dirty) { + return UpdateStatus::kCompatible; + } + state_info.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; state_info.pNext = nullptr; state_info.flags = 0; // PA_SC_AA_CONFIG MSAA_NUM_SAMPLES // PA_SU_SC_MODE_CNTL MSAA_ENABLE - state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + // state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + //* + auto msaa_num_samples = + static_cast((regs.rb_surface_info >> 16) & 0x3); + switch (msaa_num_samples) { + case MsaaSamples::k1X: + state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + break; + case MsaaSamples::k2X: + state_info.rasterizationSamples = VK_SAMPLE_COUNT_2_BIT; + break; + case MsaaSamples::k4X: + state_info.rasterizationSamples = VK_SAMPLE_COUNT_4_BIT; + break; + default: + assert_unhandled_case(msaa_num_samples); + break; + } + //*/ + state_info.sampleShadingEnable = VK_FALSE; state_info.minSampleShading = 0; state_info.pSampleMask = nullptr; state_info.alphaToCoverageEnable = VK_FALSE; state_info.alphaToOneEnable = VK_FALSE; - return UpdateStatus::kCompatible; + return UpdateStatus::kMismatch; } PipelineCache::UpdateStatus PipelineCache::UpdateDepthStencilState() { diff --git a/src/xenia/gpu/vulkan/pipeline_cache.h b/src/xenia/gpu/vulkan/pipeline_cache.h index 66b2e87ef..f240b9c0d 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.h +++ b/src/xenia/gpu/vulkan/pipeline_cache.h @@ -211,6 +211,7 @@ class PipelineCache { uint32_t pa_sc_screen_scissor_tl; uint32_t pa_sc_screen_scissor_br; uint32_t multi_prim_ib_reset_index; + uint32_t rb_modecontrol; UpdateRasterizationStateRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } @@ -218,6 +219,10 @@ class PipelineCache { VkPipelineRasterizationStateCreateInfo update_rasterization_state_info_; struct UpdateMultisampleStateeRegisters { + uint32_t pa_sc_aa_config; + uint32_t pa_su_sc_mode_cntl; + uint32_t rb_surface_info; + UpdateMultisampleStateeRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } } update_multisample_state_regs_; diff --git a/src/xenia/gpu/vulkan/render_cache.cc b/src/xenia/gpu/vulkan/render_cache.cc index 334a1215f..7e0528866 100644 --- a/src/xenia/gpu/vulkan/render_cache.cc +++ b/src/xenia/gpu/vulkan/render_cache.cc @@ -165,8 +165,23 @@ CachedTileView::CachedTileView(ui::vulkan::VulkanDevice* device, image_info.extent.depth = 1; image_info.mipLevels = 1; image_info.arrayLayers = 1; - image_info.samples = - static_cast(VK_SAMPLE_COUNT_1_BIT); + // image_info.samples = VK_SAMPLE_COUNT_1_BIT; + //* + auto msaa_samples = static_cast(key.msaa_samples); + switch (msaa_samples) { + case MsaaSamples::k1X: + image_info.samples = VK_SAMPLE_COUNT_1_BIT; + break; + case MsaaSamples::k2X: + image_info.samples = VK_SAMPLE_COUNT_2_BIT; + break; + case MsaaSamples::k4X: + image_info.samples = VK_SAMPLE_COUNT_4_BIT; + break; + default: + assert_unhandled_case(msaa_samples); 
+ } + //*/ image_info.tiling = VK_IMAGE_TILING_OPTIMAL; image_info.usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | @@ -322,13 +337,29 @@ CachedRenderPass::CachedRenderPass(VkDevice device, : device_(device) { std::memcpy(&config, &desired_config, sizeof(config)); + VkSampleCountFlagBits sample_count; + switch (desired_config.surface_msaa) { + case MsaaSamples::k1X: + sample_count = VK_SAMPLE_COUNT_1_BIT; + break; + case MsaaSamples::k2X: + sample_count = VK_SAMPLE_COUNT_2_BIT; + break; + case MsaaSamples::k4X: + sample_count = VK_SAMPLE_COUNT_4_BIT; + break; + default: + assert_unhandled_case(desired_config.surface_msaa); + break; + } + // Initialize all attachments to default unused. // As we set layout(location=RT) in shaders we must always provide 4. VkAttachmentDescription attachments[5]; for (int i = 0; i < 4; ++i) { attachments[i].flags = 0; attachments[i].format = VK_FORMAT_UNDEFINED; - attachments[i].samples = VK_SAMPLE_COUNT_1_BIT; + attachments[i].samples = sample_count; attachments[i].loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; attachments[i].storeOp = VK_ATTACHMENT_STORE_OP_STORE; attachments[i].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; @@ -339,7 +370,7 @@ CachedRenderPass::CachedRenderPass(VkDevice device, auto& depth_stencil_attachment = attachments[4]; depth_stencil_attachment.flags = 0; depth_stencil_attachment.format = VK_FORMAT_UNDEFINED; - depth_stencil_attachment.samples = VK_SAMPLE_COUNT_1_BIT; + depth_stencil_attachment.samples = sample_count; depth_stencil_attachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; depth_stencil_attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; depth_stencil_attachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; @@ -404,6 +435,10 @@ CachedRenderPass::~CachedRenderPass() { bool CachedRenderPass::IsCompatible( const RenderConfiguration& desired_config) const { + if (config.surface_msaa != desired_config.surface_msaa) { + return false; + } + for (int i = 0; i < 4; ++i) { // TODO(benvanik): allow compatible vulkan formats. 
if (config.color[i].format != desired_config.color[i].format) { @@ -503,12 +538,18 @@ bool RenderCache::dirty() const { regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; dirty |= cur_regs.pa_sc_window_scissor_br != regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; + dirty |= (cur_regs.rb_depthcontrol & (0x4 | 0x2)) != + (regs[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2)); return dirty; } const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, VulkanShader* vertex_shader, VulkanShader* pixel_shader) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + assert_null(current_command_buffer_); current_command_buffer_ = command_buffer; @@ -520,6 +561,7 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, bool dirty = false; dirty |= SetShadowRegister(®s.rb_modecontrol, XE_GPU_REG_RB_MODECONTROL); dirty |= SetShadowRegister(®s.rb_surface_info, XE_GPU_REG_RB_SURFACE_INFO); + dirty |= SetShadowRegister(®s.rb_color_mask, XE_GPU_REG_RB_COLOR_MASK); dirty |= SetShadowRegister(®s.rb_color_info, XE_GPU_REG_RB_COLOR_INFO); dirty |= SetShadowRegister(®s.rb_color1_info, XE_GPU_REG_RB_COLOR1_INFO); dirty |= SetShadowRegister(®s.rb_color2_info, XE_GPU_REG_RB_COLOR2_INFO); @@ -529,7 +571,11 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL); dirty |= SetShadowRegister(®s.pa_sc_window_scissor_br, XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR); - regs.rb_depthcontrol = register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32; + dirty |= + (regs.rb_depthcontrol & (0x4 | 0x2)) != + (register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2)); + regs.rb_depthcontrol = + register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2); if (!dirty && current_state_.render_pass) { // No registers have changed so we can reuse the previous render pass - // just begin with what we had. @@ -549,7 +595,10 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, // Speculatively see if targets are actually used so we can skip copies for (int i = 0; i < 4; i++) { - config->color[i].used = pixel_shader->writes_color_target(i); + uint32_t color_mask = (regs.rb_color_mask >> (i * 4)) & 0xF; + config->color[i].used = + config->mode_control == xenos::ModeControl::kColorDepth && + color_mask != 0; } config->depth_stencil.used = !!(regs.rb_depthcontrol & (0x4 | 0x2)); @@ -558,66 +607,20 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, current_state_.framebuffer = framebuffer; current_state_.framebuffer_handle = framebuffer->handle; - VkBufferMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = edram_buffer_; - barrier.offset = 0; - barrier.size = 0; - - // Copy EDRAM buffer into render targets with tight packing. 
-  VkBufferImageCopy region;
-  region.bufferRowLength = 0;
-  region.bufferImageHeight = 0;
-  region.imageOffset = {0, 0, 0};
-
   // Depth
   auto depth_target = current_state_.framebuffer->depth_stencil_attachment;
   if (depth_target && current_state_.config.depth_stencil.used) {
-    region.imageSubresource = {
-        VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1};
-    region.bufferOffset = depth_target->key.tile_offset * 5120;
-
-    // Wait for any potential copies to finish.
-    barrier.offset = region.bufferOffset;
-    barrier.size = depth_target->key.tile_width * 80 *
-                   depth_target->key.tile_height * 16 * 4;
-    vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
-                         VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1,
-                         &barrier, 0, nullptr);
-
-    region.imageExtent = {depth_target->key.tile_width * 80u,
-                          depth_target->key.tile_height * 16u, 1};
-    vkCmdCopyBufferToImage(command_buffer, edram_buffer_, depth_target->image,
-                           VK_IMAGE_LAYOUT_GENERAL, 1, &region);
+    UpdateTileView(command_buffer, depth_target, true);
   }
 
   // Color
-  region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
   for (int i = 0; i < 4; i++) {
     auto target = current_state_.framebuffer->color_attachments[i];
     if (!target || !current_state_.config.color[i].used) {
       continue;
     }
 
-    region.bufferOffset = target->key.tile_offset * 5120;
-
-    // Wait for any potential copies to finish.
-    barrier.offset = region.bufferOffset;
-    barrier.size =
-        target->key.tile_width * 80 * target->key.tile_height * 16 * 4;
-    vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
-                         VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1,
-                         &barrier, 0, nullptr);
-
-    region.imageExtent = {target->key.tile_width * 80u,
-                          target->key.tile_height * 16u, 1};
-    vkCmdCopyBufferToImage(command_buffer, edram_buffer_, target->image,
-                           VK_IMAGE_LAYOUT_GENERAL, 1, &region);
+    UpdateTileView(command_buffer, target, true);
   }
 }
 
 if (!render_pass) {
@@ -758,6 +761,7 @@ bool RenderCache::ConfigureRenderPass(VkCommandBuffer command_buffer,
     color_key.tile_width = xe::round_up(config->surface_pitch_px, 80) / 80;
     color_key.tile_height = xe::round_up(config->surface_height_px, 16) / 16;
     color_key.color_or_depth = 1;
+    color_key.msaa_samples = static_cast<uint16_t>(config->surface_msaa);
     color_key.edram_format = static_cast<uint16_t>(config->color[i].format);
     target_color_attachments[i] =
         FindOrCreateTileView(command_buffer, color_key);
@@ -774,6 +778,8 @@ bool RenderCache::ConfigureRenderPass(VkCommandBuffer command_buffer,
   depth_stencil_key.tile_height =
       xe::round_up(config->surface_height_px, 16) / 16;
   depth_stencil_key.color_or_depth = 0;
+  depth_stencil_key.msaa_samples =
+      static_cast<uint16_t>(config->surface_msaa);
   depth_stencil_key.edram_format =
       static_cast<uint16_t>(config->depth_stencil.format);
   auto target_depth_stencil_attachment =
@@ -810,6 +816,51 @@ CachedTileView* RenderCache::FindOrCreateTileView(
   return tile_view;
 }
 
+void RenderCache::UpdateTileView(VkCommandBuffer command_buffer,
+                                 CachedTileView* view, bool load,
+                                 bool insert_barrier) {
+  if (insert_barrier) {
+    VkBufferMemoryBarrier barrier;
+    barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
+    barrier.pNext = nullptr;
+    if (load) {
+      barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+      barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
+    } else {
+      barrier.srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
+      barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+    }
+    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.buffer = edram_buffer_;
+    barrier.offset = view->key.tile_offset * 5120;
+    barrier.size = view->key.tile_width * 80 * view->key.tile_height * 16 * 4;
+    vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                         VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1,
+                         &barrier, 0, nullptr);
+  }
+
+  VkBufferImageCopy region;
+  region.bufferOffset = view->key.tile_offset * 5120;
+  region.bufferRowLength = 0;
+  region.bufferImageHeight = 0;
+  region.imageSubresource = {0, 0, 0, 1};
+  region.imageSubresource.aspectMask =
+      view->key.color_or_depth
+          ? VK_IMAGE_ASPECT_COLOR_BIT
+          : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+  region.imageOffset = {0, 0, 0};
+  region.imageExtent = {view->key.tile_width * 80u, view->key.tile_height * 16u,
+                        1};
+  if (load) {
+    vkCmdCopyBufferToImage(command_buffer, edram_buffer_, view->image,
+                           VK_IMAGE_LAYOUT_GENERAL, 1, &region);
+  } else {
+    vkCmdCopyImageToBuffer(command_buffer, view->image, VK_IMAGE_LAYOUT_GENERAL,
+                           edram_buffer_, 1, &region);
+  }
+}
+
 CachedTileView* RenderCache::FindTileView(const TileViewKey& view_key) const {
   // Check the cache.
   // TODO(benvanik): better lookup.
@@ -837,35 +888,31 @@ void RenderCache::EndRenderPass() {
   // can't get the correct height atm) and we may end up overwriting the valid
   // contents of another render target by mistake! Need to reorder copy commands
   // to avoid this.
-  VkBufferImageCopy region;
-  region.bufferRowLength = 0;
-  region.bufferImageHeight = 0;
-  region.imageOffset = {0, 0, 0};
 
-  // Depth/stencil
+  std::vector<CachedTileView*> cached_views;
+
+  // Depth
   auto depth_target = current_state_.framebuffer->depth_stencil_attachment;
   if (depth_target && current_state_.config.depth_stencil.used) {
-    region.imageSubresource = {
-        VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1};
-    region.bufferOffset = depth_target->key.tile_offset * 5120;
-    region.imageExtent = {depth_target->key.tile_width * 80u,
-                          depth_target->key.tile_height * 16u, 1};
-    vkCmdCopyImageToBuffer(current_command_buffer_, depth_target->image,
-                           VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, &region);
+    cached_views.push_back(depth_target);
   }
 
   // Color
-  region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
   for (int i = 0; i < 4; i++) {
     auto target = current_state_.framebuffer->color_attachments[i];
     if (!target || !current_state_.config.color[i].used) {
       continue;
     }
 
-    region.bufferOffset = target->key.tile_offset * 5120;
-    region.imageExtent = {target->key.tile_width * 80u,
-                          target->key.tile_height * 16u, 1};
-    vkCmdCopyImageToBuffer(current_command_buffer_, target->image,
-                           VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, &region);
+    cached_views.push_back(target);
+  }
+
+  std::sort(
+      cached_views.begin(), cached_views.end(),
+      [](CachedTileView const* a, CachedTileView const* b) { return *a < *b; });
+
+  for (auto view : cached_views) {
+    UpdateTileView(current_command_buffer_, view, false, false);
   }
 
   current_command_buffer_ = nullptr;
@@ -920,6 +967,7 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer,
                        &buffer_barrier, 0, nullptr);
 
   // Issue the copy command.
+  // TODO(DrChat): Stencil copies.
   VkBufferImageCopy region;
   region.bufferOffset = edram_base * 5120;
   region.bufferImageHeight = 0;
   region.bufferRowLength = 0;
   region.imageOffset = offset;
   region.imageExtent = extents;
   region.imageSubresource = {0, 0, 0, 1};
   region.imageSubresource.aspectMask =
-      color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT
-                     : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+      color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT : VK_IMAGE_ASPECT_DEPTH_BIT;
   vkCmdCopyBufferToImage(command_buffer, edram_buffer_, image, image_layout, 1,
                          &region);
@@ -947,13 +994,15 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer,
 
 void RenderCache::BlitToImage(VkCommandBuffer command_buffer,
                               uint32_t edram_base, uint32_t pitch,
-                              uint32_t height, VkImage image,
-                              VkImageLayout image_layout, bool color_or_depth,
-                              uint32_t format, VkFilter filter,
-                              VkOffset3D offset, VkExtent3D extents) {
+                              uint32_t height, MsaaSamples num_samples,
+                              VkImage image, VkImageLayout image_layout,
+                              bool color_or_depth, uint32_t format,
+                              VkFilter filter, VkOffset3D offset,
+                              VkExtent3D extents) {
   // Grab a tile view that represents the source image.
   TileViewKey key;
   key.color_or_depth = color_or_depth ? 1 : 0;
+  key.msaa_samples = static_cast<uint16_t>(num_samples);
   key.edram_format = format;
   key.tile_offset = edram_base;
   key.tile_width = xe::round_up(pitch, 80) / 80;
@@ -979,14 +1028,14 @@ void RenderCache::BlitToImage(VkCommandBuffer command_buffer,
 
   // Update the tile view with current EDRAM contents.
   // TODO: Heuristics to determine if this copy is avoidable.
+  // TODO(DrChat): Stencil copies.
   VkBufferImageCopy buffer_copy;
   buffer_copy.bufferOffset = edram_base * 5120;
   buffer_copy.bufferImageHeight = 0;
   buffer_copy.bufferRowLength = 0;
   buffer_copy.imageSubresource = {0, 0, 0, 1};
   buffer_copy.imageSubresource.aspectMask =
-      color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT
-                     : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+      color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT : VK_IMAGE_ASPECT_DEPTH_BIT;
   buffer_copy.imageExtent = {key.tile_width * 80u, key.tile_height * 16u, 1u};
   buffer_copy.imageOffset = {0, 0, 0};
   vkCmdCopyBufferToImage(command_buffer, edram_buffer_, tile_view->image,
@@ -1018,26 +1067,48 @@ void RenderCache::BlitToImage(VkCommandBuffer command_buffer,
   assert_true(extents.height <= key.tile_height * 16u);
 
   // Now issue the blit to the destination.
-  // TODO: Resolve to destination if necessary.
-  VkImageBlit image_blit;
-  image_blit.srcSubresource = {0, 0, 0, 1};
-  image_blit.srcSubresource.aspectMask =
-      color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT
-                     : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
-  image_blit.srcOffsets[0] = {0, 0, 0};
-  image_blit.srcOffsets[1] = {int32_t(extents.width), int32_t(extents.height),
-                              int32_t(extents.depth)};
+  if (num_samples == MsaaSamples::k1X) {
+    VkImageBlit image_blit;
+    image_blit.srcSubresource = {0, 0, 0, 1};
+    image_blit.srcSubresource.aspectMask =
+        color_or_depth
+            ? VK_IMAGE_ASPECT_COLOR_BIT
+            : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+    image_blit.srcOffsets[0] = {0, 0, 0};
+    image_blit.srcOffsets[1] = {int32_t(extents.width), int32_t(extents.height),
+                                int32_t(extents.depth)};
 
-  image_blit.dstSubresource = {0, 0, 0, 1};
-  image_blit.dstSubresource.aspectMask =
-      color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT
-                     : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
-  image_blit.dstOffsets[0] = offset;
-  image_blit.dstOffsets[1] = {offset.x + int32_t(extents.width),
-                              offset.y + int32_t(extents.height),
-                              offset.z + int32_t(extents.depth)};
-  vkCmdBlitImage(command_buffer, tile_view->image, VK_IMAGE_LAYOUT_GENERAL,
-                 image, image_layout, 1, &image_blit, filter);
+    image_blit.dstSubresource = {0, 0, 0, 1};
+    image_blit.dstSubresource.aspectMask =
+        color_or_depth
+            ?
VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + image_blit.dstOffsets[0] = offset; + image_blit.dstOffsets[1] = {offset.x + int32_t(extents.width), + offset.y + int32_t(extents.height), + offset.z + int32_t(extents.depth)}; + vkCmdBlitImage(command_buffer, tile_view->image, VK_IMAGE_LAYOUT_GENERAL, + image, image_layout, 1, &image_blit, filter); + } else { + VkImageResolve image_resolve; + image_resolve.srcSubresource = {0, 0, 0, 1}; + image_resolve.srcSubresource.aspectMask = + color_or_depth + ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + image_resolve.srcOffset = {0, 0, 0}; + + image_resolve.dstSubresource = {0, 0, 0, 1}; + image_resolve.dstSubresource.aspectMask = + color_or_depth + ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + image_resolve.dstOffset = offset; + + image_resolve.extent = extents; + vkCmdResolveImage(command_buffer, tile_view->image, VK_IMAGE_LAYOUT_GENERAL, + image, image_layout, 1, &image_resolve); + } // Transition the image back into its previous layout. image_barrier.srcAccessMask = image_barrier.dstAccessMask; @@ -1052,13 +1123,14 @@ void RenderCache::ClearEDRAMColor(VkCommandBuffer command_buffer, uint32_t edram_base, ColorRenderTargetFormat format, uint32_t pitch, uint32_t height, - float* color) { + MsaaSamples num_samples, float* color) { // TODO: For formats <= 4 bpp, we can directly fill the EDRAM buffer. Just // need to detect this and calculate a value. // Grab a tile view (as we need to clear an image first) TileViewKey key; key.color_or_depth = 1; + key.msaa_samples = static_cast(num_samples); key.edram_format = static_cast(format); key.tile_offset = edram_base; key.tile_width = xe::round_up(pitch, 80) / 80; @@ -1091,13 +1163,15 @@ void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, uint32_t edram_base, DepthRenderTargetFormat format, uint32_t pitch, uint32_t height, - float depth, uint32_t stencil) { + MsaaSamples num_samples, float depth, + uint32_t stencil) { // TODO: For formats <= 4 bpp, we can directly fill the EDRAM buffer. Just // need to detect this and calculate a value. // Grab a tile view (as we need to clear an image first) TileViewKey key; key.color_or_depth = 0; + key.msaa_samples = static_cast(num_samples); key.edram_format = static_cast(format); key.tile_offset = edram_base; key.tile_width = xe::round_up(pitch, 80) / 80; @@ -1117,12 +1191,13 @@ void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, VK_IMAGE_LAYOUT_GENERAL, &clear_value, 1, &range); // Copy image back into EDRAM buffer + // TODO(DrChat): Stencil copies. 
   VkBufferImageCopy copy_range;
   copy_range.bufferOffset = edram_base * 5120;
   copy_range.bufferImageHeight = 0;
   copy_range.bufferRowLength = 0;
   copy_range.imageSubresource = {
-      VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1,
+      VK_IMAGE_ASPECT_DEPTH_BIT, 0, 0, 1,
   };
   copy_range.imageExtent = {key.tile_width * 80u, key.tile_height * 16u, 1u};
   copy_range.imageOffset = {0, 0, 0};
@@ -1131,6 +1206,11 @@ void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer,
                          &copy_range);
 }
 
+void RenderCache::FillEDRAM(VkCommandBuffer command_buffer, uint32_t value) {
+  vkCmdFillBuffer(command_buffer, edram_buffer_, 0, kEdramBufferCapacity,
+                  value);
+}
+
 bool RenderCache::SetShadowRegister(uint32_t* dest, uint32_t register_name) {
   uint32_t value = register_file_->values[register_name].u32;
   if (*dest == value) {
diff --git a/src/xenia/gpu/vulkan/render_cache.h b/src/xenia/gpu/vulkan/render_cache.h
index 2e8d1c5fe..86edac7bc 100644
--- a/src/xenia/gpu/vulkan/render_cache.h
+++ b/src/xenia/gpu/vulkan/render_cache.h
@@ -38,9 +38,9 @@ struct TileViewKey {
   // 1 if format is ColorRenderTargetFormat, else DepthRenderTargetFormat.
   uint16_t color_or_depth : 1;
   // Surface MSAA samples
-  // uint16_t msaa_samples : 2;
+  uint16_t msaa_samples : 2;
   // Either ColorRenderTargetFormat or DepthRenderTargetFormat.
-  uint16_t edram_format : 15;  // 13;
+  uint16_t edram_format : 13;
 };
 static_assert(sizeof(TileViewKey) == 8, "Key must be tightly packed");
@@ -69,6 +69,10 @@ class CachedTileView {
     return *a == *b;
   }
 
+  bool operator<(const CachedTileView& other) const {
+    return key.tile_offset < other.key.tile_offset;
+  }
+
  private:
   VkDevice device_ = nullptr;
 };
@@ -278,22 +282,26 @@ class RenderCache {
   // Queues commands to blit EDRAM contents into an image.
   // The command buffer must not be inside of a render pass when calling this.
   void BlitToImage(VkCommandBuffer command_buffer, uint32_t edram_base,
-                   uint32_t pitch, uint32_t height, VkImage image,
-                   VkImageLayout image_layout, bool color_or_depth,
-                   uint32_t format, VkFilter filter, VkOffset3D offset,
-                   VkExtent3D extents);
+                   uint32_t pitch, uint32_t height, MsaaSamples num_samples,
+                   VkImage image, VkImageLayout image_layout,
+                   bool color_or_depth, uint32_t format, VkFilter filter,
+                   VkOffset3D offset, VkExtent3D extents);
 
   // Queues commands to clear EDRAM contents with a solid color.
   // The command buffer must not be inside of a render pass when calling this.
   void ClearEDRAMColor(VkCommandBuffer command_buffer, uint32_t edram_base,
                        ColorRenderTargetFormat format, uint32_t pitch,
-                       uint32_t height, float* color);
+                       uint32_t height, MsaaSamples num_samples, float* color);
   // Queues commands to clear EDRAM contents with depth/stencil values.
   // The command buffer must not be inside of a render pass when calling this.
   void ClearEDRAMDepthStencil(VkCommandBuffer command_buffer,
                               uint32_t edram_base,
                               DepthRenderTargetFormat format, uint32_t pitch,
-                              uint32_t height, float depth, uint32_t stencil);
+                              uint32_t height, MsaaSamples num_samples,
+                              float depth, uint32_t stencil);
+  // Queues commands to fill EDRAM contents with a constant value.
+  // The command buffer must not be inside of a render pass when calling this.
+  void FillEDRAM(VkCommandBuffer command_buffer, uint32_t value);
 
  private:
  // Parses the current state into a configuration object.
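
// On the TileViewKey change above: color_or_depth (1 bit), msaa_samples
// (2 bits), and edram_format (13 bits) still pack into a single uint16_t, so
// together with the three uint16_t tile fields the existing 8-byte
// static_assert keeps holding. Restated as compile-time arithmetic (sketch):
static_assert(1 + 2 + 13 == 16, "bitfields fill exactly one uint16_t");
static_assert(3 * sizeof(uint16_t) + sizeof(uint16_t) == 8,
              "three tile fields plus the packed field give an 8-byte key");
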
@@ -306,6 +314,9 @@ class RenderCache { CachedTileView* FindOrCreateTileView(VkCommandBuffer command_buffer, const TileViewKey& view_key); + void UpdateTileView(VkCommandBuffer command_buffer, CachedTileView* view, + bool load, bool insert_barrier = true); + // Gets or creates a render pass and frame buffer for the given configuration. // This attempts to reuse as much as possible across render passes and // framebuffers. @@ -335,6 +346,7 @@ class RenderCache { struct ShadowRegisters { uint32_t rb_modecontrol; uint32_t rb_surface_info; + uint32_t rb_color_mask; uint32_t rb_color_info; uint32_t rb_color1_info; uint32_t rb_color2_info; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 1d559d896..fd604733b 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -152,19 +152,8 @@ void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr, // TODO(benvanik): move to CP or to host (trace dump, etc). // This only needs to surround a vkQueueSubmit. - static uint32_t frame = 0; - if (device_->is_renderdoc_attached() && - (FLAGS_vulkan_renderdoc_capture_all || - trace_state_ == TraceState::kSingleFrame)) { - if (queue_mutex_) { - queue_mutex_->lock(); - } - - device_->BeginRenderDocFrameCapture(); - - if (queue_mutex_) { - queue_mutex_->unlock(); - } + if (queue_mutex_) { + queue_mutex_->lock(); } // TODO(DrChat): If setup buffer is empty, don't bother queueing it up. @@ -182,45 +171,37 @@ void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr, submit_info.signalSemaphoreCount = 0; submit_info.pSignalSemaphores = nullptr; if (queue_mutex_) { - queue_mutex_->lock(); + // queue_mutex_->lock(); } status = vkQueueSubmit(queue_, 1, &submit_info, *current_batch_fence_); if (queue_mutex_) { - queue_mutex_->unlock(); + // queue_mutex_->unlock(); } CheckResult(status, "vkQueueSubmit"); + // TODO(DrChat): Disable this completely. VkFence fences[] = {*current_batch_fence_}; status = vkWaitForFences(*device_, 1, fences, true, -1); CheckResult(status, "vkWaitForFences"); - if (device_->is_renderdoc_attached() && - (FLAGS_vulkan_renderdoc_capture_all || - trace_state_ == TraceState::kSingleFrame)) { - if (queue_mutex_) { - queue_mutex_->lock(); - } - + if (device_->is_renderdoc_attached() && capturing_) { device_->EndRenderDocFrameCapture(); + capturing_ = false; // HACK(DrChat): Used b/c I disabled trace saving code in the CP. // Remove later. if (!trace_writer_.is_open()) { trace_state_ = TraceState::kDisabled; } - - if (queue_mutex_) { - queue_mutex_->unlock(); - } + } + if (queue_mutex_) { + queue_mutex_->unlock(); } // Scavenging. 
current_command_buffer_ = nullptr; current_setup_buffer_ = nullptr; - while (command_buffer_pool_->has_pending()) { - command_buffer_pool_->Scavenge(); - xe::threading::MaybeYield(); - } + command_buffer_pool_->Scavenge(); texture_cache_->Scavenge(); current_batch_fence_ = nullptr; @@ -331,6 +312,22 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, vkBeginCommandBuffer(current_setup_buffer_, &command_buffer_begin_info); CheckResult(status, "vkBeginCommandBuffer"); + static uint32_t frame = 0; + if (device_->is_renderdoc_attached() && !capturing_ && + (FLAGS_vulkan_renderdoc_capture_all || + trace_state_ == TraceState::kSingleFrame)) { + if (queue_mutex_) { + queue_mutex_->lock(); + } + + capturing_ = true; + device_->BeginRenderDocFrameCapture(); + + if (queue_mutex_) { + queue_mutex_->unlock(); + } + } + started_command_buffer = true; } auto command_buffer = current_command_buffer_; @@ -357,6 +354,10 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, current_render_state_ = render_cache_->BeginRenderPass( command_buffer, vertex_shader, pixel_shader); if (!current_render_state_) { + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; return false; } } @@ -378,18 +379,30 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, // Pass registers to the shaders. if (!PopulateConstants(command_buffer, vertex_shader, pixel_shader)) { render_cache_->EndRenderPass(); + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; return false; } // Upload and bind index buffer data (if we have any). if (!PopulateIndexBuffer(command_buffer, index_buffer_info)) { render_cache_->EndRenderPass(); + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; return false; } // Upload and bind all vertex buffer data. if (!PopulateVertexBuffers(command_buffer, vertex_shader)) { render_cache_->EndRenderPass(); + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; return false; } @@ -423,6 +436,10 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, bool VulkanCommandProcessor::PopulateConstants(VkCommandBuffer command_buffer, VulkanShader* vertex_shader, VulkanShader* pixel_shader) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + // Upload the constants the shaders require. // These are optional, and if none are defined 0 will be returned. auto constant_offsets = buffer_cache_->UploadConstantRegisters( @@ -742,7 +759,7 @@ bool VulkanCommandProcessor::IssueCopy() { tex_info.size_2d.input_height = dest_block_height; tex_info.size_2d.input_pitch = copy_dest_pitch * 4; auto texture = texture_cache_->DemandResolveTexture( - tex_info, ColorFormatToTextureFormat(copy_dest_format), nullptr, nullptr); + tex_info, ColorFormatToTextureFormat(copy_dest_format), nullptr); if (texture->image_layout == VK_IMAGE_LAYOUT_UNDEFINED) { // Transition the image to a general layout. 
VkImageMemoryBarrier image_barrier; @@ -810,8 +827,9 @@ bool VulkanCommandProcessor::IssueCopy() { case CopyCommand::kConvert: render_cache_->BlitToImage( command_buffer, edram_base, surface_pitch, resolve_extent.height, - texture->image, texture->image_layout, copy_src_select <= 3, - src_format, VK_FILTER_LINEAR, resolve_offset, resolve_extent); + surface_msaa, texture->image, texture->image_layout, + copy_src_select <= 3, src_format, VK_FILTER_LINEAR, resolve_offset, + resolve_extent); break; case CopyCommand::kConstantOne: @@ -839,7 +857,7 @@ bool VulkanCommandProcessor::IssueCopy() { // TODO(DrChat): Do we know the surface height at this point? render_cache_->ClearEDRAMColor(command_buffer, color_edram_base, color_format, surface_pitch, - resolve_extent.height, color); + resolve_extent.height, surface_msaa, color); } if (depth_clear_enabled) { @@ -850,7 +868,7 @@ bool VulkanCommandProcessor::IssueCopy() { // TODO(DrChat): Do we know the surface height at this point? render_cache_->ClearEDRAMDepthStencil( command_buffer, depth_edram_base, depth_format, surface_pitch, - resolve_extent.height, depth, stencil); + resolve_extent.height, surface_msaa, depth, stencil); } return true; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index c87c515c0..287e4f65e 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -94,6 +94,7 @@ class VulkanCommandProcessor : public CommandProcessor { // Last copy base address, for debugging only. uint32_t last_copy_base_ = 0; + bool capturing_ = false; std::unique_ptr buffer_cache_; std::unique_ptr pipeline_cache_; From f9a634ad25c1b05679a94d87885cda8beb2c31b8 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 1 Apr 2016 21:53:46 -0500 Subject: [PATCH 38/77] CircularBuffer remove Discard functionality and allow rotation --- src/xenia/ui/vulkan/circular_buffer.cc | 25 +++++++------------------ src/xenia/ui/vulkan/circular_buffer.h | 4 +++- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/src/xenia/ui/vulkan/circular_buffer.cc b/src/xenia/ui/vulkan/circular_buffer.cc index 4cc22366f..110cd6c36 100644 --- a/src/xenia/ui/vulkan/circular_buffer.cc +++ b/src/xenia/ui/vulkan/circular_buffer.cc @@ -139,7 +139,6 @@ CircularBuffer::Allocation* CircularBuffer::Acquire( assert(read_head_ == write_head_); assert(capacity_ > aligned_length); - read_head_ = 0; write_head_ = length; auto alloc = new Allocation(); @@ -200,19 +199,6 @@ CircularBuffer::Allocation* CircularBuffer::Acquire( return nullptr; } -void CircularBuffer::Discard(Allocation* allocation) { - // TODO: Revert write_head_ (only if this is the last alloc though) - // Or maybe just disallow discards. - for (auto it = allocations_.begin(); it != allocations_.end(); ++it) { - if (*it == allocation) { - allocations_.erase(it); - break; - } - } - - delete allocation; -} - void CircularBuffer::Flush(Allocation* allocation) { VkMappedMemoryRange range; range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; @@ -239,7 +225,13 @@ void CircularBuffer::Scavenge() { break; } - read_head_ = (read_head_ + (*it)->aligned_length) % capacity_; + if (capacity_ - read_head_ < (*it)->aligned_length) { + // This allocation is stored at the beginning of the buffer. 
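+      // (It did not fit between the old read head and the end of the
+      //  buffer, so Acquire placed it at offset 0; jump the read head to
+      //  the buffer start and step past the allocation from there.)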
+ read_head_ = (*it)->aligned_length; + } else { + read_head_ += (*it)->aligned_length; + } + delete *it; it = allocations_.erase(it); } @@ -247,9 +239,6 @@ void CircularBuffer::Scavenge() { if (allocations_.empty()) { // Reset R/W heads. read_head_ = write_head_ = 0; - } else { - // FIXME: Haven't verified this works correctly when actually rotating :P - assert_always(); } } diff --git a/src/xenia/ui/vulkan/circular_buffer.h b/src/xenia/ui/vulkan/circular_buffer.h index 2c036c685..6f0ec2f82 100644 --- a/src/xenia/ui/vulkan/circular_buffer.h +++ b/src/xenia/ui/vulkan/circular_buffer.h @@ -52,8 +52,10 @@ class CircularBuffer { uint8_t* host_base() const { return host_base_; } bool CanAcquire(VkDeviceSize length); + + // Acquires space to hold memory. This allocation is only freed when the fence + // reaches the signaled state. Allocation* Acquire(VkDeviceSize length, std::shared_ptr fence); - void Discard(Allocation* allocation); void Flush(Allocation* allocation); // Clears all allocations, regardless of whether they've been consumed or not. From 3726064af5ffee371bbda7144d197d3eabcb44fd Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 1 Apr 2016 22:03:29 -0500 Subject: [PATCH 39/77] Can't use CmdCopyBufferToImage or vice versa for depth and stencil. --- src/xenia/gpu/vulkan/render_cache.cc | 32 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/xenia/gpu/vulkan/render_cache.cc b/src/xenia/gpu/vulkan/render_cache.cc index 7e0528866..3df5e4c9e 100644 --- a/src/xenia/gpu/vulkan/render_cache.cc +++ b/src/xenia/gpu/vulkan/render_cache.cc @@ -339,18 +339,18 @@ CachedRenderPass::CachedRenderPass(VkDevice device, VkSampleCountFlagBits sample_count; switch (desired_config.surface_msaa) { - case MsaaSamples::k1X: - sample_count = VK_SAMPLE_COUNT_1_BIT; - break; - case MsaaSamples::k2X: - sample_count = VK_SAMPLE_COUNT_2_BIT; - break; - case MsaaSamples::k4X: - sample_count = VK_SAMPLE_COUNT_4_BIT; - break; - default: - assert_unhandled_case(desired_config.surface_msaa); - break; + case MsaaSamples::k1X: + sample_count = VK_SAMPLE_COUNT_1_BIT; + break; + case MsaaSamples::k2X: + sample_count = VK_SAMPLE_COUNT_2_BIT; + break; + case MsaaSamples::k4X: + sample_count = VK_SAMPLE_COUNT_4_BIT; + break; + default: + assert_unhandled_case(desired_config.surface_msaa); + break; } // Initialize all attachments to default unused. @@ -840,15 +840,15 @@ void RenderCache::UpdateTileView(VkCommandBuffer command_buffer, &barrier, 0, nullptr); } + // TODO(DrChat): Stencil copies. VkBufferImageCopy region; region.bufferOffset = view->key.tile_offset * 5120; region.bufferRowLength = 0; region.bufferImageHeight = 0; region.imageSubresource = {0, 0, 0, 1}; - region.imageSubresource.aspectMask = - view->key.color_or_depth - ? VK_IMAGE_ASPECT_COLOR_BIT - : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + region.imageSubresource.aspectMask = view->key.color_or_depth + ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT; region.imageOffset = {0, 0, 0}; region.imageExtent = {view->key.tile_width * 80u, view->key.tile_height * 16u, 1}; From a1c9540063ec315646c94c0d4cc80142e2e8c319 Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat"
Date: Sat, 9 Apr 2016 18:35:00 -0500
Subject: [PATCH 40/77] SPIR-V Validator util class

---
 src/xenia/ui/spirv/spirv_validator.cc | 80 +++++++++++++++++++++++++++
 src/xenia/ui/spirv/spirv_validator.h  | 66 ++++++++++++++++++++++
 2 files changed, 146 insertions(+)
 create mode 100644 src/xenia/ui/spirv/spirv_validator.cc
 create mode 100644 src/xenia/ui/spirv/spirv_validator.h

diff --git a/src/xenia/ui/spirv/spirv_validator.cc b/src/xenia/ui/spirv/spirv_validator.cc
new file mode 100644
index 000000000..734688eb6
--- /dev/null
+++ b/src/xenia/ui/spirv/spirv_validator.cc
@@ -0,0 +1,80 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2016 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include "xenia/ui/spirv/spirv_validator.h"
+
+#include "third_party/spirv-tools/include/spirv-tools/libspirv.h"
+#include "xenia/base/logging.h"
+
+namespace xe {
+namespace ui {
+namespace spirv {
+
+SpirvValidator::Result::Result(spv_text text, spv_diagnostic diagnostic)
+    : text_(text), diagnostic_(diagnostic) {}
+
+SpirvValidator::Result::~Result() {
+  if (text_) {
+    spvTextDestroy(text_);
+  }
+  if (diagnostic_) {
+    spvDiagnosticDestroy(diagnostic_);
+  }
+}
+
+bool SpirvValidator::Result::has_error() const { return !!diagnostic_; }
+
+size_t SpirvValidator::Result::error_word_index() const {
+  return diagnostic_ ? diagnostic_->position.index : 0;
+}
+
+const char* SpirvValidator::Result::error_string() const {
+  return diagnostic_ ? diagnostic_->error : "";
+}
+
+const char* SpirvValidator::Result::text() const {
+  return text_ ? text_->str : "";
+}
+
+std::string SpirvValidator::Result::to_string() const {
+  return text_ ? std::string(text_->str, text_->length) : "";
+}
+
+void SpirvValidator::Result::AppendText(StringBuffer* target_buffer) const {
+  if (text_) {
+    target_buffer->AppendBytes(reinterpret_cast<const uint8_t*>(text_->str),
+                               text_->length);
+  }
+}
+
+SpirvValidator::SpirvValidator() : spv_context_(spvContextCreate()) {}
+SpirvValidator::~SpirvValidator() { spvContextDestroy(spv_context_); }
+
+std::unique_ptr<SpirvValidator::Result> SpirvValidator::Validate(
+    const uint32_t* words, size_t word_count) {
+  spv_text text = nullptr;
+  spv_diagnostic diagnostic = nullptr;
+  spv_const_binary_t binary = {words, word_count};
+  auto result_code =
+      spvValidate(spv_context_, &binary, SPV_VALIDATE_ALL, &diagnostic);
+  std::unique_ptr<Result> result(new Result(text, diagnostic));
+  if (result_code) {
+    XELOGE("Failed to validate spv: %d", result_code);
+    if (result->has_error()) {
+      return result;
+    } else {
+      return nullptr;
+    }
+  }
+  return result;
+}
+
+}  // namespace spirv
+}  // namespace ui
+}  // namespace xe
\ No newline at end of file
diff --git a/src/xenia/ui/spirv/spirv_validator.h b/src/xenia/ui/spirv/spirv_validator.h
new file mode 100644
index 000000000..890843f27
--- /dev/null
+++ b/src/xenia/ui/spirv/spirv_validator.h
@@ -0,0 +1,66 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2016 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_UI_SPIRV_SPIRV_VALIDATOR_H_
+#define XENIA_UI_SPIRV_SPIRV_VALIDATOR_H_
+
+#include <memory>
+#include <string>
+
+#include "xenia/base/string_buffer.h"
+#include "xenia/ui/spirv/spirv_util.h"
+
+namespace xe {
+namespace ui {
+namespace spirv {
+
+class SpirvValidator {
+ public:
+  class Result {
+   public:
+    Result(spv_text text, spv_diagnostic diagnostic);
+    ~Result();
+
+    // True if the result has an error associated with it.
+    bool has_error() const;
+    // Index of the error in the provided binary word data.
+    size_t error_word_index() const;
+    // Human-readable description of the error.
+    const char* error_string() const;
+
+    // Disassembled source text.
+    // Returned pointer lifetime is tied to this Result instance.
+    const char* text() const;
+    // Converts the disassembled source text to a string.
+    std::string to_string() const;
+    // Appends the disassembled source text to the given buffer.
+    void AppendText(StringBuffer* target_buffer) const;
+
+   private:
+    spv_text text_ = nullptr;
+    spv_diagnostic diagnostic_ = nullptr;
+  };
+
+  SpirvValidator();
+  ~SpirvValidator();
+
+  // Validates the given SPIRV binary.
+  // The return will be nullptr if validation fails due to a library error.
+  // The return may have an error set on it if the SPIRV binary is malformed.
+  std::unique_ptr<Result> Validate(const uint32_t* words, size_t word_count);
+
+ private:
+  spv_context spv_context_ = nullptr;
+};
+
+}  // namespace spirv
+}  // namespace ui
+}  // namespace xe
+
+#endif  // XENIA_UI_SPIRV_SPIRV_VALIDATOR_H_

From b7f2c93d73bbbfd5af1fd7713da6d7e9a845b0d6 Mon Sep 17 00:00:00 2001
From: "Dr. Chat"
Date: Sat, 9 Apr 2016 21:03:44 -0500
Subject: [PATCH 41/77] SPIR-V: Batch predicated instructions together into a
 single block. Add Post-Translation validation. Fix a couple of type-related
 typos.
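
The batching replaces the old per-instruction OpSelect predication: runs of
instructions predicated on the same p0 value now execute inside one shared
conditional block, which stays open until an unpredicated instruction
arrives, the predicate condition flips, the exec block ends, or the
instruction itself writes p0 (the setp*/kill cases set
close_predicated_block so later instructions never test a stale p0).
Roughly, each Process*Instruction now begins with logic like the sketch
below. The struct and helper are illustrative stand-ins for the translator
members (open_predicated_block_ and friends), not real code in this tree;
the spv::Builder calls are the same ones the translator already uses, and
the include path is assumed.

    #include "third_party/glslang-spirv/SpvBuilder.h"  // assumed path

    struct PredicatedBlockState {
      bool open = false;            // open_predicated_block_
      bool cond = false;            // predicated_block_cond_
      spv::Block* merge = nullptr;  // predicated_block_end_
    };

    void SyncPredication(spv::Builder& b, PredicatedBlockState& s,
                         spv::Id p0_var, spv::Id bool_type,
                         bool is_predicated, bool condition) {
      // Close the open block if this instruction is unpredicated or was
      // predicated on the opposite condition.
      if (s.open && (!is_predicated || condition != s.cond)) {
        b.createBranch(s.merge);
        b.setBuildPoint(s.merge);
        s.open = false;
        s.merge = nullptr;
      }
      // Open a new block when a predicated instruction has none to join:
      // branch on p0 == condition, otherwise skip to the merge block.
      if (!s.open && is_predicated) {
        spv::Id match = b.createBinOp(spv::Op::OpLogicalEqual, bool_type,
                                      b.createLoad(p0_var),
                                      b.makeBoolConstant(condition));
        auto* body = &b.makeNewBlock();
        s.merge = &b.makeNewBlock();
        s.open = true;
        s.cond = condition;
        b.createConditionalBranch(match, body, s.merge);
        b.setBuildPoint(body);
      }
    }

The post-translation validation wires the new SpirvValidator into
PostTranslation(): the emitted words are run through spvValidate and any
diagnostic is logged via XELOGE before disassembly.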
--- src/xenia/gpu/shader_translator.cc | 19 +- src/xenia/gpu/shader_translator.h | 6 +- src/xenia/gpu/spirv_shader_translator.cc | 285 +++++++++++++++-------- src/xenia/gpu/spirv_shader_translator.h | 12 +- 4 files changed, 210 insertions(+), 112 deletions(-) diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index 6e8b69cea..1097dbc55 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -986,16 +986,19 @@ void ShaderTranslator::TranslateAluInstruction(const AluInstruction& op) { return; } + ParsedAluInstruction instr; if (op.has_vector_op()) { const auto& opcode_info = alu_vector_opcode_infos_[static_cast(op.vector_opcode())]; - ParseAluVectorInstruction(op, opcode_info); + ParseAluVectorInstruction(op, opcode_info, instr); + ProcessAluInstruction(instr); } if (op.has_scalar_op()) { const auto& opcode_info = alu_scalar_opcode_infos_[static_cast(op.scalar_opcode())]; - ParseAluScalarInstruction(op, opcode_info); + ParseAluScalarInstruction(op, opcode_info, instr); + ProcessAluInstruction(instr); } } @@ -1088,8 +1091,8 @@ void ParseAluInstructionOperandSpecial(const AluInstruction& op, } void ShaderTranslator::ParseAluVectorInstruction( - const AluInstruction& op, const AluOpcodeInfo& opcode_info) { - ParsedAluInstruction i; + const AluInstruction& op, const AluOpcodeInfo& opcode_info, + ParsedAluInstruction& i) { i.dword_index = 0; i.type = ParsedAluInstruction::Type::kVector; i.vector_opcode = op.vector_opcode(); @@ -1203,13 +1206,11 @@ void ShaderTranslator::ParseAluVectorInstruction( } i.Disassemble(&ucode_disasm_buffer_); - - ProcessAluInstruction(i); } void ShaderTranslator::ParseAluScalarInstruction( - const AluInstruction& op, const AluOpcodeInfo& opcode_info) { - ParsedAluInstruction i; + const AluInstruction& op, const AluOpcodeInfo& opcode_info, + ParsedAluInstruction& i) { i.dword_index = 0; i.type = ParsedAluInstruction::Type::kScalar; i.scalar_opcode = op.scalar_opcode(); @@ -1319,8 +1320,6 @@ void ShaderTranslator::ParseAluScalarInstruction( } i.Disassemble(&ucode_disasm_buffer_); - - ProcessAluInstruction(i); } } // namespace gpu diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h index d1b27a997..7dc173dc5 100644 --- a/src/xenia/gpu/shader_translator.h +++ b/src/xenia/gpu/shader_translator.h @@ -173,9 +173,11 @@ class ShaderTranslator { void TranslateAluInstruction(const ucode::AluInstruction& op); void ParseAluVectorInstruction(const ucode::AluInstruction& op, - const AluOpcodeInfo& opcode_info); + const AluOpcodeInfo& opcode_info, + ParsedAluInstruction& instr); void ParseAluScalarInstruction(const ucode::AluInstruction& op, - const AluOpcodeInfo& opcode_info); + const AluOpcodeInfo& opcode_info, + ParsedAluInstruction& instr); // Input shader metadata and microcode. ShaderType shader_type_; diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index ef242f0bd..0b0ab0626 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2015 Ben Vanik. All rights reserved. * + * Copyright 2016 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. 
* ****************************************************************************** */ @@ -85,16 +85,14 @@ void SpirvShaderTranslator::StartTranslation() { "ps"); pv_ = b.createVariable(spv::StorageClass::StorageClassFunction, vec4_float_type_, "pv"); - a0_ = b.createVariable(spv::StorageClass::StorageClassFunction, - b.makeUintType(32), "a0"); + a0_ = b.createVariable(spv::StorageClass::StorageClassFunction, int_type_, + "a0"); // Uniform constants. Id float_consts_type = b.makeArrayType(vec4_float_type_, b.makeUintConstant(512), 1); - Id loop_consts_type = - b.makeArrayType(b.makeUintType(32), b.makeUintConstant(32), 1); - Id bool_consts_type = - b.makeArrayType(b.makeUintType(32), b.makeUintConstant(8), 1); + Id loop_consts_type = b.makeArrayType(uint_type_, b.makeUintConstant(32), 1); + Id bool_consts_type = b.makeArrayType(uint_type_, b.makeUintConstant(8), 1); Id consts_struct_type = b.makeStructType( {float_consts_type, loop_consts_type, bool_consts_type}, "consts_type"); @@ -242,6 +240,13 @@ void SpirvShaderTranslator::StartTranslation() { interpolators_ = b.createVariable(spv::StorageClass::StorageClassOutput, interpolators_type, "interpolators"); b.addDecoration(interpolators_, spv::Decoration::DecorationLocation, 0); + for (uint32_t i = 0; i < 16; i++) { + // Zero interpolators. + auto ptr = b.createAccessChain(spv::StorageClass::StorageClassOutput, + interpolators_, + std::vector({b.makeUintConstant(i)})); + b.createStore(vec4_float_zero_, ptr); + } pos_ = b.createVariable(spv::StorageClass::StorageClassOutput, vec4_float_type_, "gl_Position"); @@ -338,6 +343,9 @@ void SpirvShaderTranslator::StartTranslation() { std::vector SpirvShaderTranslator::CompleteTranslation() { auto& b = *builder_; + assert_false(open_predicated_block_); + auto block = &b.makeNewBlock(); + b.createBranch(block); b.makeReturn(false); // main() entry point. @@ -397,9 +405,10 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { b.createStore(p, pos_); } else { // Alpha test - auto alpha_test_x = b.createCompositeExtract( - push_consts_, float_type_, std::vector{2, 0}); - auto cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, alpha_test_x, b.makeFloatConstant(1.f)); + auto alpha_test_x = b.createCompositeExtract(push_consts_, float_type_, + std::vector{2, 0}); + auto cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, alpha_test_x, + b.makeFloatConstant(1.f)); spv::Builder::If alpha_if(cond, b); @@ -433,15 +442,25 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { } void SpirvShaderTranslator::PostTranslation(Shader* shader) { + // Validation. + // TODO(DrChat): Only do this if a flag is set (this is pretty slow). + auto validation = validator_.Validate( + reinterpret_cast(shader->translated_binary().data()), + shader->translated_binary().size() / 4); + if (validation->has_error()) { + XELOGE("SPIR-V Shader Validation failed! Error: %s", + validation->error_string()); + } + // TODO(benvanik): only if needed? could be slowish. 
auto disasm = disassembler_.Disassemble( reinterpret_cast(shader->translated_binary().data()), shader->translated_binary().size() / 4); if (disasm->has_error()) { XELOGE("Failed to disassemble SPIRV - invalid?"); - return; + } else { + set_host_disassembly(shader, disasm->to_string()); } - set_host_disassembly(shader, disasm->to_string()); } void SpirvShaderTranslator::PreProcessControlFlowInstruction( @@ -475,13 +494,18 @@ void SpirvShaderTranslator::ProcessControlFlowInstructionEnd( void SpirvShaderTranslator::ProcessControlFlowNopInstruction() { auto& b = *builder_; - b.createNoResultOp(spv::Op::OpNop); + // b.createNoResultOp(spv::Op::OpNop); } void SpirvShaderTranslator::ProcessExecInstructionBegin( const ParsedExecInstruction& instr) { auto& b = *builder_; + assert_false(open_predicated_block_); + open_predicated_block_ = false; + predicated_block_cond_ = false; + predicated_block_end_ = nullptr; + // Head has the logic to check if the body should execute. auto head = cf_blocks_[instr.dword_index]; b.setBuildPoint(head); @@ -500,7 +524,7 @@ void SpirvShaderTranslator::ProcessExecInstructionBegin( v = b.createLoad(v); // Bitfield extract the bool constant. - v = b.createTriOp(spv::Op::OpBitFieldUExtract, b.makeUintType(32), v, + v = b.createTriOp(spv::Op::OpBitFieldUExtract, uint_type_, v, b.makeUintConstant(instr.bool_constant_index % 32), b.makeUintConstant(1)); @@ -519,6 +543,7 @@ void SpirvShaderTranslator::ProcessExecInstructionBegin( b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_), b.makeBoolConstant(instr.condition)); b.createConditionalBranch(cond, body, cf_blocks_[instr.dword_index + 1]); + } break; } b.setBuildPoint(body); @@ -528,6 +553,14 @@ void SpirvShaderTranslator::ProcessExecInstructionEnd( const ParsedExecInstruction& instr) { auto& b = *builder_; + if (open_predicated_block_) { + b.createBranch(predicated_block_end_); + b.setBuildPoint(predicated_block_end_); + open_predicated_block_ = false; + predicated_block_cond_ = false; + predicated_block_end_ = nullptr; + } + if (instr.is_end) { b.makeReturn(false); } else { @@ -671,7 +704,30 @@ void SpirvShaderTranslator::ProcessVertexFetchInstruction( assert_true(is_vertex_shader()); assert_not_zero(vertex_id_); - // TODO: instr.is_predicated + // Close the open predicated block if this instr isn't predicated or the + // conditions do not match. + if (open_predicated_block_ && + (!instr.is_predicated || + instr.predicate_condition != predicated_block_cond_)) { + b.createBranch(predicated_block_end_); + b.setBuildPoint(predicated_block_end_); + open_predicated_block_ = false; + predicated_block_cond_ = false; + predicated_block_end_ = nullptr; + } + + if (!open_predicated_block_ && instr.is_predicated) { + Id pred_cond = + b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_), + b.makeBoolConstant(instr.predicate_condition)); + auto block = &b.makeNewBlock(); + open_predicated_block_ = true; + predicated_block_cond_ = instr.predicate_condition; + predicated_block_end_ = &b.makeNewBlock(); + + b.createConditionalBranch(pred_cond, block, predicated_block_end_); + b.setBuildPoint(block); + } // Operand 0 is the index // Operand 1 is the binding @@ -726,7 +782,31 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( const ParsedTextureFetchInstruction& instr) { auto& b = *builder_; - // TODO: instr.is_predicated + // Close the open predicated block if this instr isn't predicated or the + // conditions do not match. 
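+  // (For example, a fetch predicated on p0 == true cannot reuse a block
+  //  opened for p0 == false: the stale block is closed here and a fresh
+  //  one is opened below.)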
+ if (open_predicated_block_ && + (!instr.is_predicated || + instr.predicate_condition != predicated_block_cond_)) { + b.createBranch(predicated_block_end_); + b.setBuildPoint(predicated_block_end_); + open_predicated_block_ = false; + predicated_block_cond_ = false; + predicated_block_end_ = nullptr; + } + + if (!open_predicated_block_ && instr.is_predicated) { + Id pred_cond = + b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_), + b.makeBoolConstant(instr.predicate_condition)); + auto block = &b.makeNewBlock(); + open_predicated_block_ = true; + predicated_block_cond_ = instr.predicate_condition; + predicated_block_end_ = &b.makeNewBlock(); + + b.createConditionalBranch(pred_cond, block, predicated_block_end_); + b.setBuildPoint(block); + } + // Operand 0 is the offset // Operand 1 is the sampler index Id dest = 0; @@ -804,13 +884,32 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( sources[i] = LoadFromOperand(instr.operands[i]); } - Id pred_cond = 0; - if (instr.is_predicated) { - pred_cond = - b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_), - b.makeBoolConstant(instr.predicate_condition)); + // Close the open predicated block if this instr isn't predicated or the + // conditions do not match. + if (open_predicated_block_ && + (!instr.is_predicated || + instr.predicate_condition != predicated_block_cond_)) { + b.createBranch(predicated_block_end_); + b.setBuildPoint(predicated_block_end_); + open_predicated_block_ = false; + predicated_block_cond_ = false; + predicated_block_end_ = nullptr; } + if (!open_predicated_block_ && instr.is_predicated) { + Id pred_cond = + b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_), + b.makeBoolConstant(instr.predicate_condition)); + auto block = &b.makeNewBlock(); + open_predicated_block_ = true; + predicated_block_cond_ = instr.predicate_condition; + predicated_block_end_ = &b.makeNewBlock(); + + b.createConditionalBranch(pred_cond, block, predicated_block_end_); + b.setBuildPoint(block); + } + + bool close_predicated_block = false; switch (instr.vector_opcode) { case AluVectorOpcode::kAdd: { dest = b.createBinOp(spv::Op::OpFAdd, vec4_float_type_, sources[0], @@ -863,8 +962,8 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( auto src1_xy = b.createOp(spv::Op::OpVectorShuffle, vec2_float_type_, {sources[1], sources[1], 0, 1}); auto src2_x = b.createCompositeExtract(sources[2], float_type_, 0); - auto dot = b.createBinOp(spv::Op::OpDot, float_type_, src0_xy, src1_xy); - dest = b.createBinOp(spv::Op::OpFAdd, float_type_, dot, src2_x); + dest = b.createBinOp(spv::Op::OpDot, float_type_, src0_xy, src1_xy); + dest = b.createBinOp(spv::Op::OpFAdd, float_type_, dest, src2_x); dest = b.smearScalar(spv::NoPrecision, dest, vec4_float_type_); } break; @@ -873,12 +972,13 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( {sources[0], sources[0], 0, 1, 2}); auto src1_xyz = b.createOp(spv::Op::OpVectorShuffle, vec3_float_type_, {sources[1], sources[1], 0, 1, 2}); - auto dot = b.createBinOp(spv::Op::OpDot, float_type_, src0_xyz, src1_xyz); - dest = b.smearScalar(spv::NoPrecision, dot, vec4_float_type_); + dest = b.createBinOp(spv::Op::OpDot, float_type_, src0_xyz, src1_xyz); + dest = b.smearScalar(spv::NoPrecision, dest, vec4_float_type_); } break; case AluVectorOpcode::kDp4: { dest = b.createBinOp(spv::Op::OpDot, float_type_, sources[0], sources[1]); + dest = b.smearScalar(spv::NoPrecision, dest, vec4_float_type_); } break; case AluVectorOpcode::kFloor: { @@ -899,10 +999,6 @@ 
void SpirvShaderTranslator::ProcessVectorAluInstruction( auto cond = b.createBinOp(spv::Op::OpFOrdEqual, vec4_bool_type_, sources[0], sources[1]); cond = b.createUnaryOp(spv::Op::OpAny, bool_type_, cond); - if (pred_cond) { - cond = - b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond); - } b.createConditionalBranch(cond, kill_block, continue_block); b.setBuildPoint(kill_block); @@ -918,10 +1014,6 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( auto cond = b.createBinOp(spv::Op::OpFOrdGreaterThanEqual, vec4_bool_type_, sources[0], sources[1]); cond = b.createUnaryOp(spv::Op::OpAny, bool_type_, cond); - if (pred_cond) { - cond = - b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond); - } b.createConditionalBranch(cond, kill_block, continue_block); b.setBuildPoint(kill_block); @@ -937,10 +1029,6 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( auto cond = b.createBinOp(spv::Op::OpFOrdGreaterThan, vec4_bool_type_, sources[0], sources[1]); cond = b.createUnaryOp(spv::Op::OpAny, bool_type_, cond); - if (pred_cond) { - cond = - b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond); - } b.createConditionalBranch(cond, kill_block, continue_block); b.setBuildPoint(kill_block); @@ -956,10 +1044,6 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( auto cond = b.createBinOp(spv::Op::OpFOrdNotEqual, vec4_bool_type_, sources[0], sources[1]); cond = b.createUnaryOp(spv::Op::OpAny, bool_type_, cond); - if (pred_cond) { - cond = - b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond); - } b.createConditionalBranch(cond, kill_block, continue_block); b.setBuildPoint(kill_block); @@ -1053,6 +1137,7 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( // p0 b.createStore(c_and_w, p0_); + close_predicated_block = true; // dest auto s0_x = b.createCompositeExtract(sources[0], float_type_, 0); @@ -1076,6 +1161,7 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( // p0 b.createStore(c_and_w, p0_); + close_predicated_block = true; // dest auto s0_x = b.createCompositeExtract(sources[0], float_type_, 0); @@ -1099,6 +1185,7 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( // p0 b.createStore(c_and_w, p0_); + close_predicated_block = true; // dest auto s0_x = b.createCompositeExtract(sources[0], float_type_, 0); @@ -1122,6 +1209,7 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( // p0 b.createStore(c_and_w, p0_); + close_predicated_block = true; // dest auto s0_x = b.createCompositeExtract(sources[0], float_type_, 0); @@ -1177,15 +1265,16 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( assert_not_zero(dest); if (dest) { - // If predicated, discard the result from the instruction. 
- Id pv_dest = dest; - if (instr.is_predicated) { - pv_dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, pred_cond, - dest, b.createLoad(pv_)); - } + b.createStore(dest, pv_); + StoreToResult(dest, instr.result); + } - b.createStore(pv_dest, pv_); - StoreToResult(dest, instr.result, pred_cond); + if (close_predicated_block && open_predicated_block_) { + b.createBranch(predicated_block_end_); + b.setBuildPoint(predicated_block_end_); + open_predicated_block_ = false; + predicated_block_cond_ = false; + predicated_block_end_ = nullptr; } } @@ -1229,13 +1318,32 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( } } - Id pred_cond = 0; - if (instr.is_predicated) { - pred_cond = - b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_), - b.makeBoolConstant(instr.predicate_condition)); + // Close the open predicated block if this instr isn't predicated or the + // conditions do not match. + if (open_predicated_block_ && + (!instr.is_predicated || + instr.predicate_condition != predicated_block_cond_)) { + b.createBranch(predicated_block_end_); + b.setBuildPoint(predicated_block_end_); + open_predicated_block_ = false; + predicated_block_cond_ = false; + predicated_block_end_ = nullptr; } + if (!open_predicated_block_ && instr.is_predicated) { + Id pred_cond = + b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_), + b.makeBoolConstant(instr.predicate_condition)); + auto block = &b.makeNewBlock(); + open_predicated_block_ = true; + predicated_block_cond_ = instr.predicate_condition; + predicated_block_end_ = &b.makeNewBlock(); + + b.createConditionalBranch(pred_cond, block, predicated_block_end_); + b.setBuildPoint(block); + } + + bool close_predicated_block = false; switch (instr.scalar_opcode) { case AluScalarOpcode::kAdds: case AluScalarOpcode::kAddsc0: @@ -1276,10 +1384,6 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( auto kill_block = &b.makeNewBlock(); auto cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, sources[0], b.makeFloatConstant(0.f)); - if (pred_cond) { - cond = - b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond); - } b.createConditionalBranch(cond, kill_block, continue_block); b.setBuildPoint(kill_block); @@ -1294,10 +1398,6 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( auto kill_block = &b.makeNewBlock(); auto cond = b.createBinOp(spv::Op::OpFOrdGreaterThanEqual, bool_type_, sources[0], b.makeFloatConstant(0.f)); - if (pred_cond) { - cond = - b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond); - } b.createConditionalBranch(cond, kill_block, continue_block); b.setBuildPoint(kill_block); @@ -1312,10 +1412,6 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( auto kill_block = &b.makeNewBlock(); auto cond = b.createBinOp(spv::Op::OpFOrdGreaterThan, bool_type_, sources[0], b.makeFloatConstant(0.f)); - if (pred_cond) { - cond = - b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond); - } b.createConditionalBranch(cond, kill_block, continue_block); b.setBuildPoint(kill_block); @@ -1330,10 +1426,6 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( auto kill_block = &b.makeNewBlock(); auto cond = b.createBinOp(spv::Op::OpFOrdNotEqual, bool_type_, sources[0], b.makeFloatConstant(0.f)); - if (pred_cond) { - cond = - b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond); - } b.createConditionalBranch(cond, kill_block, continue_block); b.setBuildPoint(kill_block); @@ -1348,10 +1440,6 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( 
auto kill_block = &b.makeNewBlock(); auto cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, sources[0], b.makeFloatConstant(1.f)); - if (pred_cond) { - cond = - b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond); - } b.createConditionalBranch(cond, kill_block, continue_block); b.setBuildPoint(kill_block); @@ -1448,7 +1536,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( b.makeFloatConstant(0.f)); auto d = b.createBinOp(spv::Op::OpFDiv, float_type_, b.makeFloatConstant(1.f), sources[0]); - dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c, + dest = b.createTriOp(spv::Op::OpSelect, float_type_, c, b.makeFloatConstant(0.f), d); } break; @@ -1462,10 +1550,10 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( // dest = src0 != 0.0 ? inversesqrt(src0) : 0.0; auto c = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, sources[0], b.makeFloatConstant(0.f)); - auto d = CreateGlslStd450InstructionCall( - spv::NoPrecision, vec4_float_type_, spv::GLSLstd450::kInverseSqrt, - {sources[0]}); - dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c, + auto d = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_, + spv::GLSLstd450::kInverseSqrt, + {sources[0]}); + dest = b.createTriOp(spv::Op::OpSelect, float_type_, c, b.makeFloatConstant(0.f), d); } break; @@ -1503,6 +1591,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( case AluScalarOpcode::kSetpClr: { b.createStore(b.makeBoolConstant(false), p0_); + close_predicated_block = true; dest = b.makeFloatConstant(FLT_MAX); } break; @@ -1511,6 +1600,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( b.makeFloatConstant(0.f)); // p0 = cond b.createStore(cond, p0_); + close_predicated_block = true; // dest = cond ? 0.f : 1.f; dest = b.createTriOp(spv::Op::OpSelect, float_type_, cond, @@ -1522,6 +1612,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( sources[0], b.makeFloatConstant(0.f)); // p0 = cond b.createStore(cond, p0_); + close_predicated_block = true; // dest = cond ? 0.f : 1.f; dest = b.createTriOp(spv::Op::OpSelect, float_type_, cond, @@ -1533,6 +1624,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( sources[0], b.makeFloatConstant(0.f)); // p0 = cond b.createStore(cond, p0_); + close_predicated_block = true; // dest = cond ? 0.f : 1.f; dest = b.createTriOp(spv::Op::OpSelect, float_type_, cond, @@ -1544,6 +1636,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( auto cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, sources[0], b.makeFloatConstant(1.f)); b.createStore(cond, p0_); + close_predicated_block = true; // if (!cond) dest = src0 == 0.0 ? 1.0 : src0; auto dst_cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, @@ -1560,6 +1653,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( // p0 = cond b.createStore(cond, p0_); + close_predicated_block = true; // dest = cond ? 
0.f : 1.f; dest = b.createTriOp(spv::Op::OpSelect, float_type_, cond, @@ -1572,6 +1666,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( auto c = b.createBinOp(spv::Op::OpFOrdLessThanEqual, bool_type_, src, b.makeFloatConstant(0.f)); b.createStore(c, p0_); + close_predicated_block = true; dest = CreateGlslStd450InstructionCall( spv::NoPrecision, float_type_, GLSLstd450::kFMax, @@ -1582,6 +1677,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( auto c = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, sources[0], b.makeFloatConstant(0.f)); b.createStore(c, p0_); + close_predicated_block = true; dest = sources[0]; } break; @@ -1618,15 +1714,16 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( assert_not_zero(dest); if (dest) { - // If predicated, discard the result from the instruction. - Id ps_dest = dest; - if (instr.is_predicated) { - ps_dest = b.createTriOp(spv::Op::OpSelect, float_type_, pred_cond, dest, - b.createLoad(ps_)); - } + b.createStore(dest, ps_); + StoreToResult(dest, instr.result); + } - b.createStore(ps_dest, ps_); - StoreToResult(dest, instr.result, pred_cond); + if (close_predicated_block && open_predicated_block_) { + b.createBranch(predicated_block_end_); + b.setBuildPoint(predicated_block_end_); + open_predicated_block_ = false; + predicated_block_cond_ = false; + predicated_block_end_ = nullptr; } } @@ -1763,8 +1860,7 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) { } void SpirvShaderTranslator::StoreToResult(Id source_value_id, - const InstructionResult& result, - Id predicate_cond) { + const InstructionResult& result) { auto& b = *builder_; if (result.storage_target == InstructionStorageTarget::kNone) { @@ -1865,7 +1961,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, // Only load from storage if we need it later. Id storage_value = 0; - if (!result.has_all_writes() || predicate_cond) { + if (!result.has_all_writes()) { storage_value = b.createLoad(storage_pointer); } @@ -1965,13 +2061,8 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, assert_true(b.getNumComponents(source_value_id) == b.getNumTypeComponents(storage_type)); - // Discard if predicate condition is false. - if (predicate_cond) { - source_value_id = - b.createTriOp(spv::Op::OpSelect, storage_type, predicate_cond, - source_value_id, storage_value); - } - + assert_true(b.getTypeId(source_value_id) == + b.getDerefTypeId(storage_pointer)); b.createStore(source_value_id, storage_pointer); } diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 1d5dea31b..f30d3ab44 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2015 Ben Vanik. All rights reserved. * + * Copyright 2016 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. 
* ****************************************************************************** */ @@ -18,6 +18,7 @@ #include "third_party/spirv/GLSL.std.450.hpp11" #include "xenia/gpu/shader_translator.h" #include "xenia/ui/spirv/spirv_disassembler.h" +#include "xenia/ui/spirv/spirv_validator.h" namespace xe { namespace gpu { @@ -91,10 +92,15 @@ class SpirvShaderTranslator : public ShaderTranslator { // Stores a value based on the specified result information. // The value will be transformed into the appropriate form for the result and // the proper components will be selected. - void StoreToResult(spv::Id source_value_id, const InstructionResult& result, - spv::Id predicate_cond = 0); + void StoreToResult(spv::Id source_value_id, const InstructionResult& result); xe::ui::spirv::SpirvDisassembler disassembler_; + xe::ui::spirv::SpirvValidator validator_; + + // True if there's an open predicated block + bool open_predicated_block_ = false; + bool predicated_block_cond_ = false; + spv::Block* predicated_block_end_ = nullptr; // TODO(benvanik): replace with something better, make reusable, etc. std::unique_ptr builder_; From 4811ebc2ceb1c9f77282d00ef74052eeac787603 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sat, 9 Apr 2016 21:27:32 -0500 Subject: [PATCH 42/77] BufferCache: Use a CircularBuffer as the transient buffer. --- src/xenia/gpu/vulkan/buffer_cache.cc | 201 ++++++------------------- src/xenia/gpu/vulkan/buffer_cache.h | 41 +++-- src/xenia/ui/vulkan/circular_buffer.cc | 3 +- src/xenia/ui/vulkan/circular_buffer.h | 1 + 4 files changed, 67 insertions(+), 179 deletions(-) diff --git a/src/xenia/gpu/vulkan/buffer_cache.cc b/src/xenia/gpu/vulkan/buffer_cache.cc index 90b7c487e..4ae98c864 100644 --- a/src/xenia/gpu/vulkan/buffer_cache.cc +++ b/src/xenia/gpu/vulkan/buffer_cache.cc @@ -30,90 +30,14 @@ constexpr VkDeviceSize kConstantRegisterUniformRange = BufferCache::BufferCache(RegisterFile* register_file, ui::vulkan::VulkanDevice* device, size_t capacity) - : register_file_(register_file), - device_(*device), - transient_capacity_(capacity) { - // Uniform buffer. - VkBufferCreateInfo uniform_buffer_info; - uniform_buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - uniform_buffer_info.pNext = nullptr; - uniform_buffer_info.flags = 0; - uniform_buffer_info.size = transient_capacity_; - uniform_buffer_info.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - uniform_buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - uniform_buffer_info.queueFamilyIndexCount = 0; - uniform_buffer_info.pQueueFamilyIndices = nullptr; - auto err = vkCreateBuffer(device_, &uniform_buffer_info, nullptr, - &transient_uniform_buffer_); - CheckResult(err, "vkCreateBuffer"); - - // Index buffer. - VkBufferCreateInfo index_buffer_info; - index_buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - index_buffer_info.pNext = nullptr; - index_buffer_info.flags = 0; - index_buffer_info.size = transient_capacity_; - index_buffer_info.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT; - index_buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - index_buffer_info.queueFamilyIndexCount = 0; - index_buffer_info.pQueueFamilyIndices = nullptr; - err = vkCreateBuffer(device_, &index_buffer_info, nullptr, - &transient_index_buffer_); - CheckResult(err, "vkCreateBuffer"); - - // Vertex buffer. 
- VkBufferCreateInfo vertex_buffer_info; - vertex_buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - vertex_buffer_info.pNext = nullptr; - vertex_buffer_info.flags = 0; - vertex_buffer_info.size = transient_capacity_; - vertex_buffer_info.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT; - vertex_buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - vertex_buffer_info.queueFamilyIndexCount = 0; - vertex_buffer_info.pQueueFamilyIndices = nullptr; - err = vkCreateBuffer(*device, &vertex_buffer_info, nullptr, - &transient_vertex_buffer_); - CheckResult(err, "vkCreateBuffer"); - - // Allocate the underlying buffer we use for all storage. - // We query all types and take the max alignment. - VkMemoryRequirements uniform_buffer_requirements; - VkMemoryRequirements index_buffer_requirements; - VkMemoryRequirements vertex_buffer_requirements; - vkGetBufferMemoryRequirements(device_, transient_uniform_buffer_, - &uniform_buffer_requirements); - vkGetBufferMemoryRequirements(device_, transient_index_buffer_, - &index_buffer_requirements); - vkGetBufferMemoryRequirements(device_, transient_vertex_buffer_, - &vertex_buffer_requirements); - uniform_buffer_alignment_ = uniform_buffer_requirements.alignment; - index_buffer_alignment_ = index_buffer_requirements.alignment; - vertex_buffer_alignment_ = vertex_buffer_requirements.alignment; - VkMemoryRequirements buffer_requirements; - buffer_requirements.size = transient_capacity_; - buffer_requirements.alignment = - std::max(uniform_buffer_requirements.alignment, - std::max(index_buffer_requirements.alignment, - vertex_buffer_requirements.alignment)); - buffer_requirements.memoryTypeBits = - uniform_buffer_requirements.memoryTypeBits | - index_buffer_requirements.memoryTypeBits | - vertex_buffer_requirements.memoryTypeBits; - transient_buffer_memory_ = device->AllocateMemory( - buffer_requirements, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); - - // Alias all buffers to our memory. - vkBindBufferMemory(device_, transient_uniform_buffer_, - transient_buffer_memory_, 0); - vkBindBufferMemory(device_, transient_index_buffer_, transient_buffer_memory_, - 0); - vkBindBufferMemory(device_, transient_vertex_buffer_, - transient_buffer_memory_, 0); - - // Map memory and keep it mapped while we use it. - err = vkMapMemory(device_, transient_buffer_memory_, 0, VK_WHOLE_SIZE, 0, - &transient_buffer_data_); - CheckResult(err, "vkMapMemory"); + : register_file_(register_file), device_(*device) { + transient_buffer_ = std::make_unique(device); + if (!transient_buffer_->Initialize(capacity, + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | + VK_BUFFER_USAGE_INDEX_BUFFER_BIT | + VK_BUFFER_USAGE_VERTEX_BUFFER_BIT)) { + assert_always(); + } // Descriptor pool used for all of our cached descriptors. // In the steady state we don't allocate anything, so these are all manually @@ -129,8 +53,8 @@ BufferCache::BufferCache(RegisterFile* register_file, pool_sizes[0].descriptorCount = 2; descriptor_pool_info.poolSizeCount = 1; descriptor_pool_info.pPoolSizes = pool_sizes; - err = vkCreateDescriptorPool(device_, &descriptor_pool_info, nullptr, - &descriptor_pool_); + auto err = vkCreateDescriptorPool(device_, &descriptor_pool_info, nullptr, + &descriptor_pool_); CheckResult(err, "vkCreateDescriptorPool"); // Create the descriptor set layout used for our uniform buffer. @@ -180,7 +104,7 @@ BufferCache::BufferCache(RegisterFile* register_file, // Initialize descriptor set with our buffers. 
VkDescriptorBufferInfo buffer_info; - buffer_info.buffer = transient_uniform_buffer_; + buffer_info.buffer = transient_buffer_->gpu_buffer(); buffer_info.offset = 0; buffer_info.range = kConstantRegisterUniformRange; VkWriteDescriptorSet descriptor_writes[2]; @@ -212,25 +136,20 @@ BufferCache::~BufferCache() { &transient_descriptor_set_); vkDestroyDescriptorSetLayout(device_, descriptor_set_layout_, nullptr); vkDestroyDescriptorPool(device_, descriptor_pool_, nullptr); - vkUnmapMemory(device_, transient_buffer_memory_); - vkFreeMemory(device_, transient_buffer_memory_, nullptr); - vkDestroyBuffer(device_, transient_uniform_buffer_, nullptr); - vkDestroyBuffer(device_, transient_index_buffer_, nullptr); - vkDestroyBuffer(device_, transient_vertex_buffer_, nullptr); + transient_buffer_->Shutdown(); } std::pair BufferCache::UploadConstantRegisters( const Shader::ConstantRegisterMap& vertex_constant_register_map, - const Shader::ConstantRegisterMap& pixel_constant_register_map) { + const Shader::ConstantRegisterMap& pixel_constant_register_map, + std::shared_ptr fence) { // Fat struct, including all registers: // struct { // vec4 float[512]; // uint bool[8]; // uint loop[32]; // }; - size_t total_size = - xe::round_up(kConstantRegisterUniformRange, uniform_buffer_alignment_); - auto offset = AllocateTransientData(uniform_buffer_alignment_, total_size); + auto offset = AllocateTransientData(kConstantRegisterUniformRange, fence); if (offset == VK_WHOLE_SIZE) { // OOM. return {VK_WHOLE_SIZE, VK_WHOLE_SIZE}; @@ -238,8 +157,7 @@ std::pair BufferCache::UploadConstantRegisters( // Copy over all the registers. const auto& values = register_file_->values; - uint8_t* dest_ptr = - reinterpret_cast(transient_buffer_data_) + offset; + uint8_t* dest_ptr = transient_buffer_->host_base() + offset; std::memcpy(dest_ptr, &values[XE_GPU_REG_SHADER_CONSTANT_000_X].f32, (512 * 4 * 4)); dest_ptr += 512 * 4 * 4; @@ -258,8 +176,8 @@ std::pair BufferCache::UploadConstantRegisters( // constant indexing. #if 0 // Allocate space in the buffer for our data. - auto offset = AllocateTransientData(uniform_buffer_alignment_, - constant_register_map.packed_byte_length); + auto offset = + AllocateTransientData(constant_register_map.packed_byte_length, fence); if (offset == VK_WHOLE_SIZE) { // OOM. return VK_WHOLE_SIZE; @@ -304,11 +222,12 @@ std::pair BufferCache::UploadConstantRegisters( } std::pair BufferCache::UploadIndexBuffer( - const void* source_ptr, size_t source_length, IndexFormat format) { + const void* source_ptr, size_t source_length, IndexFormat format, + std::shared_ptr fence) { // TODO(benvanik): check cache. // Allocate space in the buffer for our data. - auto offset = AllocateTransientData(index_buffer_alignment_, source_length); + auto offset = AllocateTransientData(source_length, fence); if (offset == VK_WHOLE_SIZE) { // OOM. return {nullptr, VK_WHOLE_SIZE}; @@ -319,25 +238,24 @@ std::pair BufferCache::UploadIndexBuffer( // TODO(benvanik): memcpy then use compute shaders to swap? if (format == IndexFormat::kInt16) { // Endian::k8in16, swap half-words. - xe::copy_and_swap_16_aligned( - reinterpret_cast(transient_buffer_data_) + offset, source_ptr, - source_length / 2); + xe::copy_and_swap_16_aligned(transient_buffer_->host_base() + offset, + source_ptr, source_length / 2); } else if (format == IndexFormat::kInt32) { // Endian::k8in32, swap words. 
- xe::copy_and_swap_32_aligned( - reinterpret_cast(transient_buffer_data_) + offset, source_ptr, - source_length / 4); + xe::copy_and_swap_32_aligned(transient_buffer_->host_base() + offset, + source_ptr, source_length / 4); } - return {transient_index_buffer_, offset}; + return {transient_buffer_->gpu_buffer(), offset}; } std::pair BufferCache::UploadVertexBuffer( - const void* source_ptr, size_t source_length) { + const void* source_ptr, size_t source_length, + std::shared_ptr fence) { // TODO(benvanik): check cache. // Allocate space in the buffer for our data. - auto offset = AllocateTransientData(vertex_buffer_alignment_, source_length); + auto offset = AllocateTransientData(source_length, fence); if (offset == VK_WHOLE_SIZE) { // OOM. return {nullptr, VK_WHOLE_SIZE}; @@ -346,59 +264,34 @@ std::pair BufferCache::UploadVertexBuffer( // Copy data into the buffer. // TODO(benvanik): memcpy then use compute shaders to swap? // Endian::k8in32, swap words. - xe::copy_and_swap_32_aligned( - reinterpret_cast(transient_buffer_data_) + offset, source_ptr, - source_length / 4); + xe::copy_and_swap_32_aligned(transient_buffer_->host_base() + offset, + source_ptr, source_length / 4); - return {transient_vertex_buffer_, offset}; + return {transient_buffer_->gpu_buffer(), offset}; } -VkDeviceSize BufferCache::AllocateTransientData(VkDeviceSize alignment, - VkDeviceSize length) { +VkDeviceSize BufferCache::AllocateTransientData( + VkDeviceSize length, std::shared_ptr fence) { // Try fast path (if we have space). - VkDeviceSize offset = TryAllocateTransientData(alignment, length); + VkDeviceSize offset = TryAllocateTransientData(length, fence); if (offset != VK_WHOLE_SIZE) { return offset; } // Ran out of easy allocations. // Try consuming fences before we panic. - assert_always("Reclamation not yet implemented"); + transient_buffer_->Scavenge(); // Try again. It may still fail if we didn't get enough space back. - return TryAllocateTransientData(alignment, length); + offset = TryAllocateTransientData(length, fence); + return offset; } -VkDeviceSize BufferCache::TryAllocateTransientData(VkDeviceSize alignment, - VkDeviceSize length) { - if (transient_tail_offset_ >= transient_head_offset_) { - // Tail follows head, so things are easy: - // | H----T | - if (xe::round_up(transient_tail_offset_, alignment) + length <= - transient_capacity_) { - // Allocation fits from tail to end of buffer, so grow. - // | H----**T | - VkDeviceSize offset = xe::round_up(transient_tail_offset_, alignment); - transient_tail_offset_ = offset + length; - return offset; - } else if (length + kDeadZone <= transient_head_offset_) { - // Can't fit at the end, but can fit if we wrap around. - // |**T H----....| - VkDeviceSize offset = 0; - transient_tail_offset_ = length; - return offset; - } - } else { - // Head follows tail, so we're reversed: - // |----T H---| - if (xe::round_up(transient_tail_offset_, alignment) + length + kDeadZone <= - transient_head_offset_) { - // Fits from tail to head. - // |----***T H---| - VkDeviceSize offset = xe::round_up(transient_tail_offset_, alignment); - transient_tail_offset_ = offset + length; - return offset; - } +VkDeviceSize BufferCache::TryAllocateTransientData( + VkDeviceSize length, std::shared_ptr fence) { + auto alloc = transient_buffer_->Acquire(length, fence); + if (alloc) { + return alloc->offset; } // No more space. 
@@ -420,9 +313,9 @@ void BufferCache::Flush(VkCommandBuffer command_buffer) { VkMappedMemoryRange dirty_range; dirty_range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; dirty_range.pNext = nullptr; - dirty_range.memory = transient_buffer_memory_; + dirty_range.memory = transient_buffer_->gpu_memory(); dirty_range.offset = 0; - dirty_range.size = transient_capacity_; + dirty_range.size = transient_buffer_->capacity(); vkFlushMappedMemoryRanges(device_, 1, &dirty_range); } @@ -432,10 +325,10 @@ void BufferCache::InvalidateCache() { void BufferCache::ClearCache() { // TODO(benvanik): caching. - // Temporary clear. - transient_tail_offset_ = transient_head_offset_; } +void BufferCache::Scavenge() { transient_buffer_->Scavenge(); } + } // namespace vulkan } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/vulkan/buffer_cache.h b/src/xenia/gpu/vulkan/buffer_cache.h index 1c7330e52..ee09585b5 100644 --- a/src/xenia/gpu/vulkan/buffer_cache.h +++ b/src/xenia/gpu/vulkan/buffer_cache.h @@ -13,6 +13,7 @@ #include "xenia/gpu/register_file.h" #include "xenia/gpu/shader.h" #include "xenia/gpu/xenos.h" +#include "xenia/ui/vulkan/circular_buffer.h" #include "xenia/ui/vulkan/vulkan.h" #include "xenia/ui/vulkan/vulkan_device.h" @@ -50,22 +51,24 @@ class BufferCache { // The returned offsets may alias. std::pair UploadConstantRegisters( const Shader::ConstantRegisterMap& vertex_constant_register_map, - const Shader::ConstantRegisterMap& pixel_constant_register_map); + const Shader::ConstantRegisterMap& pixel_constant_register_map, + std::shared_ptr fence); // Uploads index buffer data from guest memory, possibly eliding with // recently uploaded data or cached copies. // Returns a buffer and offset that can be used with vkCmdBindIndexBuffer. // Size will be VK_WHOLE_SIZE if the data could not be uploaded (OOM). - std::pair UploadIndexBuffer(const void* source_ptr, - size_t source_length, - IndexFormat format); + std::pair UploadIndexBuffer( + const void* source_ptr, size_t source_length, IndexFormat format, + std::shared_ptr fence); // Uploads vertex buffer data from guest memory, possibly eliding with // recently uploaded data or cached copies. // Returns a buffer and offset that can be used with vkCmdBindVertexBuffers. // Size will be VK_WHOLE_SIZE if the data could not be uploaded (OOM). - std::pair UploadVertexBuffer(const void* source_ptr, - size_t source_length); + std::pair UploadVertexBuffer( + const void* source_ptr, size_t source_length, + std::shared_ptr fence); // Flushes all pending data to the GPU. // Until this is called the GPU is not guaranteed to see any data. @@ -81,36 +84,26 @@ class BufferCache { // Clears all cached content and prevents future elision with pending data. void ClearCache(); + // Wipes all data no longer needed. + void Scavenge(); + private: // Allocates a block of memory in the transient buffer. // When memory is not available fences are checked and space is reclaimed. // Returns VK_WHOLE_SIZE if requested amount of memory is not available. - VkDeviceSize AllocateTransientData(VkDeviceSize alignment, - VkDeviceSize length); + VkDeviceSize AllocateTransientData(VkDeviceSize length, + std::shared_ptr fence); // Tries to allocate a block of memory in the transient buffer. // Returns VK_WHOLE_SIZE if requested amount of memory is not available. 
- VkDeviceSize TryAllocateTransientData(VkDeviceSize alignment, - VkDeviceSize length); + VkDeviceSize TryAllocateTransientData( + VkDeviceSize length, std::shared_ptr fence); RegisterFile* register_file_ = nullptr; VkDevice device_ = nullptr; // Staging ringbuffer we cycle through fast. Used for data we don't // plan on keeping past the current frame. - size_t transient_capacity_ = 0; - VkBuffer transient_uniform_buffer_ = nullptr; - VkBuffer transient_index_buffer_ = nullptr; - VkBuffer transient_vertex_buffer_ = nullptr; - VkDeviceMemory transient_buffer_memory_ = nullptr; - void* transient_buffer_data_ = nullptr; - VkDeviceSize transient_head_offset_ = 0; - VkDeviceSize transient_tail_offset_ = 0; - - // Required alignments for our various types. - // All allocations must start at the appropriate alignment. - VkDeviceSize uniform_buffer_alignment_ = 0; - VkDeviceSize index_buffer_alignment_ = 0; - VkDeviceSize vertex_buffer_alignment_ = 0; + std::unique_ptr transient_buffer_ = nullptr; VkDescriptorPool descriptor_pool_ = nullptr; VkDescriptorSetLayout descriptor_set_layout_ = nullptr; diff --git a/src/xenia/ui/vulkan/circular_buffer.cc b/src/xenia/ui/vulkan/circular_buffer.cc index 110cd6c36..43d868120 100644 --- a/src/xenia/ui/vulkan/circular_buffer.cc +++ b/src/xenia/ui/vulkan/circular_buffer.cc @@ -134,12 +134,13 @@ CircularBuffer::Allocation* CircularBuffer::Acquire( } VkDeviceSize aligned_length = xe::round_up(length, alignment_); + assert_true(write_head_ % alignment_ == 0); if (allocations_.empty()) { // Entire buffer available. assert(read_head_ == write_head_); assert(capacity_ > aligned_length); - write_head_ = length; + write_head_ = aligned_length; auto alloc = new Allocation(); alloc->host_ptr = host_base_ + 0; diff --git a/src/xenia/ui/vulkan/circular_buffer.h b/src/xenia/ui/vulkan/circular_buffer.h index 6f0ec2f82..6e4331ab9 100644 --- a/src/xenia/ui/vulkan/circular_buffer.h +++ b/src/xenia/ui/vulkan/circular_buffer.h @@ -46,6 +46,7 @@ class CircularBuffer { VkDeviceSize alignment = 256); void Shutdown(); + VkDeviceSize alignment() const { return alignment_; } VkDeviceSize capacity() const { return capacity_; } VkBuffer gpu_buffer() const { return gpu_buffer_; } VkDeviceMemory gpu_memory() const { return gpu_memory_; } From 2bd603bf182b9ab41de291cc76ad6c530982a7af Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sat, 9 Apr 2016 21:40:18 -0500 Subject: [PATCH 43/77] CircularBuffer: use std::list for allocations instead of a vector. 
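
Scavenge() retires allocations strictly in submission order: it walks from
the oldest entry, stops at the first fence that has not yet signaled, and
erases everything older. Erasing at the front of a std::vector is O(n) per
element and invalidates iterators mid-loop; std::list makes the
erase-while-iterating pattern O(1) per element. The now-unused
allocation_cache_ map is dropped as well. A minimal sketch of the access
pattern being served (the types are illustrative stand-ins, not the real
xe::ui::vulkan ones):

    #include <list>

    struct Fence { bool signaled() const { return true; } };
    struct Allocation { Fence* fence; };

    void Scavenge(std::list<Allocation*>& allocations) {
      for (auto it = allocations.begin(); it != allocations.end();) {
        if (!(*it)->fence->signaled()) {
          break;  // Everything after this is younger and still in flight.
        }
        delete *it;
        it = allocations.erase(it);  // O(1) here; O(n) on a std::vector.
      }
    }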
---
 src/xenia/ui/vulkan/circular_buffer.cc | 6 +++---
 src/xenia/ui/vulkan/circular_buffer.h  | 5 ++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/xenia/ui/vulkan/circular_buffer.cc b/src/xenia/ui/vulkan/circular_buffer.cc
index 43d868120..404f7a503 100644
--- a/src/xenia/ui/vulkan/circular_buffer.cc
+++ b/src/xenia/ui/vulkan/circular_buffer.cc
@@ -211,10 +211,10 @@ void CircularBuffer::Flush(Allocation* allocation) {
 }
 
 void CircularBuffer::Clear() {
-  for (auto it = allocations_.begin(); it != allocations_.end();) {
-    delete *it;
-    it = allocations_.erase(it);
+  for (auto alloc : allocations_) {
+    delete alloc;
   }
+  allocations_.clear();
 
   write_head_ = read_head_ = 0;
 }
diff --git a/src/xenia/ui/vulkan/circular_buffer.h b/src/xenia/ui/vulkan/circular_buffer.h
index 6e4331ab9..54aa916fd 100644
--- a/src/xenia/ui/vulkan/circular_buffer.h
+++ b/src/xenia/ui/vulkan/circular_buffer.h
@@ -10,7 +10,7 @@
 #ifndef XENIA_UI_VULKAN_CIRCULAR_BUFFER_H_
 #define XENIA_UI_VULKAN_CIRCULAR_BUFFER_H_
 
-#include <unordered_map>
+#include <list>
 
 #include "xenia/ui/vulkan/vulkan.h"
 #include "xenia/ui/vulkan/vulkan_device.h"
@@ -77,8 +77,7 @@ class CircularBuffer {
   VkDeviceSize gpu_base_ = 0;
   uint8_t* host_base_ = nullptr;
 
-  std::unordered_map<uint64_t, Allocation*> allocation_cache_;
-  std::vector<Allocation*> allocations_;
+  std::list<Allocation*> allocations_;
 };
 
 }  // namespace vulkan

From 9b2e2a7275c3cefa0d002bc86deaac5d7f858299 Mon Sep 17 00:00:00 2001
From: "Dr. Chat"
Date: Wed, 13 Apr 2016 23:17:03 -0500
Subject: [PATCH 44/77] SPIR-V: Hack in OpSelectionMerge as hints to NVidia's
 shader compiler (TODO: Make a Shader Compiler)
---
 src/xenia/gpu/shader_translator.cc       |   4 +-
 src/xenia/gpu/shader_translator.h        |   3 +-
 src/xenia/gpu/spirv_shader_translator.cc | 113 +++++++++++++++--------
 src/xenia/gpu/spirv_shader_translator.h  |  10 +-
 4 files changed, 88 insertions(+), 42 deletions(-)

diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc
index 1097dbc55..f6bfbdd65 100644
--- a/src/xenia/gpu/shader_translator.cc
+++ b/src/xenia/gpu/shader_translator.cc
@@ -369,9 +369,9 @@ bool ShaderTranslator::TranslateBlocks() {
     AddControlFlowTargetLabel(cf_a, &label_addresses);
     AddControlFlowTargetLabel(cf_b, &label_addresses);
 
-    PreProcessControlFlowInstruction(cf_index);
+    PreProcessControlFlowInstruction(cf_index, cf_a);
     ++cf_index;
-    PreProcessControlFlowInstruction(cf_index);
+    PreProcessControlFlowInstruction(cf_index, cf_b);
     ++cf_index;
   }
 
diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h
index 7dc173dc5..5df53bc0a 100644
--- a/src/xenia/gpu/shader_translator.h
+++ b/src/xenia/gpu/shader_translator.h
@@ -79,7 +79,8 @@ class ShaderTranslator {
   }
 
   // Pre-process a control-flow instruction before anything else.
-  virtual void PreProcessControlFlowInstruction(uint32_t cf_index) {}
+  virtual void PreProcessControlFlowInstruction(
+      uint32_t cf_index, const ucode::ControlFlowInstruction& instr) {}
 
   // Handles translation for control flow label addresses.
// This is triggered once for each label required (due to control flow diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 0b0ab0626..855df73f7 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -259,6 +259,7 @@ void SpirvShaderTranslator::StartTranslation() { spv::BuiltIn::BuiltInVertexId); auto vertex_id = b.createLoad(vertex_id_); + vertex_id = b.createUnaryOp(spv::Op::OpConvertSToF, float_type_, vertex_id); auto r0_ptr = b.createAccessChain(spv::StorageClass::StorageClassFunction, registers_ptr_, std::vector({b.makeUintConstant(0)})); @@ -464,16 +465,33 @@ void SpirvShaderTranslator::PostTranslation(Shader* shader) { } void SpirvShaderTranslator::PreProcessControlFlowInstruction( - uint32_t cf_index) { + uint32_t cf_index, const ControlFlowInstruction& instr) { auto& b = *builder_; - cf_blocks_[cf_index] = &b.makeNewBlock(); + if (cf_blocks_.find(cf_index) == cf_blocks_.end()) { + CFBlock block; + block.block = &b.makeNewBlock(); + cf_blocks_[cf_index] = block; + } else { + cf_blocks_[cf_index].block = &b.makeNewBlock(); + } + + if (instr.opcode() == ControlFlowOpcode::kCondJmp) { + auto cf_block = cf_blocks_.find(instr.cond_jmp.address()); + if (cf_block == cf_blocks_.end()) { + CFBlock block; + block.prev_dominates = false; + cf_blocks_[instr.cond_jmp.address()] = block; + } else { + cf_block->second.prev_dominates = false; + } + } else if (instr.opcode() == ControlFlowOpcode::kLoopStart) { + // TODO + } } void SpirvShaderTranslator::ProcessLabel(uint32_t cf_index) { auto& b = *builder_; - - EmitUnimplementedTranslationError(); } void SpirvShaderTranslator::ProcessControlFlowInstructionBegin( @@ -482,7 +500,7 @@ void SpirvShaderTranslator::ProcessControlFlowInstructionBegin( if (cf_index == 0) { // Kind of cheaty, but emit a branch to the first block. - b.createBranch(cf_blocks_[cf_index]); + b.createBranch(cf_blocks_[cf_index].block); } } @@ -507,7 +525,7 @@ void SpirvShaderTranslator::ProcessExecInstructionBegin( predicated_block_end_ = nullptr; // Head has the logic to check if the body should execute. - auto head = cf_blocks_[instr.dword_index]; + auto head = cf_blocks_[instr.dword_index].block; b.setBuildPoint(head); auto body = head; switch (instr.type) { @@ -516,6 +534,7 @@ void SpirvShaderTranslator::ProcessExecInstructionBegin( } break; case ParsedExecInstruction::Type::kConditional: { // Based off of bool_consts + // FIXME: Nvidia compiler is complaining about this. std::vector offsets; offsets.push_back(b.makeUintConstant(2)); // bool_consts offsets.push_back(b.makeUintConstant(instr.bool_constant_index / 32)); @@ -532,8 +551,14 @@ void SpirvShaderTranslator::ProcessExecInstructionBegin( assert_true(cf_blocks_.size() > instr.dword_index + 1); body = &b.makeNewBlock(); auto cond = b.createBinOp(spv::Op::OpIEqual, bool_type_, v, - b.makeUintConstant(uint32_t(instr.condition))); - b.createConditionalBranch(cond, body, cf_blocks_[instr.dword_index + 1]); + b.makeUintConstant(instr.condition ? 1 : 0)); + + auto next_block = cf_blocks_[instr.dword_index + 1]; + if (next_block.prev_dominates) { + b.createNoResultOp(spv::Op::OpSelectionMerge, + {next_block.block->getId(), 0}); + } + b.createConditionalBranch(cond, body, next_block.block); } break; case ParsedExecInstruction::Type::kPredicated: { // Branch based on p0. 
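Why the prev_dominates bookkeeping above: OpSelectionMerge declares a
structured selection construct, and SPIR-V requires the named merge block to
be dominated by the header block that declares it. The fall-through cf block
only qualifies when control cannot enter it from anywhere else, so the
pre-pass clears prev_dominates on any block that is also the target of a
kCondJmp, and the hint is simply omitted there. Roughly the shape emitted
when the hint does apply (illustrative IDs, shown as comments):

    // %head:
    //   OpSelectionMerge %next None        ; the hint added by this patch
    //   OpBranchConditional %cond %body %next
    // %body:
    //   ... exec body ...
    //   OpBranch %next
    // %next:  ; fall-through cf block doubling as the merge block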
@@ -542,7 +567,13 @@ void SpirvShaderTranslator::ProcessExecInstructionBegin( auto cond = b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_), b.makeBoolConstant(instr.condition)); - b.createConditionalBranch(cond, body, cf_blocks_[instr.dword_index + 1]); + + auto next_block = cf_blocks_[instr.dword_index + 1]; + if (next_block.prev_dominates) { + b.createNoResultOp(spv::Op::OpSelectionMerge, + {next_block.block->getId(), 0}); + } + b.createConditionalBranch(cond, body, next_block.block); } break; } @@ -565,7 +596,7 @@ void SpirvShaderTranslator::ProcessExecInstructionEnd( b.makeReturn(false); } else { assert_true(cf_blocks_.size() > instr.dword_index + 1); - b.createBranch(cf_blocks_[instr.dword_index + 1]); + b.createBranch(cf_blocks_[instr.dword_index + 1].block); } } @@ -573,7 +604,7 @@ void SpirvShaderTranslator::ProcessLoopStartInstruction( const ParsedLoopStartInstruction& instr) { auto& b = *builder_; - auto head = cf_blocks_[instr.dword_index]; + auto head = cf_blocks_[instr.dword_index].block; b.setBuildPoint(head); // TODO: Emit a spv LoopMerge @@ -582,27 +613,27 @@ void SpirvShaderTranslator::ProcessLoopStartInstruction( EmitUnimplementedTranslationError(); assert_true(cf_blocks_.size() > instr.dword_index + 1); - b.createBranch(cf_blocks_[instr.dword_index + 1]); + b.createBranch(cf_blocks_[instr.dword_index + 1].block); } void SpirvShaderTranslator::ProcessLoopEndInstruction( const ParsedLoopEndInstruction& instr) { auto& b = *builder_; - auto head = cf_blocks_[instr.dword_index]; + auto head = cf_blocks_[instr.dword_index].block; b.setBuildPoint(head); EmitUnimplementedTranslationError(); assert_true(cf_blocks_.size() > instr.dword_index + 1); - b.createBranch(cf_blocks_[instr.dword_index + 1]); + b.createBranch(cf_blocks_[instr.dword_index + 1].block); } void SpirvShaderTranslator::ProcessCallInstruction( const ParsedCallInstruction& instr) { auto& b = *builder_; - auto head = cf_blocks_[instr.dword_index]; + auto head = cf_blocks_[instr.dword_index].block; b.setBuildPoint(head); // Unused instruction(?) @@ -610,14 +641,14 @@ void SpirvShaderTranslator::ProcessCallInstruction( EmitUnimplementedTranslationError(); assert_true(cf_blocks_.size() > instr.dword_index + 1); - b.createBranch(cf_blocks_[instr.dword_index + 1]); + b.createBranch(cf_blocks_[instr.dword_index + 1].block); } void SpirvShaderTranslator::ProcessReturnInstruction( const ParsedReturnInstruction& instr) { auto& b = *builder_; - auto head = cf_blocks_[instr.dword_index]; + auto head = cf_blocks_[instr.dword_index].block; b.setBuildPoint(head); // Unused instruction(?) 
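For reference, the bool-constant addressing shared by the conditional exec
path above and the conditional jump path below: boolean constants are packed
32 flags to a 32-bit word (256 flags total in the Xenos convention assumed
here), so an index splits into a word select and a bit select:

    uint32_t word = instr.bool_constant_index / 32;  // which packed uint32
    uint32_t bit = instr.bool_constant_index % 32;   // which bit within it
    // e.g. index 77 -> word 2, bit 13; OpBitFieldUExtract(v, bit, 1) then
    // yields 0 or 1, which is compared against the instruction's condition.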
@@ -625,7 +656,7 @@ void SpirvShaderTranslator::ProcessReturnInstruction( EmitUnimplementedTranslationError(); assert_true(cf_blocks_.size() > instr.dword_index + 1); - b.createBranch(cf_blocks_[instr.dword_index + 1]); + b.createBranch(cf_blocks_[instr.dword_index + 1].block); } // CF jump @@ -633,11 +664,11 @@ void SpirvShaderTranslator::ProcessJumpInstruction( const ParsedJumpInstruction& instr) { auto& b = *builder_; - auto head = cf_blocks_[instr.dword_index]; + auto head = cf_blocks_[instr.dword_index].block; b.setBuildPoint(head); switch (instr.type) { case ParsedJumpInstruction::Type::kUnconditional: { - b.createBranch(cf_blocks_[instr.target_address]); + b.createBranch(cf_blocks_[instr.target_address].block); } break; case ParsedJumpInstruction::Type::kConditional: { assert_true(cf_blocks_.size() > instr.dword_index + 1); @@ -652,14 +683,14 @@ void SpirvShaderTranslator::ProcessJumpInstruction( // Bitfield extract the bool constant. v = b.createTriOp(spv::Op::OpBitFieldUExtract, uint_type_, v, - b.makeIntConstant(instr.bool_constant_index % 32), - b.makeIntConstant(1)); + b.makeUintConstant(instr.bool_constant_index % 32), + b.makeUintConstant(1)); // Conditional branch auto cond = b.createBinOp(spv::Op::OpIEqual, bool_type_, v, - b.makeUintConstant(uint32_t(instr.condition))); - b.createConditionalBranch(cond, cf_blocks_[instr.target_address], - cf_blocks_[instr.dword_index + 1]); + b.makeUintConstant(instr.condition ? 1 : 0)); + b.createConditionalBranch(cond, cf_blocks_[instr.target_address].block, + cf_blocks_[instr.dword_index + 1].block); } break; case ParsedJumpInstruction::Type::kPredicated: { assert_true(cf_blocks_.size() > instr.dword_index + 1); @@ -667,8 +698,8 @@ void SpirvShaderTranslator::ProcessJumpInstruction( auto cond = b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_), b.makeBoolConstant(instr.condition)); - b.createConditionalBranch(cond, cf_blocks_[instr.target_address], - cf_blocks_[instr.dword_index + 1]); + b.createConditionalBranch(cond, cf_blocks_[instr.target_address].block, + cf_blocks_[instr.dword_index + 1].block); } break; } } @@ -677,7 +708,7 @@ void SpirvShaderTranslator::ProcessAllocInstruction( const ParsedAllocInstruction& instr) { auto& b = *builder_; - auto head = cf_blocks_[instr.dword_index]; + auto head = cf_blocks_[instr.dword_index].block; b.setBuildPoint(head); switch (instr.type) { @@ -695,7 +726,7 @@ void SpirvShaderTranslator::ProcessAllocInstruction( } assert_true(cf_blocks_.size() > instr.dword_index + 1); - b.createBranch(cf_blocks_[instr.dword_index + 1]); + b.createBranch(cf_blocks_[instr.dword_index + 1].block); } void SpirvShaderTranslator::ProcessVertexFetchInstruction( @@ -725,6 +756,8 @@ void SpirvShaderTranslator::ProcessVertexFetchInstruction( predicated_block_cond_ = instr.predicate_condition; predicated_block_end_ = &b.makeNewBlock(); + b.createNoResultOp(spv::Op::OpSelectionMerge, + {predicated_block_end_->getId(), 0}); b.createConditionalBranch(pred_cond, block, predicated_block_end_); b.setBuildPoint(block); } @@ -803,6 +836,8 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( predicated_block_cond_ = instr.predicate_condition; predicated_block_end_ = &b.makeNewBlock(); + b.createNoResultOp(spv::Op::OpSelectionMerge, + {predicated_block_end_->getId(), 0}); b.createConditionalBranch(pred_cond, block, predicated_block_end_); b.setBuildPoint(block); } @@ -905,6 +940,8 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( predicated_block_cond_ = instr.predicate_condition; 
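  // Note: the block created below is the single exit of this predicated
  // region. Every predicated fetch/ALU op in this file follows the same
  // shape: stash the p0 condition, create the end block, declare it as the
  // merge target (the OpSelectionMerge added by this patch), then branch
  // conditionally into the body.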
predicated_block_end_ = &b.makeNewBlock(); + b.createNoResultOp(spv::Op::OpSelectionMerge, + {predicated_block_end_->getId(), 0}); b.createConditionalBranch(pred_cond, block, predicated_block_end_); b.setBuildPoint(block); } @@ -1339,6 +1376,8 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( predicated_block_cond_ = instr.predicate_condition; predicated_block_end_ = &b.makeNewBlock(); + b.createNoResultOp(spv::Op::OpSelectionMerge, + {predicated_block_end_->getId(), 0}); b.createConditionalBranch(pred_cond, block, predicated_block_end_); b.setBuildPoint(block); } @@ -1965,6 +2004,14 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, storage_value = b.createLoad(storage_pointer); } + // Clamp the input value. + if (result.is_clamped) { + source_value_id = CreateGlslStd450InstructionCall( + spv::NoPrecision, b.getTypeId(source_value_id), + spv::GLSLstd450::kFClamp, + {source_value_id, b.makeFloatConstant(0.0), b.makeFloatConstant(1.0)}); + } + // Convert to the appropriate type, if needed. if (b.getTypeId(source_value_id) != storage_type) { std::vector constituents; @@ -1990,14 +2037,6 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, b.createConstructor(spv::NoPrecision, constituents, storage_type); } - // Clamp the input value. - if (result.is_clamped) { - source_value_id = CreateGlslStd450InstructionCall( - spv::NoPrecision, b.getTypeId(source_value_id), - spv::GLSLstd450::kFClamp, - {source_value_id, b.makeFloatConstant(0.0), b.makeFloatConstant(1.0)}); - } - // swizzle if (!result.is_standard_swizzle()) { std::vector operands; diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index f30d3ab44..39d3899c1 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -55,7 +55,8 @@ class SpirvShaderTranslator : public ShaderTranslator { std::vector CompleteTranslation() override; void PostTranslation(Shader* shader) override; - void PreProcessControlFlowInstruction(uint32_t cf_index) override; + void PreProcessControlFlowInstruction( + uint32_t cf_index, const ucode::ControlFlowInstruction& instr) override; void ProcessLabel(uint32_t cf_index) override; void ProcessControlFlowInstructionBegin(uint32_t cf_index) override; void ProcessControlFlowInstructionEnd(uint32_t cf_index) override; @@ -133,7 +134,12 @@ class SpirvShaderTranslator : public ShaderTranslator { // Map of {binding -> {offset -> spv input}} std::map> vertex_binding_map_; - std::map cf_blocks_; + + struct CFBlock { + spv::Block* block = nullptr; + bool prev_dominates = true; + }; + std::map cf_blocks_; }; } // namespace gpu From 6101b70641436c4e9e068434caeca6a7026f8dc0 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 29 Apr 2016 13:09:39 -0500 Subject: [PATCH 45/77] Fix the Vulkan immediate drawer not drawing lines. 
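Two things conspired to hide lines: Draw() bailed out early for any
non-triangle primitive, and the derived pipeline was created with
basePipelineIndex = 0. Per the Vulkan spec, when
VK_PIPELINE_CREATE_DERIVATIVE_BIT is set and basePipelineHandle is used,
basePipelineIndex must be -1; an index of 0 instead means "derive from
pCreateInfos[0] of this same call". The corrected pattern, as in the diff
below (the triangle pipeline's index is ignored without the derivative flag,
but is set to -1 for consistency):

    pipeline_info.flags = VK_PIPELINE_CREATE_DERIVATIVE_BIT;
    pipeline_info.basePipelineHandle = triangle_pipeline_;  // derive by handle
    pipeline_info.basePipelineIndex = -1;                   // index unused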
--- src/xenia/ui/vulkan/vulkan_immediate_drawer.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/xenia/ui/vulkan/vulkan_immediate_drawer.cc b/src/xenia/ui/vulkan/vulkan_immediate_drawer.cc index 23dffd6c6..a68b44c5f 100644 --- a/src/xenia/ui/vulkan/vulkan_immediate_drawer.cc +++ b/src/xenia/ui/vulkan/vulkan_immediate_drawer.cc @@ -538,7 +538,7 @@ VulkanImmediateDrawer::VulkanImmediateDrawer(VulkanContext* graphics_context) pipeline_info.renderPass = context_->swap_chain()->render_pass(); pipeline_info.subpass = 0; pipeline_info.basePipelineHandle = nullptr; - pipeline_info.basePipelineIndex = 0; + pipeline_info.basePipelineIndex = -1; err = vkCreateGraphicsPipelines(*device, nullptr, 1, &pipeline_info, nullptr, &triangle_pipeline_); CheckResult(err, "vkCreateGraphicsPipelines"); @@ -547,7 +547,7 @@ VulkanImmediateDrawer::VulkanImmediateDrawer(VulkanContext* graphics_context) pipeline_info.flags = VK_PIPELINE_CREATE_DERIVATIVE_BIT; input_info.topology = VK_PRIMITIVE_TOPOLOGY_LINE_LIST; pipeline_info.basePipelineHandle = triangle_pipeline_; - pipeline_info.basePipelineIndex = 0; + pipeline_info.basePipelineIndex = -1; err = vkCreateGraphicsPipelines(*device, nullptr, 1, &pipeline_info, nullptr, &line_pipeline_); CheckResult(err, "vkCreateGraphicsPipelines"); @@ -672,9 +672,6 @@ void VulkanImmediateDrawer::BeginDrawBatch(const ImmediateDrawBatch& batch) { void VulkanImmediateDrawer::Draw(const ImmediateDraw& draw) { auto swap_chain = context_->swap_chain(); - if (draw.primitive_type != ImmediatePrimitiveType::kTriangles) { - return; - } switch (draw.primitive_type) { case ImmediatePrimitiveType::kLines: vkCmdBindPipeline(current_cmd_buffer_, VK_PIPELINE_BIND_POINT_GRAPHICS, From cbccc785cc45eda064eb6e0c7c3beb5ed85e58ee Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sun, 1 May 2016 10:15:33 -0500 Subject: [PATCH 46/77] TraceViewer: Build a tree of all command buffers and display that instead of a flat list. --- src/xenia/gpu/trace_player.cc | 24 ++++--- src/xenia/gpu/trace_player.h | 4 +- src/xenia/gpu/trace_reader.cc | 22 ++++++ src/xenia/gpu/trace_reader.h | 42 ++++++++++++ src/xenia/gpu/trace_viewer.cc | 124 ++++++++++++++++++++++++---------- src/xenia/gpu/trace_viewer.h | 2 + 6 files changed, 171 insertions(+), 47 deletions(-) diff --git a/src/xenia/gpu/trace_player.cc b/src/xenia/gpu/trace_player.cc index 54c199736..b79b49df2 100644 --- a/src/xenia/gpu/trace_player.cc +++ b/src/xenia/gpu/trace_player.cc @@ -51,7 +51,7 @@ void TracePlayer::SeekFrame(int target_frame) { assert_true(frame->start_ptr <= frame->end_ptr); PlayTrace(frame->start_ptr, frame->end_ptr - frame->start_ptr, - TracePlaybackMode::kBreakOnSwap); + TracePlaybackMode::kBreakOnSwap, false); } void TracePlayer::SeekCommand(int target_command) { @@ -71,11 +71,11 @@ void TracePlayer::SeekCommand(int target_command) { const auto& previous_command = frame->commands[previous_command_index]; PlayTrace(previous_command.end_ptr, command.end_ptr - previous_command.end_ptr, - TracePlaybackMode::kBreakOnSwap); + TracePlaybackMode::kBreakOnSwap, false); } else { // Full playback from frame start. 
PlayTrace(frame->start_ptr, command.end_ptr - frame->start_ptr, - TracePlaybackMode::kBreakOnSwap); + TracePlaybackMode::kBreakOnSwap, true); } } @@ -84,19 +84,25 @@ void TracePlayer::WaitOnPlayback() { } void TracePlayer::PlayTrace(const uint8_t* trace_data, size_t trace_size, - TracePlaybackMode playback_mode) { - graphics_system_->command_processor()->CallInThread( - [this, trace_data, trace_size, playback_mode]() { - PlayTraceOnThread(trace_data, trace_size, playback_mode); - }); + TracePlaybackMode playback_mode, + bool clear_caches) { + playing_trace_ = true; + graphics_system_->command_processor()->CallInThread([=]() { + PlayTraceOnThread(trace_data, trace_size, playback_mode, clear_caches); + }); } void TracePlayer::PlayTraceOnThread(const uint8_t* trace_data, size_t trace_size, - TracePlaybackMode playback_mode) { + TracePlaybackMode playback_mode, + bool clear_caches) { auto memory = graphics_system_->memory(); auto command_processor = graphics_system_->command_processor(); + if (clear_caches) { + command_processor->ClearCaches(); + } + command_processor->set_swap_mode(SwapMode::kIgnored); playback_percent_ = 0; auto trace_end = trace_data + trace_size; diff --git a/src/xenia/gpu/trace_player.h b/src/xenia/gpu/trace_player.h index d3926d460..0c3c6571a 100644 --- a/src/xenia/gpu/trace_player.h +++ b/src/xenia/gpu/trace_player.h @@ -50,9 +50,9 @@ class TracePlayer : public TraceReader { private: void PlayTrace(const uint8_t* trace_data, size_t trace_size, - TracePlaybackMode playback_mode); + TracePlaybackMode playback_mode, bool clear_caches); void PlayTraceOnThread(const uint8_t* trace_data, size_t trace_size, - TracePlaybackMode playback_mode); + TracePlaybackMode playback_mode, bool clear_caches); xe::ui::Loop* loop_; GraphicsSystem* graphics_system_; diff --git a/src/xenia/gpu/trace_reader.cc b/src/xenia/gpu/trace_reader.cc index fb58c436b..16980c28c 100644 --- a/src/xenia/gpu/trace_reader.cc +++ b/src/xenia/gpu/trace_reader.cc @@ -75,6 +75,10 @@ void TraceReader::ParseTrace() { const uint8_t* packet_start_ptr = nullptr; const uint8_t* last_ptr = trace_ptr; bool pending_break = false; + auto current_command_buffer = new CommandBuffer(); + current_frame.command_tree = + std::unique_ptr(current_command_buffer); + while (trace_ptr < trace_data_ + trace_size_) { ++current_frame.command_count; auto type = static_cast(xe::load(trace_ptr)); @@ -94,11 +98,24 @@ void TraceReader::ParseTrace() { auto cmd = reinterpret_cast(trace_ptr); trace_ptr += sizeof(*cmd) + cmd->count * 4; + + // Traverse down a level. + auto sub_command_buffer = new CommandBuffer(); + sub_command_buffer->parent = current_command_buffer; + current_command_buffer->commands.push_back( + CommandBuffer::Command(sub_command_buffer)); + current_command_buffer = sub_command_buffer; break; } case TraceCommandType::kIndirectBufferEnd: { auto cmd = reinterpret_cast(trace_ptr); trace_ptr += sizeof(*cmd); + + // Go back up a level. If parent is null, this frame started in an + // indirect buffer. 
+ if (current_command_buffer->parent) { + current_command_buffer = current_command_buffer->parent; + } break; } case TraceCommandType::kPacketStart: { @@ -125,6 +142,8 @@ void TraceReader::ParseTrace() { command.end_ptr = trace_ptr; current_frame.commands.push_back(std::move(command)); last_ptr = trace_ptr; + current_command_buffer->commands.push_back(CommandBuffer::Command( + uint32_t(current_frame.commands.size() - 1))); break; } case PacketCategory::kSwap: @@ -136,6 +155,9 @@ void TraceReader::ParseTrace() { if (pending_break) { current_frame.end_ptr = trace_ptr; frames_.push_back(std::move(current_frame)); + current_command_buffer = new CommandBuffer(); + current_frame.command_tree = + std::unique_ptr(current_command_buffer); current_frame.start_ptr = trace_ptr; current_frame.end_ptr = nullptr; current_frame.command_count = 0; diff --git a/src/xenia/gpu/trace_reader.h b/src/xenia/gpu/trace_reader.h index 5445bd1f9..b3245da46 100644 --- a/src/xenia/gpu/trace_reader.h +++ b/src/xenia/gpu/trace_reader.h @@ -11,6 +11,7 @@ #define XENIA_GPU_TRACE_READER_H_ #include +#include #include "xenia/base/mapped_memory.h" #include "xenia/gpu/trace_protocol.h" @@ -51,6 +52,42 @@ namespace gpu { class TraceReader { public: + struct CommandBuffer { + struct Command { + enum class Type { + kCommand, + kBuffer, + }; + + Command() {} + Command(Command&& other) { + type = other.type; + command_id = other.command_id; + command_subtree = std::move(other.command_subtree); + } + Command(CommandBuffer* buf) { + type = Type::kBuffer; + command_subtree = std::unique_ptr(buf); + } + Command(uint32_t id) { + type = Type::kCommand; + command_id = id; + } + ~Command() = default; + + Type type; + uint32_t command_id = -1; + std::unique_ptr command_subtree = nullptr; + }; + + CommandBuffer() {} + ~CommandBuffer() {} + + // Parent command buffer, if one exists. + CommandBuffer* parent = nullptr; + std::vector commands; + }; + struct Frame { struct Command { enum class Type { @@ -74,7 +111,12 @@ class TraceReader { const uint8_t* start_ptr = nullptr; const uint8_t* end_ptr = nullptr; int command_count = 0; + + // Flat list of all commands in this frame. 
std::vector commands; + + // Tree of all command buffers + std::unique_ptr command_tree; }; TraceReader() = default; diff --git a/src/xenia/gpu/trace_viewer.cc b/src/xenia/gpu/trace_viewer.cc index 7ce20c7ca..8079631f5 100644 --- a/src/xenia/gpu/trace_viewer.cc +++ b/src/xenia/gpu/trace_viewer.cc @@ -390,6 +390,66 @@ void TraceViewer::DrawPacketDisassemblerUI() { ImGui::End(); } +int TraceViewer::RecursiveDrawCommandBufferUI( + const TraceReader::Frame* frame, TraceReader::CommandBuffer* buffer) { + int selected_id = -1; + int column_width = int(ImGui::GetContentRegionMax().x); + + for (size_t i = 0; i < buffer->commands.size(); i++) { + switch (buffer->commands[i].type) { + case TraceReader::CommandBuffer::Command::Type::kBuffer: { + auto subtree = buffer->commands[i].command_subtree.get(); + if (!subtree->commands.size()) { + continue; + } + + ImGui::PushID(int(i)); + if (ImGui::TreeNode((void*)0, "Indirect Buffer %d", i)) { + ImGui::Indent(); + auto id = RecursiveDrawCommandBufferUI( + frame, buffer->commands[i].command_subtree.get()); + ImGui::Unindent(); + ImGui::TreePop(); + + if (id != -1) { + selected_id = id; + } + } + ImGui::PopID(); + } break; + + case TraceReader::CommandBuffer::Command::Type::kCommand: { + uint32_t command_id = buffer->commands[i].command_id; + + const auto& command = frame->commands[command_id]; + bool is_selected = command_id == player_->current_command_index(); + const char* label; + switch (command.type) { + case TraceReader::Frame::Command::Type::kDraw: + label = "Draw"; + break; + case TraceReader::Frame::Command::Type::kSwap: + label = "Swap"; + break; + } + + ImGui::PushID(command_id); + if (ImGui::Selectable(label, &is_selected)) { + selected_id = command_id; + } + ImGui::SameLine(column_width - 60.0f); + ImGui::Text("%d", command_id); + ImGui::PopID(); + // if (did_seek && target_command == i) { + // ImGui::SetScrollPosHere(); + // } + } break; + } + } + + return selected_id; +} + void TraceViewer::DrawCommandListUI() { ImGui::SetNextWindowPos(ImVec2(5, 70), ImGuiSetCond_FirstUseEver); if (!ImGui::Begin("Command List", nullptr, ImVec2(200, 640))) { @@ -473,31 +533,12 @@ void TraceViewer::DrawCommandListUI() { ImGui::SetScrollPosHere(); } - for (int i = 0; i < int(frame->commands.size()); ++i) { - ImGui::PushID(i); - is_selected = i == player_->current_command_index(); - const auto& command = frame->commands[i]; - const char* label; - switch (command.type) { - case TraceReader::Frame::Command::Type::kDraw: - label = "Draw"; - break; - case TraceReader::Frame::Command::Type::kSwap: - label = "Swap"; - break; - } - if (ImGui::Selectable(label, &is_selected)) { - if (!player_->is_playing_trace()) { - player_->SeekCommand(i); - } - } - ImGui::SameLine(column_width - 60.0f); - ImGui::Text("%d", i); - ImGui::PopID(); - if (did_seek && target_command == i) { - ImGui::SetScrollPosHere(); - } + auto id = RecursiveDrawCommandBufferUI(frame, frame->command_tree.get()); + if (id != -1 && id != player_->current_command_index() && + !player_->is_playing_trace()) { + player_->SeekCommand(id); } + ImGui::EndChild(); ImGui::End(); } @@ -639,8 +680,8 @@ void TraceViewer::DrawTextureInfo( ImGui::Columns(2); ImVec2 button_size(256, 256); - if (ImGui::ImageButton(ImTextureID(texture | ui::ImGuiDrawer::kIgnoreAlpha), - button_size, ImVec2(0, 0), ImVec2(1, 1))) { + if (ImGui::ImageButton(ImTextureID(texture), button_size, ImVec2(0, 0), + ImVec2(1, 1))) { // show viewer } ImGui::NextColumn(); @@ -1108,11 +1149,14 @@ void TraceViewer::DrawStateUI() { ((window_scissor_br >> 
16) & 0x7FFF) - ((window_scissor_tl >> 16) & 0x7FFF)); uint32_t surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32; + uint32_t surface_actual = (surface_info >> 18) & 0x3FFF; uint32_t surface_pitch = surface_info & 0x3FFF; auto surface_msaa = (surface_info >> 16) & 0x3; static const char* kMsaaNames[] = { "1X", "2X", "4X", }; + ImGui::BulletText("Surface Pitch - Actual: %d - %d", surface_pitch, + surface_actual); ImGui::BulletText("Surface MSAA: %s", kMsaaNames[surface_msaa]); uint32_t vte_control = regs[XE_GPU_REG_PA_CL_VTE_CNTL].u32; bool vport_xscale_enable = (vte_control & (1 << 0)) > 0; @@ -1124,6 +1168,9 @@ void TraceViewer::DrawStateUI() { assert_true(vport_xscale_enable == vport_yscale_enable == vport_zscale_enable == vport_xoffset_enable == vport_yoffset_enable == vport_zoffset_enable); + if (!vport_xscale_enable) { + ImGui::PushStyleColor(ImGuiCol_Text, kColorIgnored); + } ImGui::BulletText( "Viewport Offset: %f, %f, %f", vport_xoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 : 0, @@ -1134,6 +1181,10 @@ void TraceViewer::DrawStateUI() { vport_xscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 : 1, vport_yscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 : 1, vport_zscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 : 1); + if (!vport_xscale_enable) { + ImGui::PopStyleColor(); + } + ImGui::BulletText("Vertex Format: %s, %s, %s, %s", ((vte_control >> 8) & 0x1) ? "x/w0" : "x", ((vte_control >> 8) & 0x1) ? "y/w0" : "y", @@ -1318,7 +1369,7 @@ void TraceViewer::DrawStateUI() { if (write_mask) { auto color_target = GetColorRenderTarget(surface_pitch, surface_msaa, color_base, color_format); - tex = ImTextureID(color_target | ui::ImGuiDrawer::kIgnoreAlpha); + tex = ImTextureID(color_target); if (ImGui::ImageButton(tex, button_size, ImVec2(0, 0), ImVec2(1, 1))) { // show viewer @@ -1330,10 +1381,9 @@ void TraceViewer::DrawStateUI() { } if (ImGui::IsItemHovered()) { ImGui::BeginTooltip(); - ImGui::Text( - "Color Target %d (%s), base %.4X, pitch %d, msaa %d, format %d", - i, write_mask ? "enabled" : "disabled", color_base, surface_pitch, - surface_msaa, color_format); + ImGui::Text("Color Target %d (%s), base %.4X, pitch %d, format %d", i, + write_mask ? 
"enabled" : "disabled", color_base, + surface_pitch, color_format); if (tex) { ImVec2 rel_pos; @@ -1407,17 +1457,19 @@ void TraceViewer::DrawStateUI() { auto button_pos = ImGui::GetCursorScreenPos(); ImVec2 button_size(256, 256); - ImGui::ImageButton( - ImTextureID(depth_target | ui::ImGuiDrawer::kIgnoreAlpha), - button_size, ImVec2(0, 0), ImVec2(1, 1)); + ImGui::ImageButton(ImTextureID(depth_target), button_size, ImVec2(0, 0), + ImVec2(1, 1)); if (ImGui::IsItemHovered()) { ImGui::BeginTooltip(); + ImGui::Text("Depth Target: base %.4X, pitch %d, format %d", depth_base, + surface_pitch, depth_format); + ImVec2 rel_pos; rel_pos.x = ImGui::GetMousePos().x - button_pos.x; rel_pos.y = ImGui::GetMousePos().y - button_pos.y; - ZoomedImage(ImTextureID(depth_target | ui::ImGuiDrawer::kIgnoreAlpha), - rel_pos, button_size, 32.f, ImVec2(256, 256)); + ZoomedImage(ImTextureID(depth_target), rel_pos, button_size, 32.f, + ImVec2(256, 256)); ImGui::EndTooltip(); } diff --git a/src/xenia/gpu/trace_viewer.h b/src/xenia/gpu/trace_viewer.h index 6f7c900fc..7e82ad831 100644 --- a/src/xenia/gpu/trace_viewer.h +++ b/src/xenia/gpu/trace_viewer.h @@ -80,6 +80,8 @@ class TraceViewer { void DrawUI(); void DrawControllerUI(); void DrawPacketDisassemblerUI(); + int RecursiveDrawCommandBufferUI(const TraceReader::Frame* frame, + TraceReader::CommandBuffer* buffer); void DrawCommandListUI(); void DrawStateUI(); From aa038fbf23cb26e411b38252231e0f920c46e56d Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sun, 1 May 2016 15:48:31 -0500 Subject: [PATCH 47/77] Skip the wrapping packet end after parsing IB end (to avoid false draws appearing) --- src/xenia/gpu/trace_reader.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/xenia/gpu/trace_reader.cc b/src/xenia/gpu/trace_reader.cc index 16980c28c..6bedfb9b4 100644 --- a/src/xenia/gpu/trace_reader.cc +++ b/src/xenia/gpu/trace_reader.cc @@ -111,6 +111,11 @@ void TraceReader::ParseTrace() { auto cmd = reinterpret_cast(trace_ptr); trace_ptr += sizeof(*cmd); + // IB packet is wrapped in a kPacketStart/kPacketEnd. Skip the end. + auto end_cmd = reinterpret_cast(trace_ptr); + assert_true(end_cmd->type == TraceCommandType::kPacketEnd); + trace_ptr += sizeof(*cmd); + // Go back up a level. If parent is null, this frame started in an // indirect buffer. if (current_command_buffer->parent) { From d18c99aab6517e560dc2b8f2a022ddea7abf1a35 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Tue, 3 May 2016 14:05:34 -0500 Subject: [PATCH 48/77] RenderCache: Account for MSAA when calculating tile sizes. Add a new flag to enable native MSAA (this does not work properly at the moment) --- src/xenia/gpu/vulkan/render_cache.cc | 283 +++++++++++++---------- src/xenia/gpu/vulkan/render_cache.h | 25 +- src/xenia/gpu/vulkan/vulkan_gpu_flags.cc | 3 + src/xenia/gpu/vulkan/vulkan_gpu_flags.h | 2 + 4 files changed, 186 insertions(+), 127 deletions(-) diff --git a/src/xenia/gpu/vulkan/render_cache.cc b/src/xenia/gpu/vulkan/render_cache.cc index 3df5e4c9e..7d73951b5 100644 --- a/src/xenia/gpu/vulkan/render_cache.cc +++ b/src/xenia/gpu/vulkan/render_cache.cc @@ -149,7 +149,8 @@ CachedTileView::CachedTileView(ui::vulkan::VulkanDevice* device, vulkan_format = DepthRenderTargetFormatToVkFormat(edram_format); } assert_true(vulkan_format != VK_FORMAT_UNDEFINED); - assert_true(bpp == 4); + // FIXME(DrChat): Was this check necessary? + // assert_true(bpp == 4); // Create the image with the desired properties. 
VkImageCreateInfo image_info; @@ -165,23 +166,25 @@ CachedTileView::CachedTileView(ui::vulkan::VulkanDevice* device, image_info.extent.depth = 1; image_info.mipLevels = 1; image_info.arrayLayers = 1; - // image_info.samples = VK_SAMPLE_COUNT_1_BIT; - //* - auto msaa_samples = static_cast(key.msaa_samples); - switch (msaa_samples) { - case MsaaSamples::k1X: - image_info.samples = VK_SAMPLE_COUNT_1_BIT; - break; - case MsaaSamples::k2X: - image_info.samples = VK_SAMPLE_COUNT_2_BIT; - break; - case MsaaSamples::k4X: - image_info.samples = VK_SAMPLE_COUNT_4_BIT; - break; - default: - assert_unhandled_case(msaa_samples); + if (FLAGS_vulkan_native_msaa) { + auto msaa_samples = static_cast(key.msaa_samples); + switch (msaa_samples) { + case MsaaSamples::k1X: + image_info.samples = VK_SAMPLE_COUNT_1_BIT; + break; + case MsaaSamples::k2X: + image_info.samples = VK_SAMPLE_COUNT_2_BIT; + break; + case MsaaSamples::k4X: + image_info.samples = VK_SAMPLE_COUNT_4_BIT; + break; + default: + assert_unhandled_case(msaa_samples); + } + } else { + image_info.samples = VK_SAMPLE_COUNT_1_BIT; } - //*/ + sample_count = image_info.samples; image_info.tiling = VK_IMAGE_TILING_OPTIMAL; image_info.usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | @@ -243,7 +246,10 @@ CachedTileView::CachedTileView(ui::vulkan::VulkanDevice* device, image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; image_barrier.image = image; - image_barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + image_barrier.subresourceRange.aspectMask = + key.color_or_depth + ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; image_barrier.subresourceRange.baseMipLevel = 0; image_barrier.subresourceRange.levelCount = 1; image_barrier.subresourceRange.baseArrayLayer = 0; @@ -338,19 +344,23 @@ CachedRenderPass::CachedRenderPass(VkDevice device, std::memcpy(&config, &desired_config, sizeof(config)); VkSampleCountFlagBits sample_count; - switch (desired_config.surface_msaa) { - case MsaaSamples::k1X: - sample_count = VK_SAMPLE_COUNT_1_BIT; - break; - case MsaaSamples::k2X: - sample_count = VK_SAMPLE_COUNT_2_BIT; - break; - case MsaaSamples::k4X: - sample_count = VK_SAMPLE_COUNT_4_BIT; - break; - default: - assert_unhandled_case(desired_config.surface_msaa); - break; + if (FLAGS_vulkan_native_msaa) { + switch (desired_config.surface_msaa) { + case MsaaSamples::k1X: + sample_count = VK_SAMPLE_COUNT_1_BIT; + break; + case MsaaSamples::k2X: + sample_count = VK_SAMPLE_COUNT_2_BIT; + break; + case MsaaSamples::k4X: + sample_count = VK_SAMPLE_COUNT_4_BIT; + break; + default: + assert_unhandled_case(desired_config.surface_msaa); + break; + } + } else { + sample_count = VK_SAMPLE_COUNT_1_BIT; } // Initialize all attachments to default unused. 
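The MsaaSamples-to-Vulkan mapping above now appears in both CachedTileView
and CachedRenderPass behind the same FLAGS_vulkan_native_msaa gate (native
MSAA samples the image directly; otherwise everything stays single-sampled
and 2x/4x is emulated by scaling tile and viewport dimensions). A shared
helper would keep the two call sites in sync -- a hypothetical refactor, not
part of this patch:

    VkSampleCountFlagBits SampleCountFromMsaa(MsaaSamples samples) {
      if (!FLAGS_vulkan_native_msaa) {
        // Emulated MSAA renders single-sampled at scaled dimensions.
        return VK_SAMPLE_COUNT_1_BIT;
      }
      switch (samples) {
        case MsaaSamples::k1X:
          return VK_SAMPLE_COUNT_1_BIT;
        case MsaaSamples::k2X:
          return VK_SAMPLE_COUNT_2_BIT;
        case MsaaSamples::k4X:
          return VK_SAMPLE_COUNT_4_BIT;
        default:
          assert_unhandled_case(samples);
          return VK_SAMPLE_COUNT_1_BIT;
      }
    }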
@@ -538,7 +548,7 @@ bool RenderCache::dirty() const { regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; dirty |= cur_regs.pa_sc_window_scissor_br != regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; - dirty |= (cur_regs.rb_depthcontrol & (0x4 | 0x2)) != + dirty |= (cur_regs.rb_depthcontrol & (0x4 | 0x2)) < (regs[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2)); return dirty; } @@ -561,7 +571,6 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, bool dirty = false; dirty |= SetShadowRegister(®s.rb_modecontrol, XE_GPU_REG_RB_MODECONTROL); dirty |= SetShadowRegister(®s.rb_surface_info, XE_GPU_REG_RB_SURFACE_INFO); - dirty |= SetShadowRegister(®s.rb_color_mask, XE_GPU_REG_RB_COLOR_MASK); dirty |= SetShadowRegister(®s.rb_color_info, XE_GPU_REG_RB_COLOR_INFO); dirty |= SetShadowRegister(®s.rb_color1_info, XE_GPU_REG_RB_COLOR1_INFO); dirty |= SetShadowRegister(®s.rb_color2_info, XE_GPU_REG_RB_COLOR2_INFO); @@ -572,7 +581,7 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, dirty |= SetShadowRegister(®s.pa_sc_window_scissor_br, XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR); dirty |= - (regs.rb_depthcontrol & (0x4 | 0x2)) != + (regs.rb_depthcontrol & (0x4 | 0x2)) < (register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2)); regs.rb_depthcontrol = register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2); @@ -593,14 +602,8 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, return nullptr; } - // Speculatively see if targets are actually used so we can skip copies - for (int i = 0; i < 4; i++) { - uint32_t color_mask = (regs.rb_color_mask >> (i * 4)) & 0xF; - config->color[i].used = - config->mode_control == xenos::ModeControl::kColorDepth && - color_mask != 0; - } - config->depth_stencil.used = !!(regs.rb_depthcontrol & (0x4 | 0x2)); + // Initial state update. + UpdateState(); current_state_.render_pass = render_pass; current_state_.render_pass_handle = render_pass->handle; @@ -610,7 +613,7 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, // Depth auto depth_target = current_state_.framebuffer->depth_stencil_attachment; if (depth_target && current_state_.config.depth_stencil.used) { - UpdateTileView(command_buffer, depth_target, true); + // UpdateTileView(command_buffer, depth_target, true); } // Color @@ -620,7 +623,7 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, continue; } - UpdateTileView(command_buffer, target, true); + // UpdateTileView(command_buffer, target, true); } } if (!render_pass) { @@ -693,12 +696,23 @@ bool RenderCache::ParseConfiguration(RenderConfiguration* config) { case ColorRenderTargetFormat::k_8_8_8_8_GAMMA: config->color[i].format = ColorRenderTargetFormat::k_8_8_8_8; break; + case ColorRenderTargetFormat::k_2_10_10_10_unknown: + config->color[i].format = ColorRenderTargetFormat::k_2_10_10_10; + break; + case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_unknown: + config->color[i].format = ColorRenderTargetFormat::k_2_10_10_10_FLOAT; + break; } + + // Make sure all unknown bits are unset. 
+ // RDR sets bit 0x00400000 + // assert_zero(color_info[i] & ~0x000F0FFF); } } else { for (int i = 0; i < 4; ++i) { config->color[i].edram_base = 0; config->color[i].format = ColorRenderTargetFormat::k_8_8_8_8; + config->color[i].used = false; } } @@ -708,9 +722,13 @@ bool RenderCache::ParseConfiguration(RenderConfiguration* config) { config->depth_stencil.edram_base = regs.rb_depth_info & 0xFFF; config->depth_stencil.format = static_cast((regs.rb_depth_info >> 16) & 0x1); + + // Make sure all unknown bits are unset. + // assert_zero(regs.rb_depth_info & ~0x00010FFF); } else { config->depth_stencil.edram_base = 0; config->depth_stencil.format = DepthRenderTargetFormat::kD24S8; + config->depth_stencil.used = false; } return true; @@ -753,15 +771,22 @@ bool RenderCache::ConfigureRenderPass(VkCommandBuffer command_buffer, // If no framebuffer was found in the cache create a new one. if (!framebuffer) { + uint32_t tile_width = config->surface_msaa == MsaaSamples::k4X ? 40 : 80; + uint32_t tile_height = config->surface_msaa != MsaaSamples::k1X ? 8 : 16; + CachedTileView* target_color_attachments[4] = {nullptr, nullptr, nullptr, nullptr}; for (int i = 0; i < 4; ++i) { TileViewKey color_key; color_key.tile_offset = config->color[i].edram_base; - color_key.tile_width = xe::round_up(config->surface_pitch_px, 80) / 80; - color_key.tile_height = xe::round_up(config->surface_height_px, 16) / 16; + color_key.tile_width = + xe::round_up(config->surface_pitch_px, tile_width) / tile_width; + color_key.tile_height = std::min( + 2560 / tile_height, 160u); // xe::round_up(config->surface_height_px, + // tile_height) / tile_height; color_key.color_or_depth = 1; - color_key.msaa_samples = static_cast(config->surface_msaa); + color_key.msaa_samples = + 0; // static_cast(config->surface_msaa); color_key.edram_format = static_cast(config->color[i].format); target_color_attachments[i] = FindOrCreateTileView(command_buffer, color_key); @@ -774,12 +799,13 @@ bool RenderCache::ConfigureRenderPass(VkCommandBuffer command_buffer, TileViewKey depth_stencil_key; depth_stencil_key.tile_offset = config->depth_stencil.edram_base; depth_stencil_key.tile_width = - xe::round_up(config->surface_pitch_px, 80) / 80; - depth_stencil_key.tile_height = - xe::round_up(config->surface_height_px, 16) / 16; + xe::round_up(config->surface_pitch_px, tile_width) / tile_width; + depth_stencil_key.tile_height = std::min( + 2560 / tile_height, 160u); // xe::round_up(config->surface_height_px, + // tile_height) / tile_height; depth_stencil_key.color_or_depth = 0; depth_stencil_key.msaa_samples = - static_cast(config->surface_msaa); + 0; // static_cast(config->surface_msaa); depth_stencil_key.edram_format = static_cast(config->depth_stencil.format); auto target_depth_stencil_attachment = @@ -819,6 +845,11 @@ CachedTileView* RenderCache::FindOrCreateTileView( void RenderCache::UpdateTileView(VkCommandBuffer command_buffer, CachedTileView* view, bool load, bool insert_barrier) { + uint32_t tile_width = + view->key.msaa_samples == uint16_t(MsaaSamples::k4X) ? 40 : 80; + uint32_t tile_height = + view->key.msaa_samples != uint16_t(MsaaSamples::k1X) ? 
8 : 16; + if (insert_barrier) { VkBufferMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; @@ -834,7 +865,10 @@ void RenderCache::UpdateTileView(VkCommandBuffer command_buffer, barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.buffer = edram_buffer_; barrier.offset = view->key.tile_offset * 5120; - barrier.size = view->key.tile_width * 80 * view->key.tile_height * 16 * 4; + barrier.size = view->key.tile_width * tile_width * view->key.tile_height * + tile_height * view->key.color_or_depth + ? 4 + : 1; vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, &barrier, 0, nullptr); @@ -850,8 +884,8 @@ void RenderCache::UpdateTileView(VkCommandBuffer command_buffer, ? VK_IMAGE_ASPECT_COLOR_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; region.imageOffset = {0, 0, 0}; - region.imageExtent = {view->key.tile_width * 80u, view->key.tile_height * 16u, - 1}; + region.imageExtent = {view->key.tile_width * tile_width, + view->key.tile_height * tile_height, 1}; if (load) { vkCmdCopyBufferToImage(command_buffer, edram_buffer_, view->image, VK_IMAGE_LAYOUT_GENERAL, 1, ®ion); @@ -912,12 +946,27 @@ void RenderCache::EndRenderPass() { [](CachedTileView const* a, CachedTileView const* b) { return *a < *b; }); for (auto view : cached_views) { - UpdateTileView(current_command_buffer_, view, false, false); + // UpdateTileView(current_command_buffer_, view, false, false); } current_command_buffer_ = nullptr; } +void RenderCache::UpdateState() { + // Keep track of whether color attachments were used or not in this pass. + uint32_t rb_color_mask = register_file_->values[XE_GPU_REG_RB_COLOR_MASK].u32; + uint32_t rb_depthcontrol = + register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32; + for (int i = 0; i < 4; i++) { + uint32_t color_mask = (rb_color_mask >> (i * 4)) & 0xF; + current_state_.config.color[i].used |= + current_state_.config.mode_control == xenos::ModeControl::kColorDepth && + color_mask != 0; + } + + current_state_.config.depth_stencil.used |= !!(rb_depthcontrol & (0x4 | 0x2)); +} + void RenderCache::ClearCache() { // TODO(benvanik): caching. } @@ -999,47 +1048,39 @@ void RenderCache::BlitToImage(VkCommandBuffer command_buffer, bool color_or_depth, uint32_t format, VkFilter filter, VkOffset3D offset, VkExtent3D extents) { + if (color_or_depth) { + // Adjust similar formats for easier matching. + switch (static_cast(format)) { + case ColorRenderTargetFormat::k_8_8_8_8_GAMMA: + format = uint32_t(ColorRenderTargetFormat::k_8_8_8_8); + break; + case ColorRenderTargetFormat::k_2_10_10_10_unknown: + format = uint32_t(ColorRenderTargetFormat::k_2_10_10_10); + break; + case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_unknown: + format = uint32_t(ColorRenderTargetFormat::k_2_10_10_10_FLOAT); + break; + } + } + + uint32_t tile_width = num_samples == MsaaSamples::k4X ? 40 : 80; + uint32_t tile_height = num_samples != MsaaSamples::k1X ? 8 : 16; + // Grab a tile view that represents the source image. TileViewKey key; key.color_or_depth = color_or_depth ? 
1 : 0; - key.msaa_samples = static_cast(num_samples); + key.msaa_samples = 0; // static_cast(num_samples); key.edram_format = format; key.tile_offset = edram_base; - key.tile_width = xe::round_up(pitch, 80) / 80; - key.tile_height = xe::round_up(height, 16) / 16; + key.tile_width = xe::round_up(pitch, tile_width) / tile_width; + key.tile_height = + std::min(2560 / tile_height, + 160u); // xe::round_up(height, tile_height) / tile_height; auto tile_view = FindOrCreateTileView(command_buffer, key); assert_not_null(tile_view); - // Issue a memory barrier before we update this tile view. - VkBufferMemoryBarrier buffer_barrier; - buffer_barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - buffer_barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - buffer_barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - buffer_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - buffer_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - buffer_barrier.buffer = edram_buffer_; - buffer_barrier.offset = edram_base * 5120; - // TODO: Calculate this accurately (need texel size) - buffer_barrier.size = extents.width * extents.height * 4; - - vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, - &buffer_barrier, 0, nullptr); - - // Update the tile view with current EDRAM contents. - // TODO: Heuristics to determine if this copy is avoidable. - // TODO(DrChat): Stencil copies. - VkBufferImageCopy buffer_copy; - buffer_copy.bufferOffset = edram_base * 5120; - buffer_copy.bufferImageHeight = 0; - buffer_copy.bufferRowLength = 0; - buffer_copy.imageSubresource = {0, 0, 0, 1}; - buffer_copy.imageSubresource.aspectMask = - color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; - buffer_copy.imageExtent = {key.tile_width * 80u, key.tile_height * 16u, 1u}; - buffer_copy.imageOffset = {0, 0, 0}; - vkCmdCopyBufferToImage(command_buffer, edram_buffer_, tile_view->image, - VK_IMAGE_LAYOUT_GENERAL, 1, &buffer_copy); + // Update the view with the latest contents. + // UpdateTileView(command_buffer, tile_view, true, true); // Transition the image into a transfer destination layout, if needed. // TODO: Util function for this @@ -1063,11 +1104,11 @@ void RenderCache::BlitToImage(VkCommandBuffer command_buffer, nullptr, 1, &image_barrier); // If we overflow we'll lose the device here. - assert_true(extents.width <= key.tile_width * 80u); - assert_true(extents.height <= key.tile_height * 16u); + assert_true(extents.width <= key.tile_width * tile_width); + assert_true(extents.height <= key.tile_height * tile_height); // Now issue the blit to the destination. - if (num_samples == MsaaSamples::k1X) { + if (tile_view->sample_count == VK_SAMPLE_COUNT_1_BIT) { VkImageBlit image_blit; image_blit.srcSubresource = {0, 0, 0, 1}; image_blit.srcSubresource.aspectMask = @@ -1127,14 +1168,32 @@ void RenderCache::ClearEDRAMColor(VkCommandBuffer command_buffer, // TODO: For formats <= 4 bpp, we can directly fill the EDRAM buffer. Just // need to detect this and calculate a value. + // Adjust similar formats for easier matching. 
+ switch (format) { + case ColorRenderTargetFormat::k_8_8_8_8_GAMMA: + format = ColorRenderTargetFormat::k_8_8_8_8; + break; + case ColorRenderTargetFormat::k_2_10_10_10_unknown: + format = ColorRenderTargetFormat::k_2_10_10_10; + break; + case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_unknown: + format = ColorRenderTargetFormat::k_2_10_10_10_FLOAT; + break; + } + + uint32_t tile_width = num_samples == MsaaSamples::k4X ? 40 : 80; + uint32_t tile_height = num_samples != MsaaSamples::k1X ? 8 : 16; + // Grab a tile view (as we need to clear an image first) TileViewKey key; key.color_or_depth = 1; - key.msaa_samples = static_cast(num_samples); + key.msaa_samples = 0; // static_cast(num_samples); key.edram_format = static_cast(format); key.tile_offset = edram_base; - key.tile_width = xe::round_up(pitch, 80) / 80; - key.tile_height = xe::round_up(height, 16) / 16; + key.tile_width = xe::round_up(pitch, tile_width) / tile_width; + key.tile_height = + std::min(2560 / tile_height, + 160u); // xe::round_up(height, tile_height) / tile_height; auto tile_view = FindOrCreateTileView(command_buffer, key); assert_not_null(tile_view); @@ -1147,16 +1206,7 @@ void RenderCache::ClearEDRAMColor(VkCommandBuffer command_buffer, VK_IMAGE_LAYOUT_GENERAL, &clear_value, 1, &range); // Copy image back into EDRAM buffer - VkBufferImageCopy copy_range; - copy_range.bufferOffset = edram_base * 5120; - copy_range.bufferImageHeight = 0; - copy_range.bufferRowLength = 0; - copy_range.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; - copy_range.imageExtent = {key.tile_width * 80u, key.tile_height * 16u, 1u}; - copy_range.imageOffset = {0, 0, 0}; - vkCmdCopyImageToBuffer(command_buffer, tile_view->image, - VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, - ©_range); + // UpdateTileView(command_buffer, tile_view, false, false); } void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, @@ -1168,14 +1218,19 @@ void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, // TODO: For formats <= 4 bpp, we can directly fill the EDRAM buffer. Just // need to detect this and calculate a value. + uint32_t tile_width = num_samples == MsaaSamples::k4X ? 40 : 80; + uint32_t tile_height = num_samples != MsaaSamples::k1X ? 8 : 16; + // Grab a tile view (as we need to clear an image first) TileViewKey key; key.color_or_depth = 0; - key.msaa_samples = static_cast(num_samples); + key.msaa_samples = 0; // static_cast(num_samples); key.edram_format = static_cast(format); key.tile_offset = edram_base; - key.tile_width = xe::round_up(pitch, 80) / 80; - key.tile_height = xe::round_up(height, 16) / 16; + key.tile_width = xe::round_up(pitch, tile_width) / tile_width; + key.tile_height = + std::min(2560 / tile_height, + 160u); // xe::round_up(height, tile_height) / tile_height; auto tile_view = FindOrCreateTileView(command_buffer, key); assert_not_null(tile_view); @@ -1191,19 +1246,7 @@ void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, VK_IMAGE_LAYOUT_GENERAL, &clear_value, 1, &range); // Copy image back into EDRAM buffer - // TODO(DrChat): Stencil copies. 
- VkBufferImageCopy copy_range; - copy_range.bufferOffset = edram_base * 5120; - copy_range.bufferImageHeight = 0; - copy_range.bufferRowLength = 0; - copy_range.imageSubresource = { - VK_IMAGE_ASPECT_DEPTH_BIT, 0, 0, 1, - }; - copy_range.imageExtent = {key.tile_width * 80u, key.tile_height * 16u, 1u}; - copy_range.imageOffset = {0, 0, 0}; - vkCmdCopyImageToBuffer(command_buffer, tile_view->image, - VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, - ©_range); + // UpdateTileView(command_buffer, tile_view, false, false); } void RenderCache::FillEDRAM(VkCommandBuffer command_buffer, uint32_t value) { diff --git a/src/xenia/gpu/vulkan/render_cache.h b/src/xenia/gpu/vulkan/render_cache.h index 86edac7bc..4eeca42bf 100644 --- a/src/xenia/gpu/vulkan/render_cache.h +++ b/src/xenia/gpu/vulkan/render_cache.h @@ -57,6 +57,8 @@ class CachedTileView { VkImageView image_view = nullptr; // Memory buffer VkDeviceMemory memory = nullptr; + // Image sample count + VkSampleCountFlagBits sample_count = VK_SAMPLE_COUNT_1_BIT; CachedTileView(ui::vulkan::VulkanDevice* device, VkCommandBuffer command_buffer, VkDeviceMemory edram_memory, @@ -81,9 +83,9 @@ class CachedTileView { struct RenderConfiguration { // Render mode (color+depth, depth-only, etc). xenos::ModeControl mode_control; - // Target surface pitch, in pixels. + // Target surface pitch multiplied by MSAA, in pixels. uint32_t surface_pitch_px; - // ESTIMATED target surface height, in pixels. + // ESTIMATED target surface height multiplied by MSAA, in pixels. uint32_t surface_height_px; // Surface MSAA setting. MsaaSamples surface_msaa; @@ -111,6 +113,9 @@ struct RenderState { // Target framebuffer bound to the render pass. CachedFramebuffer* framebuffer = nullptr; VkFramebuffer framebuffer_handle = nullptr; + + bool color_attachment_written[4] = {false}; + bool depth_attachment_written = false; }; // Manages the virtualized EDRAM and the render target cache. @@ -135,9 +140,13 @@ struct RenderState { // 320px by rounding up to the next tile. // // MSAA and other settings will modify the exact pixel sizes, like 4X makes -// each tile effectively 40x8px, but they are still all 5120b. As we try to -// emulate this we adjust our viewport when rendering to stretch pixels as -// needed. +// each tile effectively 40x8px / 2X makes each tile 80x8px, but they are still +// all 5120b. As we try to emulate this we adjust our viewport when rendering to +// stretch pixels as needed. +// +// It appears that games also take advantage of MSAA stretching tiles when doing +// clears. Games will clear a view with 1/2X pitch/height and 4X MSAA and then +// later draw to that view with 1X pitch/height and 1X MSAA. // // The good news is that games cannot read EDRAM directly but must use a copy // operation to get the data out. That gives us a chance to do whatever we @@ -269,6 +278,9 @@ class RenderCache { // The command buffer will be transitioned out of the render pass phase. void EndRenderPass(); + // Updates current render state. Call this every draw with an open render pass + void UpdateState(); + // Clears all cached content. 
void ClearCache(); @@ -346,13 +358,12 @@ class RenderCache { struct ShadowRegisters { uint32_t rb_modecontrol; uint32_t rb_surface_info; - uint32_t rb_color_mask; uint32_t rb_color_info; uint32_t rb_color1_info; uint32_t rb_color2_info; uint32_t rb_color3_info; - uint32_t rb_depthcontrol; uint32_t rb_depth_info; + uint32_t rb_depthcontrol; uint32_t pa_sc_window_scissor_tl; uint32_t pa_sc_window_scissor_br; diff --git a/src/xenia/gpu/vulkan/vulkan_gpu_flags.cc b/src/xenia/gpu/vulkan/vulkan_gpu_flags.cc index 1f018db54..52bc10c84 100644 --- a/src/xenia/gpu/vulkan/vulkan_gpu_flags.cc +++ b/src/xenia/gpu/vulkan/vulkan_gpu_flags.cc @@ -11,3 +11,6 @@ DEFINE_bool(vulkan_renderdoc_capture_all, false, "Capture everything with RenderDoc."); +DEFINE_bool(vulkan_native_msaa, true, "Use native MSAA"); +DEFINE_bool(vulkan_dump_disasm, false, + "Dump shader disassembly. NVIDIA only supported."); diff --git a/src/xenia/gpu/vulkan/vulkan_gpu_flags.h b/src/xenia/gpu/vulkan/vulkan_gpu_flags.h index ca83dfb7a..169e797c8 100644 --- a/src/xenia/gpu/vulkan/vulkan_gpu_flags.h +++ b/src/xenia/gpu/vulkan/vulkan_gpu_flags.h @@ -15,5 +15,7 @@ #define FINE_GRAINED_DRAW_SCOPES 1 DECLARE_bool(vulkan_renderdoc_capture_all); +DECLARE_bool(vulkan_native_msaa); +DECLARE_bool(vulkan_dump_disasm); #endif // XENIA_GPU_VULKAN_VULKAN_GPU_FLAGS_H_ From 8e8df2e778e59d20350889ea46c1163f0f499921 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Tue, 3 May 2016 14:07:20 -0500 Subject: [PATCH 49/77] PipelineCache: Support shader disasm dumps for nvidia cards. Fix MSAA 2X multiplier. --- src/xenia/gpu/vulkan/pipeline_cache.cc | 169 ++++++++++++++++++++----- src/xenia/gpu/vulkan/pipeline_cache.h | 3 + 2 files changed, 138 insertions(+), 34 deletions(-) diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc index 19db3cd4f..70054f5e2 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/pipeline_cache.cc @@ -17,6 +17,9 @@ #include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/vulkan/vulkan_gpu_flags.h" +#include +#include + namespace xe { namespace gpu { namespace vulkan { @@ -169,9 +172,9 @@ VulkanShader* PipelineCache::LoadShader(ShaderType shader_type, } if (shader->is_valid()) { - XELOGGPU("Generated %s shader at 0x%.8X (%db):\n%s", + XELOGGPU("Generated %s shader at 0x%.8X (%db) - hash %.16" PRIX64 ":\n%s\n", shader_type == ShaderType::kVertex ? "vertex" : "pixel", - guest_address, dword_count * 4, + guest_address, dword_count * 4, shader->ucode_data_hash(), shader->ucode_disassembly().c_str()); } @@ -288,12 +291,105 @@ VkPipeline PipelineCache::GetPipeline(const RenderState* render_state, &pipeline_info, nullptr, &pipeline); CheckResult(err, "vkCreateGraphicsPipelines"); + // Dump shader disassembly. + if (FLAGS_vulkan_dump_disasm) { + DumpShaderDisasmNV(pipeline_info); + } + // Add to cache with the hash key for reuse. cached_pipelines_.insert({hash_key, pipeline}); return pipeline; } +void PipelineCache::DumpShaderDisasmNV( + const VkGraphicsPipelineCreateInfo& pipeline_info) { + // !! HACK !!: This only works on NVidia drivers. Dumps shader disasm. + // This code is super ugly. Update this when NVidia includes an official + // way to dump shader disassembly. 
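  // In outline: (1) create a throwaway VkPipelineCache, (2) compile the
  // pipeline into it, (3) read the raw blob back via vkGetPipelineCacheData,
  // and (4) scan the blob for the "!!NVvp"/"!!NVfp" markers NVIDIA's driver
  // writes before its program text. The blob layout past the spec-defined
  // header is entirely driver-specific, hence the hack warning above.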
+
+  VkPipelineCacheCreateInfo pipeline_cache_info;
+  VkPipelineCache dummy_pipeline_cache;
+  pipeline_cache_info.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO;
+  pipeline_cache_info.pNext = nullptr;
+  pipeline_cache_info.flags = 0;
+  pipeline_cache_info.initialDataSize = 0;
+  pipeline_cache_info.pInitialData = nullptr;
+  auto err = vkCreatePipelineCache(device_, &pipeline_cache_info, nullptr,
+                                   &dummy_pipeline_cache);
+  CheckResult(err, "vkCreatePipelineCache");
+
+  // Create a pipeline on the dummy cache and dump it.
+  VkPipeline dummy_pipeline;
+  err = vkCreateGraphicsPipelines(device_, dummy_pipeline_cache, 1,
+                                  &pipeline_info, nullptr, &dummy_pipeline);
+
+  std::vector<uint8_t> pipeline_data;
+  size_t data_size = 0;
+  err = vkGetPipelineCacheData(device_, dummy_pipeline_cache, &data_size,
+                               nullptr);
+  if (err == VK_SUCCESS) {
+    pipeline_data.resize(data_size);
+    vkGetPipelineCacheData(device_, dummy_pipeline_cache, &data_size,
+                           pipeline_data.data());
+
+    // Scan the data for the disassembly.
+    std::string disasm_vp, disasm_fp;
+
+    const char* disasm_start_vp = nullptr;
+    const char* disasm_start_fp = nullptr;
+    size_t search_offset = 0;
+    const char* search_start =
+        reinterpret_cast<const char*>(pipeline_data.data());
+    while (true) {
+      auto p = reinterpret_cast<const char*>(
+          memchr(pipeline_data.data() + search_offset, '!',
+                 pipeline_data.size() - search_offset));
+      if (!p) {
+        break;
+      }
+      if (!strncmp(p, "!!NV", 4)) {
+        if (!strncmp(p + 4, "vp", 2)) {
+          disasm_start_vp = p;
+        } else if (!strncmp(p + 4, "fp", 2)) {
+          disasm_start_fp = p;
+        }
+
+        if (disasm_start_fp && disasm_start_vp) {
+          // Found all we needed.
+          break;
+        }
+      }
+      search_offset = p - search_start;
+      ++search_offset;
+    }
+    if (disasm_start_vp) {
+      disasm_vp = std::string(disasm_start_vp);
+
+      // For some reason there's question marks all over the code.
+      disasm_vp.erase(std::remove(disasm_vp.begin(), disasm_vp.end(), '?'),
+                      disasm_vp.end());
+    } else {
+      disasm_vp = std::string("Shader disassembly not available.");
+    }
+
+    if (disasm_start_fp) {
+      disasm_fp = std::string(disasm_start_fp);
+
+      // For some reason there's question marks all over the code.
+      disasm_fp.erase(std::remove(disasm_fp.begin(), disasm_fp.end(), '?'),
+                      disasm_fp.end());
+    } else {
+      disasm_fp = std::string("Shader disassembly not available.");
+    }
+
+    XELOGI("%s\n=====================================\n%s", disasm_vp.c_str(),
+           disasm_fp.c_str());
+  }
+
+  vkDestroyPipelineCache(device_, dummy_pipeline_cache, nullptr);
+}
+
 VkShaderModule PipelineCache::GetGeometryShader(PrimitiveType primitive_type,
                                                 bool is_line_mode) {
   switch (primitive_type) {
@@ -396,22 +492,18 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer,
   viewport_state_dirty |= SetShadowRegister(&regs.pa_cl_vport_zscale,
                                             XE_GPU_REG_PA_CL_VPORT_ZSCALE);
   if (viewport_state_dirty) {
-    // HACK: no clue where to get these values.
     // RB_SURFACE_INFO
     auto surface_msaa =
        static_cast<MsaaSamples>((regs.rb_surface_info >> 16) & 0x3);
-    // TODO(benvanik): ??
-    // FIXME: Some games depend on these for proper clears (e.g. only clearing
-    // half the size they actually want with 4x MSAA), but others don't.
-    // Figure out how these games are expecting clears to be done.
+
+    // Apply a multiplier to emulate MSAA.
     float window_width_scalar = 1;
     float window_height_scalar = 1;
     switch (surface_msaa) {
       case MsaaSamples::k1X:
         break;
       case MsaaSamples::k2X:
-        // ??
-        window_width_scalar = window_height_scalar = 1.41421356f;
+        window_height_scalar = 2;
         break;
       case MsaaSamples::k4X:
         window_width_scalar = window_height_scalar = 2;
         break;
@@ -770,11 +862,13 @@ PipelineCache::UpdateStatus PipelineCache::UpdateVertexInputState(
                 : VK_FORMAT_A2R10G10B10_UNORM_PACK32;
         break;
       case VertexFormat::k_10_11_11:
-        // assert_always("unsupported?");
+        assert_true(is_signed);
         vertex_attrib_descr.format = VK_FORMAT_B10G11R11_UFLOAT_PACK32;
         break;
       case VertexFormat::k_11_11_10:
-        assert_true(is_signed);
+        // Converted in-shader.
+        // TODO(DrChat)
+        // vertex_attrib_descr.format = VK_FORMAT_R32_UINT;
         vertex_attrib_descr.format = VK_FORMAT_B10G11R11_UFLOAT_PACK32;
         break;
       case VertexFormat::k_16_16:
@@ -946,6 +1040,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState(
                              XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL);
   dirty |= SetShadowRegister(&regs.pa_sc_screen_scissor_br,
                              XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR);
+  dirty |= SetShadowRegister(&regs.pa_sc_viz_query, XE_GPU_REG_PA_SC_VIZ_QUERY);
   dirty |= SetShadowRegister(&regs.multi_prim_ib_reset_index,
                              XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX);
   dirty |= SetShadowRegister(&regs.rb_modecontrol, XE_GPU_REG_RB_MODECONTROL);
@@ -964,12 +1059,14 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState(
 
   // Discard rasterizer output in depth-only mode.
   // TODO(DrChat): Figure out how to make this work properly.
-  /*
   auto enable_mode = static_cast<xenos::ModeControl>(regs.rb_modecontrol & 0x7);
   state_info.rasterizerDiscardEnable =
       enable_mode == xenos::ModeControl::kColorDepth ? VK_FALSE : VK_TRUE;
-  //*/
-  state_info.rasterizerDiscardEnable = VK_FALSE;
+
+  // KILL_PIX_POST_EARLY_Z
+  if (regs.pa_sc_viz_query & 0x80) {
+    state_info.rasterizerDiscardEnable = VK_TRUE;
+  }
 
   bool poly_mode = ((regs.pa_su_sc_mode_cntl >> 3) & 0x3) != 0;
   if (poly_mode) {
@@ -1039,27 +1136,31 @@ PipelineCache::UpdateStatus PipelineCache::UpdateMultisampleState() {
   state_info.pNext = nullptr;
   state_info.flags = 0;
 
-  // PA_SC_AA_CONFIG MSAA_NUM_SAMPLES
-  // PA_SU_SC_MODE_CNTL MSAA_ENABLE
-  // state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT;
-  //*
-  auto msaa_num_samples =
-      static_cast<MsaaSamples>((regs.rb_surface_info >> 16) & 0x3);
-  switch (msaa_num_samples) {
-    case MsaaSamples::k1X:
-      state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT;
-      break;
-    case MsaaSamples::k2X:
-      state_info.rasterizationSamples = VK_SAMPLE_COUNT_2_BIT;
-      break;
-    case MsaaSamples::k4X:
-      state_info.rasterizationSamples = VK_SAMPLE_COUNT_4_BIT;
-      break;
-    default:
-      assert_unhandled_case(msaa_num_samples);
-      break;
+  // PA_SC_AA_CONFIG MSAA_NUM_SAMPLES (0x7)
+  // PA_SC_AA_MASK (0xFFFF)
+  // PA_SU_SC_MODE_CNTL MSAA_ENABLE (0x10000)
+  // If set, all samples will be sampled at set locations. Otherwise, they're
+  // all sampled from the pixel center.
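+  // With native MSAA disabled, pipelines are left at one sample and MSAA is
+  // emulated by scaling the viewport instead (see the window width/height
+  // scalars in SetDynamicState above), stretching pixels the way EDRAM tiles
+  // are stretched on hardware.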
+  if (FLAGS_vulkan_native_msaa) {
+    auto msaa_num_samples =
+        static_cast<MsaaSamples>((regs.rb_surface_info >> 16) & 0x3);
+    switch (msaa_num_samples) {
+      case MsaaSamples::k1X:
+        state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT;
+        break;
+      case MsaaSamples::k2X:
+        state_info.rasterizationSamples = VK_SAMPLE_COUNT_2_BIT;
+        break;
+      case MsaaSamples::k4X:
+        state_info.rasterizationSamples = VK_SAMPLE_COUNT_4_BIT;
+        break;
+      default:
+        assert_unhandled_case(msaa_num_samples);
+        break;
+    }
+  } else {
+    state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT;
   }
-  //*/
 
   state_info.sampleShadingEnable = VK_FALSE;
   state_info.minSampleShading = 0;
diff --git a/src/xenia/gpu/vulkan/pipeline_cache.h b/src/xenia/gpu/vulkan/pipeline_cache.h
index f240b9c0d..e5645f638 100644
--- a/src/xenia/gpu/vulkan/pipeline_cache.h
+++ b/src/xenia/gpu/vulkan/pipeline_cache.h
@@ -75,6 +75,8 @@ class PipelineCache {
   // state.
   VkPipeline GetPipeline(const RenderState* render_state, uint64_t hash_key);
 
+  void DumpShaderDisasmNV(const VkGraphicsPipelineCreateInfo& info);
+
   // Gets a geometry shader used to emulate the given primitive type.
   // Returns nullptr if the primitive doesn't need to be emulated.
   VkShaderModule GetGeometryShader(PrimitiveType primitive_type,
@@ -210,6 +212,7 @@ class PipelineCache {
     uint32_t pa_su_sc_mode_cntl;
     uint32_t pa_sc_screen_scissor_tl;
     uint32_t pa_sc_screen_scissor_br;
+    uint32_t pa_sc_viz_query;
     uint32_t multi_prim_ib_reset_index;
 
     uint32_t rb_modecontrol;

From f2af28c3228857912a1b8da5db7cca6029dc1d3f Mon Sep 17 00:00:00 2001
From: "Dr. Chat"
Date: Tue, 3 May 2016 14:10:15 -0500
Subject: [PATCH 50/77] TextureCache: Fix up some synchronization flaws
 (deleting in-use textures/etc) Fix texture binding IDs not matching fetch
 instruction IDs. Fix some bad texture format matching. Add access watches

---
 src/xenia/gpu/vulkan/texture_cache.cc | 362 ++++++++++++++++----------
 src/xenia/gpu/vulkan/texture_cache.h  |  23 +-
 2 files changed, 245 insertions(+), 140 deletions(-)

diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc
index 0deddf36d..ee82cb74a 100644
--- a/src/xenia/gpu/vulkan/texture_cache.cc
+++ b/src/xenia/gpu/vulkan/texture_cache.cc
@@ -50,9 +50,9 @@ static const TextureConfig texture_configs[64] = {
     {TextureFormat::k_4_4_4_4, VK_FORMAT_R4G4B4A4_UNORM_PACK16},
     {TextureFormat::k_10_11_11, VK_FORMAT_B10G11R11_UFLOAT_PACK32},  // ?
     {TextureFormat::k_11_11_10, VK_FORMAT_B10G11R11_UFLOAT_PACK32},  // ?
-    {TextureFormat::k_DXT1, VK_FORMAT_BC1_RGBA_SRGB_BLOCK},          // ?
-    {TextureFormat::k_DXT2_3, VK_FORMAT_BC3_SRGB_BLOCK},             // ?
-    {TextureFormat::k_DXT4_5, VK_FORMAT_BC5_UNORM_BLOCK},            // ?
+    {TextureFormat::k_DXT1, VK_FORMAT_BC1_RGBA_SRGB_BLOCK},
+    {TextureFormat::k_DXT2_3, VK_FORMAT_BC2_SRGB_BLOCK},
+    {TextureFormat::k_DXT4_5, VK_FORMAT_BC3_SRGB_BLOCK},
     {TextureFormat::kUnknown, VK_FORMAT_UNDEFINED},
     {TextureFormat::k_24_8, VK_FORMAT_D24_UNORM_S8_UINT},
     {TextureFormat::k_24_8_FLOAT, VK_FORMAT_D24_UNORM_S8_UINT},  // ?
@@ -81,14 +81,13 @@ static const TextureConfig texture_configs[64] = {
     {TextureFormat::k_16_INTERLACED, VK_FORMAT_UNDEFINED},
     {TextureFormat::k_16_MPEG_INTERLACED, VK_FORMAT_UNDEFINED},
     {TextureFormat::k_16_16_MPEG_INTERLACED, VK_FORMAT_UNDEFINED},
-    {TextureFormat::k_DXN, VK_FORMAT_UNDEFINED /* GL_COMPRESSED_RG_RGTC2 */},
+
+    // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
+    {TextureFormat::k_DXN, VK_FORMAT_BC5_UNORM_BLOCK},  // ?
     {TextureFormat::k_8_8_8_8_AS_16_16_16_16, VK_FORMAT_R8G8B8A8_UNORM},
-    {TextureFormat::k_DXT1_AS_16_16_16_16,
-     VK_FORMAT_UNDEFINED /* GL_COMPRESSED_RGB_S3TC_DXT1_EXT */},
-    {TextureFormat::k_DXT2_3_AS_16_16_16_16,
-     VK_FORMAT_UNDEFINED /* GL_COMPRESSED_RGBA_S3TC_DXT3_EXT */},
-    {TextureFormat::k_DXT4_5_AS_16_16_16_16,
-     VK_FORMAT_UNDEFINED /* GL_COMPRESSED_RGBA_S3TC_DXT5_EXT */},
+    {TextureFormat::k_DXT1_AS_16_16_16_16, VK_FORMAT_BC1_RGB_SRGB_BLOCK},
+    {TextureFormat::k_DXT2_3_AS_16_16_16_16, VK_FORMAT_BC2_SRGB_BLOCK},
+    {TextureFormat::k_DXT4_5_AS_16_16_16_16, VK_FORMAT_BC3_SRGB_BLOCK},
     {TextureFormat::k_2_10_10_10_AS_16_16_16_16,
      VK_FORMAT_A2R10G10B10_UNORM_PACK32},
     {TextureFormat::k_10_11_11_AS_16_16_16_16,
@@ -96,10 +95,8 @@ static const TextureConfig texture_configs[64] = {
     {TextureFormat::k_11_11_10_AS_16_16_16_16,
      VK_FORMAT_B10G11R11_UFLOAT_PACK32},  // ?
     {TextureFormat::k_32_32_32_FLOAT, VK_FORMAT_R32G32B32_SFLOAT},
-    {TextureFormat::k_DXT3A,
-     VK_FORMAT_UNDEFINED /* GL_COMPRESSED_RGBA_S3TC_DXT3_EXT */},
-    {TextureFormat::k_DXT5A,
-     VK_FORMAT_UNDEFINED /* GL_COMPRESSED_RGBA_S3TC_DXT5_EXT */},
+    {TextureFormat::k_DXT3A, VK_FORMAT_UNDEFINED},
+    {TextureFormat::k_DXT5A, VK_FORMAT_UNDEFINED},
     {TextureFormat::k_CTX1, VK_FORMAT_UNDEFINED},
     {TextureFormat::k_DXT3A_AS_1_1_1_1, VK_FORMAT_UNDEFINED},
     {TextureFormat::kUnknown, VK_FORMAT_UNDEFINED},
@@ -120,10 +117,10 @@ TextureCache::TextureCache(Memory* memory, RegisterFile* register_file,
   descriptor_pool_info.pNext = nullptr;
   descriptor_pool_info.flags =
       VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
-  descriptor_pool_info.maxSets = 4096;
+  descriptor_pool_info.maxSets = 8192;
   VkDescriptorPoolSize pool_sizes[1];
   pool_sizes[0].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
-  pool_sizes[0].descriptorCount = 4096;
+  pool_sizes[0].descriptorCount = 8192;
   descriptor_pool_info.poolSizeCount = 1;
   descriptor_pool_info.pPoolSizes = pool_sizes;
   auto err = vkCreateDescriptorPool(*device_, &descriptor_pool_info, nullptr,
@@ -301,11 +298,21 @@ TextureCache::Texture* TextureCache::AllocateTexture(
 }
 
 bool TextureCache::FreeTexture(Texture* texture) {
+  if (texture->in_flight_fence->status() != VK_SUCCESS) {
+    // Texture still in flight.
+    return false;
+  }
+
   for (auto it = texture->views.begin(); it != texture->views.end();) {
     vkDestroyImageView(*device_, (*it)->view, nullptr);
     it = texture->views.erase(it);
   }
 
+  if (texture->access_watch_handle) {
+    memory_->CancelAccessWatch(texture->access_watch_handle);
+    texture->access_watch_handle = 0;
+  }
+
   vkDestroyImage(*device_, texture->image, nullptr);
   vkFreeMemory(*device_, texture->image_memory, nullptr);
   delete texture;
@@ -326,6 +333,25 @@ TextureCache::Texture* TextureCache::DemandResolveTexture(
   // No texture at this location. Make a new one.
   texture = AllocateTexture(texture_info);
   texture->is_full_texture = false;
+
+  // Setup an access watch. If this texture is touched, it is destroyed.
+  texture->access_watch_handle = memory_->AddPhysicalAccessWatch(
+      texture_info.guest_address, texture_info.input_length,
+      cpu::MMIOHandler::kWatchWrite,
+      [](void* context_ptr, void* data_ptr, uint32_t address) {
+        auto self = reinterpret_cast<TextureCache*>(context_ptr);
+        auto touched_texture = reinterpret_cast<Texture*>(data_ptr);
+        // Clear watch handle first so we don't redundantly
+        // remove.
+        touched_texture->access_watch_handle = 0;
+        touched_texture->pending_invalidation = true;
+        // Add to pending list so Scavenge will clean it up.
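+        // This callback fires from the memory access-watch handler, so no
+        // Vulkan teardown happens here; the texture is only queued and is
+        // actually freed later on the GPU thread in Scavenge().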
+        self->invalidated_resolve_textures_mutex_.lock();
+        self->invalidated_resolve_textures_.push_back(touched_texture);
+        self->invalidated_resolve_textures_mutex_.unlock();
+      },
+      this, texture);
+
   resolve_textures_.push_back(texture);
   return texture;
 }
@@ -337,6 +363,12 @@ TextureCache::Texture* TextureCache::Demand(
   auto texture_hash = texture_info.hash();
   for (auto it = textures_.find(texture_hash); it != textures_.end(); ++it) {
     if (it->second->texture_info == texture_info) {
+      if (it->second->pending_invalidation) {
+        // This texture has been invalidated!
+        Scavenge();
+        break;
+      }
+
       return it->second;
     }
   }
@@ -355,6 +387,25 @@ TextureCache::Texture* TextureCache::Demand(
       // Upgrade this texture to a full texture.
       texture->is_full_texture = true;
       texture->texture_info = texture_info;
+
+      memory_->CancelAccessWatch(texture->access_watch_handle);
+      texture->access_watch_handle = memory_->AddPhysicalAccessWatch(
+          texture_info.guest_address, texture_info.input_length,
+          cpu::MMIOHandler::kWatchWrite,
+          [](void* context_ptr, void* data_ptr, uint32_t address) {
+            auto self = reinterpret_cast<TextureCache*>(context_ptr);
+            auto touched_texture = reinterpret_cast<Texture*>(data_ptr);
+            // Clear watch handle first so we don't redundantly
+            // remove.
+            touched_texture->access_watch_handle = 0;
+            touched_texture->pending_invalidation = true;
+            // Add to pending list so Scavenge will clean it up.
+            self->invalidated_textures_mutex_.lock();
+            self->invalidated_textures_->push_back(touched_texture);
+            self->invalidated_textures_mutex_.unlock();
+          },
+          this, texture);
+
       textures_[texture_hash] = *it;
       it = resolve_textures_.erase(it);
       return textures_[texture_hash];
@@ -367,6 +418,11 @@ TextureCache::Texture* TextureCache::Demand(
     return nullptr;
   }
 
+  if (texture_info.dimension != Dimension::k2D) {
+    // Abort.
+    return nullptr;
+  }
+
   // Create a new texture and cache it.
   auto texture = AllocateTexture(texture_info);
   if (!texture) {
@@ -388,31 +444,25 @@ TextureCache::Texture* TextureCache::Demand(
 
   if (!uploaded) {
     // TODO: Destroy the texture.
-    assert_always();
+    FreeTexture(texture);
     return nullptr;
   }
 
   // Copy in overlapping resolve textures.
-  /*
-  for (auto it = resolve_textures_.begin(); it != resolve_textures_.end();
-       ++it) {
-    auto texture = (*it);
-    if (texture_info.guest_address == texture->texture_info.guest_address &&
-        texture_info.size_2d.logical_width ==
-            texture->texture_info.size_2d.logical_width &&
-        texture_info.size_2d.logical_height ==
-            texture->texture_info.size_2d.logical_height) {
-      // Exact match.
-      // TODO: Lazy match (at an offset)
-      // Upgrade this texture to a full texture.
-      texture->is_full_texture = true;
-      texture->texture_info = texture_info;
-      textures_[texture_hash] = *it;
-      it = resolve_textures_.erase(it);
-      return textures_[texture_hash];
+  // FIXME: RDR appears to take textures from small chunks of a resolve texture?
+  if (texture_info.dimension == Dimension::k2D) {
+    for (auto it = resolve_textures_.begin(); it != resolve_textures_.end();
+         ++it) {
+      auto texture = (*it);
+      if (texture_info.guest_address >= texture->texture_info.guest_address &&
+          texture_info.guest_address < texture->texture_info.guest_address +
+                                           texture->texture_info.input_length) {
+        // Lazy matched a resolve texture. Copy it in and destroy it.
+        // Future resolves will just copy directly into this texture.
+        // assert_always();
+      }
     }
   }
-  */
 
   // Though we didn't find an exact match, that doesn't mean we're out of the
   // woods yet.
This texture could either be a portion of another texture or @@ -594,8 +644,36 @@ TextureCache::Sampler* TextureCache::Demand(const SamplerInfo& sampler_info) { address_mode_map[static_cast(sampler_info.clamp_w)]; sampler_create_info.mipLodBias = 0.0f; - sampler_create_info.anisotropyEnable = VK_FALSE; - sampler_create_info.maxAnisotropy = 1.0f; + + float aniso = 0.f; + switch (sampler_info.aniso_filter) { + case AnisoFilter::kDisabled: + aniso = 1.0f; + break; + case AnisoFilter::kMax_1_1: + aniso = 1.0f; + break; + case AnisoFilter::kMax_2_1: + aniso = 2.0f; + break; + case AnisoFilter::kMax_4_1: + aniso = 4.0f; + break; + case AnisoFilter::kMax_8_1: + aniso = 8.0f; + break; + case AnisoFilter::kMax_16_1: + aniso = 16.0f; + break; + default: + assert_unhandled_case(aniso); + return nullptr; + } + + sampler_create_info.anisotropyEnable = + sampler_info.aniso_filter != AnisoFilter::kDisabled ? VK_TRUE : VK_FALSE; + sampler_create_info.maxAnisotropy = aniso; + sampler_create_info.compareEnable = VK_FALSE; sampler_create_info.compareOp = VK_COMPARE_OP_NEVER; sampler_create_info.minLod = 0.0f; @@ -758,7 +836,6 @@ bool TextureCache::UploadTexture2D( uint32_t offset_x; uint32_t offset_y; TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); - auto bpp = (bytes_per_block >> 2) + ((bytes_per_block >> 1) >> (bytes_per_block >> 2)); for (uint32_t y = 0, output_base_offset = 0; @@ -783,6 +860,7 @@ bool TextureCache::UploadTexture2D( // Insert a memory barrier into the command buffer to ensure the upload has // finished before we copy it into the destination texture. + /* VkBufferMemoryBarrier upload_barrier = { VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, NULL, @@ -797,6 +875,7 @@ bool TextureCache::UploadTexture2D( vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, &upload_barrier, 0, nullptr); + //*/ // Transition the texture into a transfer destination layout. VkImageMemoryBarrier barrier; @@ -805,7 +884,7 @@ bool TextureCache::UploadTexture2D( barrier.srcAccessMask = 0; barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT; - barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + barrier.oldLayout = dest->image_layout; barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; @@ -849,10 +928,7 @@ VkDescriptorSet TextureCache::PrepareTextureSet( // Clear state. auto update_set_info = &update_set_info_; update_set_info->has_setup_fetch_mask = 0; - update_set_info->image_1d_write_count = 0; - update_set_info->image_2d_write_count = 0; - update_set_info->image_3d_write_count = 0; - update_set_info->image_cube_write_count = 0; + update_set_info->image_write_count = 0; std::memset(update_set_info, 0, sizeof(update_set_info_)); @@ -885,60 +961,75 @@ VkDescriptorSet TextureCache::PrepareTextureSet( // Write all updated descriptors. // TODO(benvanik): optimize? split into multiple sets? set per type? - VkWriteDescriptorSet descriptor_writes[4]; - std::memset(descriptor_writes, 0, sizeof(descriptor_writes)); - uint32_t descriptor_write_count = 0; - // FIXME: These are not be lined up properly with tf binding points!!!!! 
-  if (update_set_info->image_1d_write_count) {
-    auto& image_write = descriptor_writes[descriptor_write_count++];
-    image_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
-    image_write.pNext = nullptr;
-    image_write.dstSet = descriptor_set;
-    image_write.dstBinding = 0;
-    image_write.dstArrayElement = 0;
-    image_write.descriptorCount = update_set_info->image_1d_write_count;
-    image_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
-    image_write.pImageInfo = update_set_info->image_1d_infos;
-  }
-  if (update_set_info->image_2d_write_count) {
-    auto& image_write = descriptor_writes[descriptor_write_count++];
-    image_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
-    image_write.pNext = nullptr;
-    image_write.dstSet = descriptor_set;
-    image_write.dstBinding = 1;
-    image_write.dstArrayElement = 0;
-    image_write.descriptorCount = update_set_info->image_2d_write_count;
-    image_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
-    image_write.pImageInfo = update_set_info->image_2d_infos;
-  }
-  if (update_set_info->image_3d_write_count) {
-    auto& image_write = descriptor_writes[descriptor_write_count++];
-    image_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
-    image_write.pNext = nullptr;
-    image_write.dstSet = descriptor_set;
-    image_write.dstBinding = 2;
-    image_write.dstArrayElement = 0;
-    image_write.descriptorCount = update_set_info->image_3d_write_count;
-    image_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
-    image_write.pImageInfo = update_set_info->image_3d_infos;
-  }
-  if (update_set_info->image_cube_write_count) {
-    auto& image_write = descriptor_writes[descriptor_write_count++];
-    image_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
-    image_write.pNext = nullptr;
-    image_write.dstSet = descriptor_set;
-    image_write.dstBinding = 3;
-    image_write.dstArrayElement = 0;
-    image_write.descriptorCount = update_set_info->image_cube_write_count;
-    image_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
-    image_write.pImageInfo = update_set_info->image_cube_infos;
-  }
-  if (descriptor_write_count) {
-    vkUpdateDescriptorSets(*device_, descriptor_write_count, descriptor_writes,
-                           0, nullptr);
+  // First: Reorganize and pool image update infos.
+  struct DescriptorInfo {
+    Dimension dimension;
+    uint32_t tf_binding_base;
+    std::vector<VkDescriptorImageInfo> infos;
+  };
+
+  std::vector<DescriptorInfo> descriptor_update_infos;
+  for (uint32_t i = 0; i < update_set_info->image_write_count; i++) {
+    auto& image_info = update_set_info->image_infos[i];
+    if (descriptor_update_infos.size() > 0) {
+      // Check last write to see if we can pool more into it.
+      DescriptorInfo& last_write =
+          descriptor_update_infos[descriptor_update_infos.size() - 1];
+      if (last_write.dimension == image_info.dimension &&
+          last_write.tf_binding_base + last_write.infos.size() ==
+              image_info.tf_binding) {
+        // Compatible! Pool into it.
+        last_write.infos.push_back(image_info.info);
+        continue;
+      }
+    }
+
+    // Push a new descriptor write entry.
+    DescriptorInfo desc_info;
+    desc_info.dimension = image_info.dimension;
+    desc_info.tf_binding_base = image_info.tf_binding;
+    desc_info.infos.push_back(image_info.info);
+    descriptor_update_infos.push_back(desc_info);
   }
 
-  in_flight_sets_.push_back({descriptor_set, completion_fence});
+  // Finalize the writes so they're consumable by Vulkan.
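+  // Each pooled run becomes one VkWriteDescriptorSet below: dstBinding
+  // selects the dimension's binding point (0 = 1D, 1 = 2D, 2 = 3D, 3 = cube)
+  // and dstArrayElement is the run's first fetch constant, so array elements
+  // line up with the fetch constant indices the shaders use.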
+  std::vector<VkWriteDescriptorSet> descriptor_writes;
+  descriptor_writes.resize(descriptor_update_infos.size());
+  for (size_t i = 0; i < descriptor_update_infos.size(); i++) {
+    auto& update_info = descriptor_update_infos[i];
+    auto& write_info = descriptor_writes[i];
+    std::memset(&write_info, 0, sizeof(VkWriteDescriptorSet));
+
+    write_info.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    write_info.dstSet = descriptor_set;
+
+    switch (update_info.dimension) {
+      case Dimension::k1D:
+        write_info.dstBinding = 0;
+        break;
+      case Dimension::k2D:
+        write_info.dstBinding = 1;
+        break;
+      case Dimension::k3D:
+        write_info.dstBinding = 2;
+        break;
+      case Dimension::kCube:
+        write_info.dstBinding = 3;
+        break;
+    }
+
+    write_info.dstArrayElement = update_info.tf_binding_base;
+    write_info.descriptorCount = uint32_t(update_info.infos.size());
+    write_info.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+    write_info.pImageInfo = update_info.infos.data();
+  }
+
+  if (descriptor_writes.size() > 0) {
+    vkUpdateDescriptorSets(*device_, uint32_t(descriptor_writes.size()),
+                           descriptor_writes.data(), 0, nullptr);
+  }
+
+  in_flight_sets_[descriptor_set] = completion_fence;
 
   return descriptor_set;
 }
@@ -991,7 +1082,7 @@ bool TextureCache::SetupTextureBinding(
   auto texture = Demand(texture_info, command_buffer, completion_fence);
   auto sampler = Demand(sampler_info);
-  assert_true(texture != nullptr && sampler != nullptr);
+  // assert_true(texture != nullptr && sampler != nullptr);
   if (texture == nullptr || sampler == nullptr) {
     return false;
   }
@@ -1002,35 +1093,14 @@ bool TextureCache::SetupTextureBinding(
   trace_writer_->WriteMemoryRead(texture_info.guest_address,
                                  texture_info.input_length);
 
-  VkDescriptorImageInfo* image_write = nullptr;
-  switch (texture_info.dimension) {
-    case Dimension::k1D:
-      image_write =
-          &update_set_info
-               ->image_1d_infos[update_set_info->image_1d_write_count++];
-      break;
-    case Dimension::k2D:
-      image_write =
-          &update_set_info
-               ->image_2d_infos[update_set_info->image_2d_write_count++];
-      break;
-    case Dimension::k3D:
-      image_write =
-          &update_set_info
-               ->image_3d_infos[update_set_info->image_3d_write_count++];
-      break;
-    case Dimension::kCube:
-      image_write =
-          &update_set_info
-               ->image_cube_infos[update_set_info->image_cube_write_count++];
-      break;
-    default:
-      assert_unhandled_case(texture_info.dimension);
-      return false;
-  }
-  image_write->imageView = view->view;
-  image_write->imageLayout = texture->image_layout;
-  image_write->sampler = sampler->sampler;
+  auto image_write =
+      &update_set_info->image_infos[update_set_info->image_write_count++];
+  image_write->dimension = texture_info.dimension;
+  image_write->tf_binding = binding.fetch_constant;
+  image_write->info.imageView = view->view;
+  image_write->info.imageLayout = texture->image_layout;
+  image_write->info.sampler = sampler->sampler;
+
+  texture->in_flight_fence = completion_fence;
 
   return true;
 }
@@ -1054,6 +1124,18 @@ void TextureCache::Scavenge() {
 
   staging_buffer_.Scavenge();
 
+  // Kill all pending delete textures.
+  if (!pending_delete_textures_.empty()) {
+    for (auto it = pending_delete_textures_.begin();
+         it != pending_delete_textures_.end();) {
+      if (!FreeTexture(*it)) {
+        break;
+      }
+
+      it = pending_delete_textures_.erase(it);
+    }
+  }
+
   // Clean up any invalidated textures.
   invalidated_textures_mutex_.lock();
   std::vector<Texture*>& invalidated_textures = *invalidated_textures_;
@@ -1063,15 +1145,33 @@ void TextureCache::Scavenge() {
     invalidated_textures_ = &invalidated_textures_sets_[0];
   }
   invalidated_textures_mutex_.unlock();
-  if (invalidated_textures.empty()) {
-    return;
+  if (!invalidated_textures.empty()) {
+    for (auto it = invalidated_textures.begin();
+         it != invalidated_textures.end(); ++it) {
+      if (!FreeTexture(*it)) {
+        // Texture wasn't deleted because it's still in use.
+        pending_delete_textures_.push_back(*it);
+      }
+
+      textures_.erase((*it)->texture_info.hash());
+    }
+
+    invalidated_textures.clear();
   }
 
-  for (auto& texture : invalidated_textures) {
-    textures_.erase(texture->texture_info.hash());
-    FreeTexture(texture);
+  invalidated_resolve_textures_mutex_.lock();
+  if (!invalidated_resolve_textures_.empty()) {
+    for (auto it = invalidated_resolve_textures_.begin();
+         it != invalidated_resolve_textures_.end(); ++it) {
+      if (!FreeTexture(*it)) {
+        // Texture wasn't deleted because it's still in use.
+        pending_delete_textures_.push_back(*it);
+      }
+    }
+
+    invalidated_resolve_textures_.clear();
   }
-  invalidated_textures.clear();
+  invalidated_resolve_textures_mutex_.unlock();
 }
 
 }  // namespace vulkan
diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h
index b564fcc48..a78be6ed6 100644
--- a/src/xenia/gpu/vulkan/texture_cache.h
+++ b/src/xenia/gpu/vulkan/texture_cache.h
@@ -50,6 +50,9 @@ class TextureCache {
 
     uintptr_t access_watch_handle;
     bool pending_invalidation;
+
+    // Pointer to the latest usage fence.
+    std::shared_ptr<ui::vulkan::Fence> in_flight_fence;
   };
 
   struct TextureView {
@@ -168,30 +171,32 @@ class TextureCache {
 
   VkDescriptorPool descriptor_pool_ = nullptr;
   VkDescriptorSetLayout texture_descriptor_set_layout_ = nullptr;
-  std::vector<std::pair<VkDescriptorSet, std::shared_ptr<ui::vulkan::Fence>>>
+  std::unordered_map<VkDescriptorSet, std::shared_ptr<ui::vulkan::Fence>>
       in_flight_sets_;
 
   ui::vulkan::CircularBuffer staging_buffer_;
   std::unordered_map<uint64_t, Texture*> textures_;
   std::unordered_map<uint64_t, Sampler*> samplers_;
   std::vector<Texture*> resolve_textures_;
+  std::vector<Texture*> pending_delete_textures_;
 
   std::mutex invalidated_textures_mutex_;
   std::vector<Texture*>* invalidated_textures_;
   std::vector<Texture*> invalidated_textures_sets_[2];
 
+  std::mutex invalidated_resolve_textures_mutex_;
+  std::vector<Texture*> invalidated_resolve_textures_;
+
   struct UpdateSetInfo {
     // Bitmap of all 32 fetch constants and whether they have been setup yet.
     // This prevents duplication across the vertex and pixel shader.
     uint32_t has_setup_fetch_mask;
-    uint32_t image_1d_write_count = 0;
-    VkDescriptorImageInfo image_1d_infos[32];
-    uint32_t image_2d_write_count = 0;
-    VkDescriptorImageInfo image_2d_infos[32];
-    uint32_t image_3d_write_count = 0;
-    VkDescriptorImageInfo image_3d_infos[32];
-    uint32_t image_cube_write_count = 0;
-    VkDescriptorImageInfo image_cube_infos[32];
+    uint32_t image_write_count = 0;
+    struct ImageSetInfo {
+      Dimension dimension;
+      uint32_t tf_binding;
+      VkDescriptorImageInfo info;
+    } image_infos[32];
   } update_set_info_;
 };
 
From 7c5042add71ae6cc672ce6b5557ee3ef8636730c Mon Sep 17 00:00:00 2001
From: "Dr. Chat"
Chat" Date: Tue, 3 May 2016 14:12:05 -0500 Subject: [PATCH 51/77] Vulkan CP: Add in separate swap-chain images Some other changes I can't remember --- .../gpu/vulkan/vulkan_command_processor.cc | 330 +++++++++++++----- .../gpu/vulkan/vulkan_command_processor.h | 14 +- .../gpu/vulkan/vulkan_graphics_system.cc | 22 +- 3 files changed, 278 insertions(+), 88 deletions(-) diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index fd604733b..011c5b878 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -29,7 +29,7 @@ namespace vulkan { using namespace xe::gpu::xenos; using xe::ui::vulkan::CheckResult; -constexpr size_t kDefaultBufferCacheCapacity = 256 * 1024 * 1024; +constexpr size_t kDefaultBufferCacheCapacity = 128 * 1024 * 1024; VulkanCommandProcessor::VulkanCommandProcessor( VulkanGraphicsSystem* graphics_system, kernel::KernelState* kernel_state) @@ -82,6 +82,11 @@ bool VulkanCommandProcessor::SetupContext() { void VulkanCommandProcessor::ShutdownContext() { // TODO(benvanik): wait until idle. + if (swap_state_.front_buffer_texture) { + // Free swap chain images. + DestroySwapImages(); + } + buffer_cache_.reset(); pipeline_cache_.reset(); render_cache_.reset(); @@ -131,59 +136,214 @@ void VulkanCommandProcessor::ReturnFromWait() { CommandProcessor::ReturnFromWait(); } +void VulkanCommandProcessor::CreateSwapImages(VkCommandBuffer setup_buffer, + VkExtent2D extents) { + VkImageCreateInfo image_info; + std::memset(&image_info, 0, sizeof(VkImageCreateInfo)); + image_info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; + image_info.imageType = VK_IMAGE_TYPE_2D; + image_info.format = VK_FORMAT_R8G8B8A8_UNORM; + image_info.extent = {extents.width, extents.height, 1}; + image_info.mipLevels = 1; + image_info.arrayLayers = 1; + image_info.samples = VK_SAMPLE_COUNT_1_BIT; + image_info.tiling = VK_IMAGE_TILING_OPTIMAL; + image_info.usage = + VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT; + image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + image_info.queueFamilyIndexCount = 0; + image_info.pQueueFamilyIndices = nullptr; + image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + + VkImage image_fb, image_bb; + auto status = vkCreateImage(*device_, &image_info, nullptr, &image_fb); + CheckResult(status, "vkCreateImage"); + + status = vkCreateImage(*device_, &image_info, nullptr, &image_bb); + CheckResult(status, "vkCreateImage"); + + // Bind memory to images. + VkMemoryRequirements mem_requirements; + vkGetImageMemoryRequirements(*device_, image_fb, &mem_requirements); + fb_memory = device_->AllocateMemory(mem_requirements, 0); + assert_not_null(fb_memory); + + status = vkBindImageMemory(*device_, image_fb, fb_memory, 0); + CheckResult(status, "vkBindImageMemory"); + + vkGetImageMemoryRequirements(*device_, image_fb, &mem_requirements); + bb_memory = device_->AllocateMemory(mem_requirements, 0); + assert_not_null(bb_memory); + + status = vkBindImageMemory(*device_, image_bb, bb_memory, 0); + CheckResult(status, "vkBindImageMemory"); + + std::lock_guard lock(swap_state_.mutex); + swap_state_.front_buffer_texture = reinterpret_cast(image_fb); + swap_state_.back_buffer_texture = reinterpret_cast(image_bb); + + // Transition both images to general layout. 
+  VkImageMemoryBarrier barrier;
+  std::memset(&barrier, 0, sizeof(VkImageMemoryBarrier));
+  barrier.srcAccessMask = 0;
+  barrier.dstAccessMask = 0;
+  barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+  barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL;
+  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  barrier.image = image_fb;
+  barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};
+
+  vkCmdPipelineBarrier(setup_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
+                       nullptr, 1, &barrier);
+
+  barrier.image = image_bb;
+
+  vkCmdPipelineBarrier(setup_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
+                       nullptr, 1, &barrier);
+}
+
+void VulkanCommandProcessor::DestroySwapImages() {
+  std::lock_guard<std::mutex> lock(swap_state_.mutex);
+  vkDestroyImage(*device_,
+                 reinterpret_cast<VkImage>(swap_state_.front_buffer_texture),
+                 nullptr);
+  vkDestroyImage(*device_,
+                 reinterpret_cast<VkImage>(swap_state_.back_buffer_texture),
+                 nullptr);
+  vkFreeMemory(*device_, fb_memory, nullptr);
+  vkFreeMemory(*device_, bb_memory, nullptr);
+
+  swap_state_.front_buffer_texture = 0;
+  swap_state_.back_buffer_texture = 0;
+  fb_memory = nullptr;
+  bb_memory = nullptr;
+}
+
 void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr,
                                          uint32_t frontbuffer_width,
                                          uint32_t frontbuffer_height) {
   SCOPE_profile_cpu_f("gpu");
 
+  // Build a final command buffer that copies the game's frontbuffer texture
+  // into our backbuffer texture.
+  VkCommandBuffer copy_commands = nullptr;
+  bool opened_batch;
+  if (command_buffer_pool_->has_open_batch()) {
+    copy_commands = command_buffer_pool_->AcquireEntry();
+    opened_batch = false;
+  } else {
+    command_buffer_pool_->BeginBatch();
+    copy_commands = command_buffer_pool_->AcquireEntry();
+    current_batch_fence_.reset(new ui::vulkan::Fence(*device_));
+    opened_batch = true;
+  }
+
+  VkCommandBufferBeginInfo begin_info;
+  std::memset(&begin_info, 0, sizeof(begin_info));
+  begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+  begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+  auto status = vkBeginCommandBuffer(copy_commands, &begin_info);
+  CheckResult(status, "vkBeginCommandBuffer");
+
+  if (!frontbuffer_ptr) {
+    // Trace viewer does this.
+    frontbuffer_ptr = last_copy_base_;
+  }
+
+  if (!swap_state_.back_buffer_texture) {
+    CreateSwapImages(copy_commands, {frontbuffer_width, frontbuffer_height});
+  }
+  auto swap_bb = reinterpret_cast<VkImage>(swap_state_.back_buffer_texture);
+
+  // Issue the commands to copy the game's frontbuffer to our backbuffer.
+  auto texture = texture_cache_->LookupAddress(
+      frontbuffer_ptr, xe::round_up(frontbuffer_width, 32),
+      xe::round_up(frontbuffer_height, 32), TextureFormat::k_8_8_8_8);
+  if (texture) {
+    texture->in_flight_fence = current_batch_fence_;
+
+    // Insert a barrier so the GPU finishes writing to the image.
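+    // oldLayout == newLayout here, so no transition is performed; this is
+    // purely an execution/memory dependency that makes prior color and
+    // transfer writes visible to the transfer read in the blit below.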
+    VkImageMemoryBarrier barrier;
+    std::memset(&barrier, 0, sizeof(VkImageMemoryBarrier));
+    barrier.srcAccessMask =
+        VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT;
+    barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
+    barrier.oldLayout = texture->image_layout;
+    barrier.newLayout = texture->image_layout;
+    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.image = texture->image;
+    barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};
+
+    vkCmdPipelineBarrier(copy_commands, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                         VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
+                         nullptr, 1, &barrier);
+
+    // Now issue a blit command.
+    VkImageBlit blit;
+    std::memset(&blit, 0, sizeof(VkImageBlit));
+    blit.srcSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
+    blit.srcOffsets[0] = {0, 0, 0};
+    blit.srcOffsets[1] = {int32_t(frontbuffer_width),
+                          int32_t(frontbuffer_height), 1};
+    blit.dstSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
+    blit.dstOffsets[0] = {0, 0, 0};
+    blit.dstOffsets[1] = {int32_t(frontbuffer_width),
+                          int32_t(frontbuffer_height), 1};
+
+    vkCmdBlitImage(copy_commands, texture->image, texture->image_layout,
+                   swap_bb, VK_IMAGE_LAYOUT_GENERAL, 1, &blit,
+                   VK_FILTER_LINEAR);
+
+    std::lock_guard<std::mutex> lock(swap_state_.mutex);
+    swap_state_.width = frontbuffer_width;
+    swap_state_.height = frontbuffer_height;
+  }
+
+  status = vkEndCommandBuffer(copy_commands);
+  CheckResult(status, "vkEndCommandBuffer");
+
   // Queue up current command buffers.
   // TODO(benvanik): bigger batches.
+  std::vector<VkCommandBuffer> submit_buffers;
   if (current_command_buffer_) {
     if (current_render_state_) {
       render_cache_->EndRenderPass();
       current_render_state_ = nullptr;
     }
 
-    auto status = vkEndCommandBuffer(current_command_buffer_);
+    status = vkEndCommandBuffer(current_command_buffer_);
     CheckResult(status, "vkEndCommandBuffer");
     status = vkEndCommandBuffer(current_setup_buffer_);
     CheckResult(status, "vkEndCommandBuffer");
-    command_buffer_pool_->EndBatch(*current_batch_fence_);
 
+    // TODO(DrChat): If the setup buffer is empty, don't bother queueing it up.
+    submit_buffers.push_back(current_setup_buffer_);
+    submit_buffers.push_back(current_command_buffer_);
+
+    current_command_buffer_ = nullptr;
+    current_setup_buffer_ = nullptr;
+  }
+
+  submit_buffers.push_back(copy_commands);
+  if (!submit_buffers.empty()) {
     // TODO(benvanik): move to CP or to host (trace dump, etc).
     // This only needs to surround a vkQueueSubmit.
     if (queue_mutex_) {
       queue_mutex_->lock();
     }
 
-    // TODO(DrChat): If setup buffer is empty, don't bother queueing it up.
-    VkCommandBuffer command_buffers[] = {
-        current_setup_buffer_, current_command_buffer_,
-    };
-
     VkSubmitInfo submit_info;
+    std::memset(&submit_info, 0, sizeof(VkSubmitInfo));
     submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
-    submit_info.pNext = nullptr;
-    submit_info.waitSemaphoreCount = 0;
-    submit_info.pWaitSemaphores = nullptr;
-    submit_info.commandBufferCount = 2;
-    submit_info.pCommandBuffers = command_buffers;
-    submit_info.signalSemaphoreCount = 0;
-    submit_info.pSignalSemaphores = nullptr;
-    if (queue_mutex_) {
-      // queue_mutex_->lock();
-    }
+    submit_info.commandBufferCount = uint32_t(submit_buffers.size());
+    submit_info.pCommandBuffers = submit_buffers.data();
     status = vkQueueSubmit(queue_, 1, &submit_info, *current_batch_fence_);
-    if (queue_mutex_) {
-      // queue_mutex_->unlock();
-    }
     CheckResult(status, "vkQueueSubmit");
 
-    // TODO(DrChat): Disable this completely.
- VkFence fences[] = {*current_batch_fence_}; - status = vkWaitForFences(*device_, 1, fences, true, -1); - CheckResult(status, "vkWaitForFences"); - if (device_->is_renderdoc_attached() && capturing_) { device_->EndRenderDocFrameCapture(); capturing_ = false; @@ -197,45 +357,28 @@ void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr, if (queue_mutex_) { queue_mutex_->unlock(); } + } - // Scavenging. - current_command_buffer_ = nullptr; - current_setup_buffer_ = nullptr; + command_buffer_pool_->EndBatch(current_batch_fence_); + + // TODO(DrChat): Remove this. + VkFence fences[] = { *current_batch_fence_ }; + vkWaitForFences(*device_, 1, fences, true, -1); + + // Scavenging. + { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_i( + "gpu", + "xe::gpu::vulkan::VulkanCommandProcessor::PerformSwap Scavenging"); +#endif // FINE_GRAINED_DRAW_SCOPES command_buffer_pool_->Scavenge(); texture_cache_->Scavenge(); - current_batch_fence_ = nullptr; - - // TODO: Remove this when we stop waiting on the queue. - buffer_cache_->ClearCache(); + buffer_cache_->Scavenge(); } - if (!frontbuffer_ptr) { - if (!last_copy_base_) { - // Nothing to draw. - return; - } - - // Trace viewer does this. - frontbuffer_ptr = last_copy_base_; - } - - auto texture = texture_cache_->LookupAddress( - frontbuffer_ptr, xe::round_up(frontbuffer_width, 32), - xe::round_up(frontbuffer_height, 32), TextureFormat::k_8_8_8_8); - // There shouldn't be a case where the texture is null. - assert_not_null(texture); - - if (texture) { - std::lock_guard lock(swap_state_.mutex); - swap_state_.width = frontbuffer_width; - swap_state_.height = frontbuffer_height; - swap_state_.back_buffer_texture = - reinterpret_cast(texture->image); - } - - // Remove any dead textures, etc. - texture_cache_->Scavenge(); + current_batch_fence_ = nullptr; } Shader* VulkanCommandProcessor::LoadShader(ShaderType shader_type, @@ -331,16 +474,7 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, started_command_buffer = true; } auto command_buffer = current_command_buffer_; - - // Upload and set descriptors for all textures. - // We do this outside of the render pass so the texture cache can upload and - // convert textures. - // Setup buffer may be flushed to GPU if the texture cache needs it. - auto samplers = - PopulateSamplers(current_setup_buffer_, vertex_shader, pixel_shader); - if (!samplers) { - return false; - } + auto setup_buffer = current_setup_buffer_; // Begin the render pass. // This will setup our framebuffer and begin the pass in the command buffer. @@ -362,6 +496,9 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, } } + // Update the render cache's tracking state. + render_cache_->UpdateState(); + // Configure the pipeline for drawing. // This encodes all render state (blend, depth, etc), our shader stages, // and our vertex input layout. @@ -373,6 +510,13 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, started_command_buffer) { vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); + } else if (pipeline_status == PipelineCache::UpdateStatus::kError) { + render_cache_->EndRenderPass(); + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; + return false; } pipeline_cache_->SetDynamicState(command_buffer, started_command_buffer); @@ -407,9 +551,17 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, } // Bind samplers/textures. 
- vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, - pipeline_cache_->pipeline_layout(), 1, 1, &samplers, - 0, nullptr); + // Uploads all textures that need it. + // Setup buffer may be flushed to GPU if the texture cache needs it. + if (!PopulateSamplers(command_buffer, setup_buffer, vertex_shader, + pixel_shader)) { + render_cache_->EndRenderPass(); + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; + return false; + } // Actually issue the draw. if (!index_buffer_info) { @@ -444,7 +596,7 @@ bool VulkanCommandProcessor::PopulateConstants(VkCommandBuffer command_buffer, // These are optional, and if none are defined 0 will be returned. auto constant_offsets = buffer_cache_->UploadConstantRegisters( vertex_shader->constant_register_map(), - pixel_shader->constant_register_map()); + pixel_shader->constant_register_map(), current_batch_fence_); if (constant_offsets.first == VK_WHOLE_SIZE || constant_offsets.second == VK_WHOLE_SIZE) { // Shader wants constants but we couldn't upload them. @@ -497,8 +649,8 @@ bool VulkanCommandProcessor::PopulateIndexBuffer( size_t source_length = info.count * (info.format == IndexFormat::kInt32 ? sizeof(uint32_t) : sizeof(uint16_t)); - auto buffer_ref = - buffer_cache_->UploadIndexBuffer(source_ptr, source_length, info.format); + auto buffer_ref = buffer_cache_->UploadIndexBuffer( + source_ptr, source_length, info.format, current_batch_fence_); if (buffer_ref.second == VK_WHOLE_SIZE) { // Failed to upload buffer. return false; @@ -523,6 +675,11 @@ bool VulkanCommandProcessor::PopulateVertexBuffers( #endif // FINE_GRAINED_DRAW_SCOPES auto& vertex_bindings = vertex_shader->vertex_bindings(); + if (vertex_bindings.empty()) { + // No bindings. + return true; + } + assert_true(vertex_bindings.size() <= 32); VkBuffer all_buffers[32]; VkDeviceSize all_buffer_offsets[32]; @@ -556,8 +713,8 @@ bool VulkanCommandProcessor::PopulateVertexBuffers( const void* source_ptr = memory_->TranslatePhysical(fetch->address << 2); size_t source_length = valid_range; - auto buffer_ref = - buffer_cache_->UploadVertexBuffer(source_ptr, source_length); + auto buffer_ref = buffer_cache_->UploadVertexBuffer( + source_ptr, source_length, current_batch_fence_); if (buffer_ref.second == VK_WHOLE_SIZE) { // Failed to upload buffer. return false; @@ -576,9 +733,9 @@ bool VulkanCommandProcessor::PopulateVertexBuffers( return true; } -VkDescriptorSet VulkanCommandProcessor::PopulateSamplers( - VkCommandBuffer command_buffer, VulkanShader* vertex_shader, - VulkanShader* pixel_shader) { +bool VulkanCommandProcessor::PopulateSamplers( + VkCommandBuffer command_buffer, VkCommandBuffer setup_buffer, + VulkanShader* vertex_shader, VulkanShader* pixel_shader) { #if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // FINE_GRAINED_DRAW_SCOPES @@ -588,10 +745,14 @@ VkDescriptorSet VulkanCommandProcessor::PopulateSamplers( pixel_shader->texture_bindings()); if (!descriptor_set) { // Unable to bind set. 
- return nullptr; + return false; } - return descriptor_set; + vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, + pipeline_cache_->pipeline_layout(), 1, 1, + &descriptor_set, 0, nullptr); + + return true; } bool VulkanCommandProcessor::IssueCopy() { @@ -760,6 +921,9 @@ bool VulkanCommandProcessor::IssueCopy() { tex_info.size_2d.input_pitch = copy_dest_pitch * 4; auto texture = texture_cache_->DemandResolveTexture( tex_info, ColorFormatToTextureFormat(copy_dest_format), nullptr); + assert_not_null(texture); + texture->in_flight_fence = current_batch_fence_; + if (texture->image_layout == VK_IMAGE_LAYOUT_UNDEFINED) { // Transition the image to a general layout. VkImageMemoryBarrier image_barrier; @@ -820,10 +984,12 @@ bool VulkanCommandProcessor::IssueCopy() { : static_cast(depth_format); switch (copy_command) { case CopyCommand::kRaw: + /* render_cache_->RawCopyToImage(command_buffer, edram_base, texture->image, texture->image_layout, copy_src_select <= 3, resolve_offset, resolve_extent); break; + */ case CopyCommand::kConvert: render_cache_->BlitToImage( command_buffer, edram_base, surface_pitch, resolve_extent.height, diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 287e4f65e..4a7788e09 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -61,6 +61,9 @@ class VulkanCommandProcessor : public CommandProcessor { void PrepareForWait() override; void ReturnFromWait() override; + void CreateSwapImages(VkCommandBuffer setup_buffer, VkExtent2D extents); + void DestroySwapImages(); + void PerformSwap(uint32_t frontbuffer_ptr, uint32_t frontbuffer_width, uint32_t frontbuffer_height) override; @@ -77,13 +80,18 @@ class VulkanCommandProcessor : public CommandProcessor { IndexBufferInfo* index_buffer_info); bool PopulateVertexBuffers(VkCommandBuffer command_buffer, VulkanShader* vertex_shader); - VkDescriptorSet PopulateSamplers(VkCommandBuffer command_buffer, - VulkanShader* vertex_shader, - VulkanShader* pixel_shader); + bool PopulateSamplers(VkCommandBuffer command_buffer, + VkCommandBuffer setup_buffer, + VulkanShader* vertex_shader, + VulkanShader* pixel_shader); bool IssueCopy() override; xe::ui::vulkan::VulkanDevice* device_ = nullptr; + // front buffer / back buffer memory + VkDeviceMemory fb_memory = nullptr; + VkDeviceMemory bb_memory = nullptr; + // TODO(benvanik): abstract behind context? // Queue used to submit work. This may be a dedicated queue for the command // processor and no locking will be required for use. 
If a dedicated queue
diff --git a/src/xenia/gpu/vulkan/vulkan_graphics_system.cc b/src/xenia/gpu/vulkan/vulkan_graphics_system.cc
index 27b2ff073..08c6120d7 100644
--- a/src/xenia/gpu/vulkan/vulkan_graphics_system.cc
+++ b/src/xenia/gpu/vulkan/vulkan_graphics_system.cc
@@ -76,6 +76,23 @@ void VulkanGraphicsSystem::Swap(xe::ui::UIEvent* e) {
 
   auto swap_chain = display_context_->swap_chain();
   auto copy_cmd_buffer = swap_chain->copy_cmd_buffer();
+  auto front_buffer =
+      reinterpret_cast<VkImage>(swap_state.front_buffer_texture);
+
+  VkImageMemoryBarrier barrier;
+  std::memset(&barrier, 0, sizeof(VkImageMemoryBarrier));
+  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+  barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+  barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
+  barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL;
+  barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL;
+  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  barrier.image = front_buffer;
+  barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};
+  vkCmdPipelineBarrier(copy_cmd_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
+                       nullptr, 1, &barrier);
 
   VkImageBlit region;
   region.srcSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
@@ -88,9 +105,8 @@ void VulkanGraphicsSystem::Swap(xe::ui::UIEvent* e) {
   region.dstOffsets[1] = {static_cast<int32_t>(swap_chain->surface_width()),
                           static_cast<int32_t>(swap_chain->surface_height()),
                           1};
-  vkCmdBlitImage(copy_cmd_buffer,
-                 reinterpret_cast<VkImage>(swap_state.front_buffer_texture),
-                 VK_IMAGE_LAYOUT_GENERAL, swap_chain->surface_image(),
+  vkCmdBlitImage(copy_cmd_buffer, front_buffer, VK_IMAGE_LAYOUT_GENERAL,
+                 swap_chain->surface_image(),
                  VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region,
                  VK_FILTER_LINEAR);
 }

From 79f1193130a3db33843ff546390333bf67e75b8c Mon Sep 17 00:00:00 2001
From: "Dr. Chat"
Chat" Date: Thu, 5 May 2016 23:42:36 -0500 Subject: [PATCH 52/77] Vulkan CP: Fix calculating an invalid copy destination base address when sizeof(texel) != 4 --- src/xenia/gpu/texture_info.h | 60 +++++++++++++++++++ .../gpu/vulkan/vulkan_command_processor.cc | 15 +++-- 2 files changed, 69 insertions(+), 6 deletions(-) diff --git a/src/xenia/gpu/texture_info.h b/src/xenia/gpu/texture_info.h index 500f22bb3..0cb2ed2ba 100644 --- a/src/xenia/gpu/texture_info.h +++ b/src/xenia/gpu/texture_info.h @@ -88,6 +88,66 @@ enum class TextureFormat : uint32_t { kUnknown = 0xFFFFFFFFu, }; +inline size_t GetTexelSize(TextureFormat format) { + switch (format) { + case TextureFormat::k_1_5_5_5: + return 2; + break; + case TextureFormat::k_2_10_10_10: + return 4; + break; + case TextureFormat::k_4_4_4_4: + return 2; + break; + case TextureFormat::k_5_6_5: + return 2; + break; + case TextureFormat::k_8: + return 1; + break; + case TextureFormat::k_8_8: + return 2; + break; + case TextureFormat::k_8_8_8_8: + return 4; + break; + case TextureFormat::k_16: + return 4; + break; + case TextureFormat::k_16_FLOAT: + return 4; + break; + case TextureFormat::k_16_16: + return 4; + break; + case TextureFormat::k_16_16_FLOAT: + return 4; + break; + case TextureFormat::k_16_16_16_16: + return 8; + break; + case TextureFormat::k_16_16_16_16_FLOAT: + return 8; + break; + case TextureFormat::k_32_FLOAT: + return 4; + break; + case TextureFormat::k_32_32_FLOAT: + return 8; + break; + case TextureFormat::k_32_32_32_32_FLOAT: + return 16; + break; + case TextureFormat::k_10_11_11: + case TextureFormat::k_11_11_10: + return 4; + break; + default: + assert_unhandled_case(format); + return 0; + } +} + inline TextureFormat ColorFormatToTextureFormat(ColorFormat color_format) { return static_cast(color_format); } diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 011c5b878..17f83f82c 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -362,7 +362,7 @@ void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr, command_buffer_pool_->EndBatch(current_batch_fence_); // TODO(DrChat): Remove this. - VkFence fences[] = { *current_batch_fence_ }; + VkFence fences[] = {*current_batch_fence_}; vkWaitForFences(*device_, 1, fences, true, -1); // Scavenging. @@ -733,9 +733,10 @@ bool VulkanCommandProcessor::PopulateVertexBuffers( return true; } -bool VulkanCommandProcessor::PopulateSamplers( - VkCommandBuffer command_buffer, VkCommandBuffer setup_buffer, - VulkanShader* vertex_shader, VulkanShader* pixel_shader) { +bool VulkanCommandProcessor::PopulateSamplers(VkCommandBuffer command_buffer, + VkCommandBuffer setup_buffer, + VulkanShader* vertex_shader, + VulkanShader* pixel_shader) { #if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // FINE_GRAINED_DRAW_SCOPES @@ -829,11 +830,13 @@ bool VulkanCommandProcessor::IssueCopy() { window_offset_y |= 0x8000; } + size_t read_size = GetTexelSize(ColorFormatToTextureFormat(copy_dest_format)); + // Adjust the copy base offset to point to the beginning of the texture, so // we don't run into hiccups down the road (e.g. resolving the last part going // backwards). 
-  int32_t dest_offset = window_offset_y * copy_dest_pitch * 4;
-  dest_offset += window_offset_x * 32 * 4;
+  int32_t dest_offset = window_offset_y * copy_dest_pitch * int(read_size);
+  dest_offset += window_offset_x * 32 * int(read_size);
   copy_dest_base += dest_offset;
 
   // HACK: vertices to use are always in vf0.

From c06a7cdf81bd3bfb54e842541277610f893ef17c Mon Sep 17 00:00:00 2001
From: "Dr. Chat"
Date: Sat, 7 May 2016 19:17:56 -0500
Subject: [PATCH 53/77] BaseFencedPool::has_open_batch (and other uncommitted
 changes)

---
 src/xenia/ui/vulkan/fenced_pools.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/xenia/ui/vulkan/fenced_pools.h b/src/xenia/ui/vulkan/fenced_pools.h
index 3a7bb01c4..a481edf10 100644
--- a/src/xenia/ui/vulkan/fenced_pools.h
+++ b/src/xenia/ui/vulkan/fenced_pools.h
@@ -14,6 +14,7 @@
 
 #include "xenia/base/assert.h"
 #include "xenia/ui/vulkan/vulkan.h"
+#include "xenia/ui/vulkan/vulkan_util.h"
 
 namespace xe {
 namespace ui {
@@ -40,13 +41,15 @@ class BaseFencedPool {
 
   // True if one or more batches are still pending on the GPU.
   bool has_pending() const { return pending_batch_list_head_ != nullptr; }
+  // True if a batch is open.
+  bool has_open_batch() const { return open_batch_ != nullptr; }
 
   // Checks all pending batches for completion and scavenges their entries.
   // This should be called as frequently as reasonable.
   void Scavenge() {
     while (pending_batch_list_head_) {
       auto batch = pending_batch_list_head_;
-      if (vkGetFenceStatus(device_, batch->fence) == VK_SUCCESS) {
+      if (vkGetFenceStatus(device_, *batch->fence) == VK_SUCCESS) {
         // Batch has completed. Reclaim.
         pending_batch_list_head_ = batch->next;
         if (batch == pending_batch_list_tail_) {
@@ -132,7 +135,7 @@ class BaseFencedPool {
 
   // Ends the current batch using the given fence to indicate when the batch
   // has completed execution on the GPU.
-  void EndBatch(VkFence fence) {
+  void EndBatch(std::shared_ptr<Fence> fence) {
     assert_not_null(open_batch_);
 
     // Close and see if we have anything.
@@ -194,7 +197,7 @@ class BaseFencedPool {
     Batch* next;
     Entry* entry_list_head;
     Entry* entry_list_tail;
-    VkFence fence;
+    std::shared_ptr<Fence> fence;
   };
 
   Batch* free_batch_list_head_ = nullptr;

From 2bb52ef86b8580f5819c81d09a1083f48bc3bc91 Mon Sep 17 00:00:00 2001
From: "Dr.
Chat" Date: Sun, 15 May 2016 12:01:38 -0500 Subject: [PATCH 54/77] SPIR-V: WIP shader compiler / optimizations / alpha test implementation --- src/xenia/gpu/premake5.lua | 2 + src/xenia/gpu/shader.h | 11 ++ src/xenia/gpu/spirv/compiler.cc | 36 +++++ src/xenia/gpu/spirv/compiler.h | 41 ++++++ src/xenia/gpu/spirv/compiler_pass.h | 37 +++++ .../passes/control_flow_analysis_pass.cpp | 30 ++++ .../spirv/passes/control_flow_analysis_pass.h | 34 +++++ .../control_flow_simplification_pass.cc | 48 ++++++ .../passes/control_flow_simplification_pass.h | 34 +++++ src/xenia/gpu/spirv_shader_translator.cc | 137 ++++++++++++++---- src/xenia/gpu/spirv_shader_translator.h | 2 + 11 files changed, 383 insertions(+), 29 deletions(-) create mode 100644 src/xenia/gpu/spirv/compiler.cc create mode 100644 src/xenia/gpu/spirv/compiler.h create mode 100644 src/xenia/gpu/spirv/compiler_pass.h create mode 100644 src/xenia/gpu/spirv/passes/control_flow_analysis_pass.cpp create mode 100644 src/xenia/gpu/spirv/passes/control_flow_analysis_pass.h create mode 100644 src/xenia/gpu/spirv/passes/control_flow_simplification_pass.cc create mode 100644 src/xenia/gpu/spirv/passes/control_flow_simplification_pass.h diff --git a/src/xenia/gpu/premake5.lua b/src/xenia/gpu/premake5.lua index 1f6a1eea6..1c7870edc 100644 --- a/src/xenia/gpu/premake5.lua +++ b/src/xenia/gpu/premake5.lua @@ -22,6 +22,8 @@ project("xenia-gpu") project_root.."/third_party/gflags/src", }) local_platform_files() + local_platform_files("spirv") + local_platform_files("spirv/passes") group("src") project("xenia-gpu-shader-compiler") diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 476369e53..95abe4dfa 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -99,6 +99,17 @@ struct InstructionResult { bool has_all_writes() const { return write_mask[0] && write_mask[1] && write_mask[2] && write_mask[3]; } + // Returns number of components written + uint32_t num_writes() const { + uint32_t total = 0; + for (int i = 0; i < 4; i++) { + if (write_mask[i]) { + total++; + } + } + + return total; + } // Returns true if any non-constant components are written. bool stores_non_constants() const { for (int i = 0; i < 4; ++i) { diff --git a/src/xenia/gpu/spirv/compiler.cc b/src/xenia/gpu/spirv/compiler.cc new file mode 100644 index 000000000..d31b36996 --- /dev/null +++ b/src/xenia/gpu/spirv/compiler.cc @@ -0,0 +1,36 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/gpu/spirv/compiler.h" + +namespace xe { +namespace gpu { +namespace spirv { + +Compiler::Compiler() {} + +void Compiler::AddPass(std::unique_ptr pass) { + compiler_passes_.push_back(std::move(pass)); +} + +bool Compiler::Compile(spv::Module* module) { + for (auto& pass : compiler_passes_) { + if (!pass->Run(module)) { + return false; + } + } + + return true; +} + +void Compiler::Reset() { compiler_passes_.clear(); } + +} // namespace spirv +} // namespace gpu +} // namespace xe \ No newline at end of file diff --git a/src/xenia/gpu/spirv/compiler.h b/src/xenia/gpu/spirv/compiler.h new file mode 100644 index 000000000..fd27969ee --- /dev/null +++ b/src/xenia/gpu/spirv/compiler.h @@ -0,0 +1,41 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_SPIRV_COMPILER_H_ +#define XENIA_GPU_SPIRV_COMPILER_H_ + +#include "xenia/base/arena.h" +#include "xenia/gpu/spirv/compiler_pass.h" + +#include "third_party/glslang-spirv/SpvBuilder.h" +#include "third_party/spirv/GLSL.std.450.hpp11" + +namespace xe { +namespace gpu { +namespace spirv { + +// SPIR-V Compiler. Designed to optimize SPIR-V code before feeding it into the +// drivers. +class Compiler { + public: + Compiler(); + + void AddPass(std::unique_ptr pass); + void Reset(); + bool Compile(spv::Module* module); + + private: + std::vector> compiler_passes_; +}; + +} // namespace spirv +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_SPIRV_COMPILER_H_ \ No newline at end of file diff --git a/src/xenia/gpu/spirv/compiler_pass.h b/src/xenia/gpu/spirv/compiler_pass.h new file mode 100644 index 000000000..0d81aeeee --- /dev/null +++ b/src/xenia/gpu/spirv/compiler_pass.h @@ -0,0 +1,37 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_SPIRV_COMPILER_PASS_H_ +#define XENIA_GPU_SPIRV_COMPILER_PASS_H_ + +#include "xenia/base/arena.h" + +#include "third_party/glslang-spirv/SpvBuilder.h" +#include "third_party/spirv/GLSL.std.450.hpp11" + +namespace xe { +namespace gpu { +namespace spirv { + +class CompilerPass { + public: + CompilerPass() = default; + virtual ~CompilerPass() {} + + virtual bool Run(spv::Module* module) = 0; + + private: + xe::Arena ir_arena_; +}; + +} // namespace spirv +} // namespace gpu +} // namespace xe + +#endif \ No newline at end of file diff --git a/src/xenia/gpu/spirv/passes/control_flow_analysis_pass.cpp b/src/xenia/gpu/spirv/passes/control_flow_analysis_pass.cpp new file mode 100644 index 000000000..4d719f769 --- /dev/null +++ b/src/xenia/gpu/spirv/passes/control_flow_analysis_pass.cpp @@ -0,0 +1,30 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/spirv/passes/control_flow_analysis_pass.h" + +namespace xe { +namespace gpu { +namespace spirv { + +ControlFlowAnalysisPass::ControlFlowAnalysisPass() {} + +bool ControlFlowAnalysisPass::Run(spv::Module* module) { + for (auto function : module->getFunctions()) { + // For each OpBranchConditional, see if we can find a point where control + // flow converges and then append an OpSelectionMerge. + // Potential problems: while loops constructed from branch instructions + } + + return true; +} + +} // namespace spirv +} // namespace gpu +} // namespace xe \ No newline at end of file diff --git a/src/xenia/gpu/spirv/passes/control_flow_analysis_pass.h b/src/xenia/gpu/spirv/passes/control_flow_analysis_pass.h new file mode 100644 index 000000000..6b279e251 --- /dev/null +++ b/src/xenia/gpu/spirv/passes/control_flow_analysis_pass.h @@ -0,0 +1,34 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_SPIRV_PASSES_CONTROL_FLOW_ANALYSIS_PASS_H_ +#define XENIA_GPU_SPIRV_PASSES_CONTROL_FLOW_ANALYSIS_PASS_H_ + +#include "xenia/gpu/spirv/compiler_pass.h" + +namespace xe { +namespace gpu { +namespace spirv { + +// Control-flow analysis pass. Runs through control-flow and adds merge opcodes +// where necessary. 
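// For reference: Vulkan requires structured control flow, so every
// OpBranchConditional that opens a selection must be immediately preceded
// by an OpSelectionMerge naming the block where the two paths reconverge.
// A minimal, hypothetical sketch of the insertion this pass would perform,
// using the Block::insertInstruction/getInstructionCount helpers added in
// patch 56; FindConvergencePoint is an assumed placeholder, not an
// existing helper:
//
//   spv::Block* merge_block = FindConvergencePoint(block);  // assumption
//   auto merge = std::make_unique<spv::Instruction>(spv::OpSelectionMerge);
//   merge->addIdOperand(merge_block->getId());
//   merge->addImmediateOperand(spv::SelectionControlMaskNone);
//   // The merge must sit directly before the terminating branch.
//   block->insertInstruction(block->getInstructionCount() - 1,
//                            std::move(merge));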
+class ControlFlowAnalysisPass : public CompilerPass { + public: + ControlFlowAnalysisPass(); + + bool Run(spv::Module* module) override; + + private: +}; + +} // namespace spirv +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_SPIRV_PASSES_CONTROL_FLOW_ANALYSIS_PASS_H_ \ No newline at end of file diff --git a/src/xenia/gpu/spirv/passes/control_flow_simplification_pass.cc b/src/xenia/gpu/spirv/passes/control_flow_simplification_pass.cc new file mode 100644 index 000000000..7b01aa5aa --- /dev/null +++ b/src/xenia/gpu/spirv/passes/control_flow_simplification_pass.cc @@ -0,0 +1,48 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/spirv/passes/control_flow_simplification_pass.h" + +namespace xe { +namespace gpu { +namespace spirv { + +ControlFlowSimplificationPass::ControlFlowSimplificationPass() {} + +bool ControlFlowSimplificationPass::Run(spv::Module* module) { + for (auto function : module->getFunctions()) { + // Walk through the blocks in the function and merge any blocks which are + // unconditionally dominated. + for (auto it = function->getBlocks().end() - 1; + it != function->getBlocks().begin() - 1;) { + auto block = *it; + if (!block->isUnreachable() && block->getPredecessors().size() == 1) { + auto prev_block = block->getPredecessors()[0]; + auto last_instr = + prev_block->getInstruction(prev_block->getInstructionCount() - 1); + if (last_instr->getOpCode() == spv::Op::OpBranch) { + if (prev_block->getSuccessors().size() == 1 && + prev_block->getSuccessors()[0] == block) { + // We're dominated by this block. Merge into it. + prev_block->merge(block); + block->setUnreachable(); + } + } + } + + --it; + } + } + + return true; +} + +} // namespace spirv +} // namespace gpu +} // namespace xe \ No newline at end of file diff --git a/src/xenia/gpu/spirv/passes/control_flow_simplification_pass.h b/src/xenia/gpu/spirv/passes/control_flow_simplification_pass.h new file mode 100644 index 000000000..f851d24f1 --- /dev/null +++ b/src/xenia/gpu/spirv/passes/control_flow_simplification_pass.h @@ -0,0 +1,34 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_SPIRV_PASSES_CONTROL_FLOW_SIMPLIFICATION_PASS_H_ +#define XENIA_GPU_SPIRV_PASSES_CONTROL_FLOW_SIMPLIFICATION_PASS_H_ + +#include "xenia/gpu/spirv/compiler_pass.h" + +namespace xe { +namespace gpu { +namespace spirv { + +// Control-flow simplification pass. Combines adjacent blocks and marks +// any unreachable blocks. 
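// For reference: the merge performed in Run() is only legal when the
// predecessor's terminator is an unconditional OpBranch whose sole
// successor is the current block; Block::merge() then splices the
// instructions in and setUnreachable() marks the emptied block. The
// backwards walk over function->getBlocks() lets a chain a -> b -> c
// collapse in a single sweep: c folds into b first, then the combined
// block folds into a when b is visited.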
+class ControlFlowSimplificationPass : public CompilerPass { + public: + ControlFlowSimplificationPass(); + + bool Run(spv::Module* module) override; + + private: +}; + +} // namespace spirv +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_SPIRV_PASSES_CONTROL_FLOW_SIMPLIFICATION_PASS_H_ \ No newline at end of file diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 855df73f7..86bddcd80 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -12,16 +12,24 @@ #include #include "xenia/base/logging.h" +#include "xenia/gpu/spirv/passes/control_flow_analysis_pass.h" +#include "xenia/gpu/spirv/passes/control_flow_simplification_pass.h" namespace xe { namespace gpu { using namespace ucode; +constexpr int kMaxInterpolators = 16; +constexpr int kMaxTemporaryRegisters = 64; + using spv::GLSLstd450; using spv::Id; using spv::Op; -SpirvShaderTranslator::SpirvShaderTranslator() = default; +SpirvShaderTranslator::SpirvShaderTranslator() { + compiler_.AddPass(std::make_unique()); + compiler_.AddPass(std::make_unique()); +} SpirvShaderTranslator::~SpirvShaderTranslator() = default; @@ -331,11 +339,19 @@ void SpirvShaderTranslator::StartTranslation() { ps_param_gen_idx, b.makeUintConstant(-1)); spv::Builder::If ifb(cond, b); - // Index is specified - auto reg_ptr = b.createAccessChain(spv::StorageClass::StorageClassFunction, - registers_ptr_, - std::vector({ps_param_gen_idx})); - b.createStore(param, reg_ptr); + // FYI: We do this instead of r[ps_param_gen_idx] because that causes + // nvidia to move all registers into local memory (slow!) + for (uint32_t i = 0; i < kMaxInterpolators; i++) { + auto reg_ptr = b.createAccessChain( + spv::StorageClass::StorageClassFunction, registers_ptr_, + std::vector({b.makeUintConstant(i)})); + + auto cond = b.createBinOp(spv::Op::OpIEqual, bool_type_, ps_param_gen_idx, + b.makeUintConstant(i)); + auto reg = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, cond, param, + b.createLoad(reg_ptr)); + b.createStore(reg, reg_ptr); + } ifb.makeEndIf(); } @@ -406,28 +422,64 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { b.createStore(p, pos_); } else { // Alpha test - auto alpha_test_x = b.createCompositeExtract(push_consts_, float_type_, - std::vector{2, 0}); - auto cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, alpha_test_x, - b.makeFloatConstant(1.f)); + auto alpha_test_enabled = b.createCompositeExtract( + push_consts_, float_type_, std::vector{2, 0}); + auto alpha_test_func = b.createCompositeExtract( + push_consts_, float_type_, std::vector{2, 1}); + auto alpha_test_ref = b.createCompositeExtract(push_consts_, float_type_, + std::vector{2, 2}); + alpha_test_func = + b.createUnaryOp(spv::Op::OpConvertFToU, uint_type_, alpha_test_func); + auto oC0_alpha = b.createCompositeExtract(frag_outputs_, float_type_, + std::vector({0, 3})); + auto cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, + alpha_test_enabled, b.makeFloatConstant(1.f)); spv::Builder::If alpha_if(cond, b); - // TODO(DrChat): Apply alpha test. 
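// For reference: alpha_test_func carries the Xenos compare function
// (0 = never, 1 = <, 2 = ==, 3 = <=, 4 = >, 5 = !=, 6 = >=, 7 = always),
// and a fragment passes when oC[0].a compares true against
// alpha_test_ref. Because the generated code below discards on failure,
// alpha_op_map stores the inverse of each comparison; e.g. for
// func == 1 (pass when a < ref) the emitted discard test is:
//
//   auto discard = b.createBinOp(spv::Op::OpFOrdGreaterThanEqual,
//                                bool_type_, oC0_alpha, alpha_test_ref);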
+ std::vector switch_segments; + b.makeSwitch(alpha_test_func, 8, std::vector({0, 1, 2, 3, 4, 5, 6, 7}), + std::vector({0, 1, 2, 3, 4, 5, 6, 7}), 7, + switch_segments); + + const static spv::Op alpha_op_map[] = { + spv::Op::OpNop, + spv::Op::OpFOrdGreaterThanEqual, + spv::Op::OpFOrdNotEqual, + spv::Op::OpFOrdGreaterThan, + spv::Op::OpFOrdLessThanEqual, + spv::Op::OpFOrdEqual, + spv::Op::OpFOrdLessThan, + spv::Op::OpNop, + }; + // if (alpha_func == 0) passes = false; - // if (alpha_func == 1 && oC[0].a < alpha_ref) passes = true; - // if (alpha_func == 2 && oC[0].a == alpha_ref) passes = true; - // if (alpha_func == 3 && oC[0].a <= alpha_ref) passes = true; - // if (alpha_func == 4 && oC[0].a > alpha_ref) passes = true; - // if (alpha_func == 5 && oC[0].a != alpha_ref) passes = true; - // if (alpha_func == 6 && oC[0].a >= alpha_ref) passes = true; + b.nextSwitchSegment(switch_segments, 0); + b.makeDiscard(); + b.addSwitchBreak(); + + for (int i = 1; i < 7; i++) { + b.nextSwitchSegment(switch_segments, i); + auto cond = + b.createBinOp(alpha_op_map[i], bool_type_, oC0_alpha, alpha_test_ref); + spv::Builder::If discard_if(cond, b); + b.makeDiscard(); + discard_if.makeEndIf(); + b.addSwitchBreak(); + } + // if (alpha_func == 7) passes = true; + b.nextSwitchSegment(switch_segments, 7); + b.endSwitch(switch_segments); alpha_if.makeEndIf(); } b.makeReturn(false); + // Compile the spv IR + compiler_.Compile(b.getModule()); + std::vector spirv_words; b.dump(spirv_words); @@ -555,8 +607,7 @@ void SpirvShaderTranslator::ProcessExecInstructionBegin( auto next_block = cf_blocks_[instr.dword_index + 1]; if (next_block.prev_dominates) { - b.createNoResultOp(spv::Op::OpSelectionMerge, - {next_block.block->getId(), 0}); + b.createSelectionMerge(next_block.block, spv::SelectionControlMaskNone); } b.createConditionalBranch(cond, body, next_block.block); } break; @@ -570,8 +621,7 @@ void SpirvShaderTranslator::ProcessExecInstructionBegin( auto next_block = cf_blocks_[instr.dword_index + 1]; if (next_block.prev_dominates) { - b.createNoResultOp(spv::Op::OpSelectionMerge, - {next_block.block->getId(), 0}); + b.createSelectionMerge(next_block.block, spv::SelectionControlMaskNone); } b.createConditionalBranch(cond, body, next_block.block); @@ -756,8 +806,8 @@ void SpirvShaderTranslator::ProcessVertexFetchInstruction( predicated_block_cond_ = instr.predicate_condition; predicated_block_end_ = &b.makeNewBlock(); - b.createNoResultOp(spv::Op::OpSelectionMerge, - {predicated_block_end_->getId(), 0}); + b.createSelectionMerge(predicated_block_end_, + spv::SelectionControlMaskNone); b.createConditionalBranch(pred_cond, block, predicated_block_end_); b.setBuildPoint(block); } @@ -771,6 +821,7 @@ void SpirvShaderTranslator::ProcessVertexFetchInstruction( auto shader_vertex_id = b.createLoad(vertex_id_); auto cond = b.createBinOp(spv::Op::OpIEqual, bool_type_, vertex_id, shader_vertex_id); + cond = b.smearScalar(spv::NoPrecision, cond, vec4_bool_type_); // Skip loading if it's an indexed fetch. 
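// For reference: OpSelect producing a vec4 requires a vec4 of booleans,
// one per component, which is why the scalar comparison above is smeared
// before use. A minimal sketch of the per-component select this enables,
// assuming the fetched value and its alternate expansion are both vec4
// (names follow the surrounding code):
//
//   auto value = b.createTriOp(spv::Op::OpSelect, vec4_float_type_,
//                              cond, vertex, alt_vertex);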
auto vertex_ptr = vertex_binding_map_[instr.operands[1].storage_index] @@ -778,6 +829,30 @@ void SpirvShaderTranslator::ProcessVertexFetchInstruction( assert_not_zero(vertex_ptr); auto vertex = b.createLoad(vertex_ptr); + switch (instr.attributes.data_format) { + case VertexFormat::k_8_8_8_8: + case VertexFormat::k_16_16: + case VertexFormat::k_16_16_16_16: + case VertexFormat::k_16_16_16_16_FLOAT: + case VertexFormat::k_32: + case VertexFormat::k_32_32: + case VertexFormat::k_32_32_32_32: + case VertexFormat::k_32_FLOAT: + case VertexFormat::k_32_32_FLOAT: + case VertexFormat::k_32_32_32_FLOAT: + case VertexFormat::k_32_32_32_32_FLOAT: + // These are handled, for now. + break; + + case VertexFormat::k_10_11_11: { + // No conversion needed. Natively supported. + } break; + + case VertexFormat::k_11_11_10: { + // This needs to be converted. + } break; + } + auto vertex_components = b.getNumComponents(vertex); Id alt_vertex = 0; switch (vertex_components) { @@ -836,8 +911,8 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( predicated_block_cond_ = instr.predicate_condition; predicated_block_end_ = &b.makeNewBlock(); - b.createNoResultOp(spv::Op::OpSelectionMerge, - {predicated_block_end_->getId(), 0}); + b.createSelectionMerge(predicated_block_end_, + spv::SelectionControlMaskNone); b.createConditionalBranch(pred_cond, block, predicated_block_end_); b.setBuildPoint(block); } @@ -940,8 +1015,8 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( predicated_block_cond_ = instr.predicate_condition; predicated_block_end_ = &b.makeNewBlock(); - b.createNoResultOp(spv::Op::OpSelectionMerge, - {predicated_block_end_->getId(), 0}); + b.createSelectionMerge(predicated_block_end_, + spv::SelectionControlMaskNone); b.createConditionalBranch(pred_cond, block, predicated_block_end_); b.setBuildPoint(block); } @@ -1170,6 +1245,7 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( auto c_and = b.createBinOp(spv::Op::OpLogicalAnd, vec4_bool_type_, c0, c1); auto c_and_x = b.createCompositeExtract(c_and, bool_type_, 0); + c_and_x = b.smearScalar(spv::NoPrecision, c_and_x, vec4_bool_type_); auto c_and_w = b.createCompositeExtract(c_and, bool_type_, 3); // p0 @@ -1194,6 +1270,7 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( auto c_and = b.createBinOp(spv::Op::OpLogicalAnd, vec4_bool_type_, c0, c1); auto c_and_x = b.createCompositeExtract(c_and, bool_type_, 0); + c_and_x = b.smearScalar(spv::NoPrecision, c_and_x, vec4_bool_type_); auto c_and_w = b.createCompositeExtract(c_and, bool_type_, 3); // p0 @@ -1218,6 +1295,7 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( auto c_and = b.createBinOp(spv::Op::OpLogicalAnd, vec4_bool_type_, c0, c1); auto c_and_x = b.createCompositeExtract(c_and, bool_type_, 0); + c_and_x = b.smearScalar(spv::NoPrecision, c_and_x, vec4_bool_type_); auto c_and_w = b.createCompositeExtract(c_and, bool_type_, 3); // p0 @@ -1242,6 +1320,7 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction( auto c_and = b.createBinOp(spv::Op::OpLogicalAnd, vec4_bool_type_, c0, c1); auto c_and_x = b.createCompositeExtract(c_and, bool_type_, 0); + c_and_x = b.smearScalar(spv::NoPrecision, c_and_x, vec4_bool_type_); auto c_and_w = b.createCompositeExtract(c_and, bool_type_, 3); // p0 @@ -1376,8 +1455,8 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( predicated_block_cond_ = instr.predicate_condition; predicated_block_end_ = &b.makeNewBlock(); - b.createNoResultOp(spv::Op::OpSelectionMerge, - {predicated_block_end_->getId(), 0}); + 
b.createSelectionMerge(predicated_block_end_, + spv::SelectionControlMaskNone); b.createConditionalBranch(pred_cond, block, predicated_block_end_); b.setBuildPoint(block); } diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 39d3899c1..b6a761a24 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -17,6 +17,7 @@ #include "third_party/glslang-spirv/SpvBuilder.h" #include "third_party/spirv/GLSL.std.450.hpp11" #include "xenia/gpu/shader_translator.h" +#include "xenia/gpu/spirv/compiler.h" #include "xenia/ui/spirv/spirv_disassembler.h" #include "xenia/ui/spirv/spirv_validator.h" @@ -97,6 +98,7 @@ class SpirvShaderTranslator : public ShaderTranslator { xe::ui::spirv::SpirvDisassembler disassembler_; xe::ui::spirv::SpirvValidator validator_; + xe::gpu::spirv::Compiler compiler_; // True if there's an open predicated block bool open_predicated_block_ = false; From b9a40d1a00f8f5c6c6ea1cfa98ed1c0c9acdba6c Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sun, 15 May 2016 12:08:29 -0500 Subject: [PATCH 55/77] Use Vulkan as the default graphics backend. --- src/xenia/app/xenia_main.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xenia/app/xenia_main.cc b/src/xenia/app/xenia_main.cc index bc9b662c5..80ed35551 100644 --- a/src/xenia/app/xenia_main.cc +++ b/src/xenia/app/xenia_main.cc @@ -78,7 +78,7 @@ std::unique_ptr CreateGraphicsSystem() { std::unique_ptr best; best = std::unique_ptr( - new xe::gpu::gl4::GL4GraphicsSystem()); + new xe::gpu::vulkan::VulkanGraphicsSystem()); if (best) { return best; } From 44284a780c80d0009e0b31c92cde0c57d38b76c2 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sun, 15 May 2016 14:27:44 -0500 Subject: [PATCH 56/77] SPIR-V: Misc. 
IR utility functions --- third_party/glslang-spirv/SpvBuilder.cpp | 6 ++ third_party/glslang-spirv/SpvBuilder.h | 4 +- third_party/glslang-spirv/spvIR.h | 77 +++++++++++++++++++++++- 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/third_party/glslang-spirv/SpvBuilder.cpp b/third_party/glslang-spirv/SpvBuilder.cpp index 0a2fa2139..13a6c946a 100644 --- a/third_party/glslang-spirv/SpvBuilder.cpp +++ b/third_party/glslang-spirv/SpvBuilder.cpp @@ -1166,6 +1166,7 @@ void Builder::createMemoryBarrier(unsigned executionScope, unsigned memorySemant // An opcode that has one operands, a result id, and a type Id Builder::createUnaryOp(Op opCode, Id typeId, Id operand) { + assert(operand != 0); Instruction* op = new Instruction(getUniqueId(), typeId, opCode); op->addIdOperand(operand); buildPoint->addInstruction(std::unique_ptr(op)); @@ -1175,6 +1176,8 @@ Id Builder::createUnaryOp(Op opCode, Id typeId, Id operand) Id Builder::createBinOp(Op opCode, Id typeId, Id left, Id right) { + assert(left != 0); + assert(right != 0); Instruction* op = new Instruction(getUniqueId(), typeId, opCode); op->addIdOperand(left); op->addIdOperand(right); @@ -1185,6 +1188,9 @@ Id Builder::createBinOp(Op opCode, Id typeId, Id left, Id right) Id Builder::createTriOp(Op opCode, Id typeId, Id op1, Id op2, Id op3) { + assert(op1 != 0); + assert(op2 != 0); + assert(op3 != 0); Instruction* op = new Instruction(getUniqueId(), typeId, opCode); op->addIdOperand(op1); op->addIdOperand(op2); diff --git a/third_party/glslang-spirv/SpvBuilder.h b/third_party/glslang-spirv/SpvBuilder.h index d6dc61218..7eae4fe91 100644 --- a/third_party/glslang-spirv/SpvBuilder.h +++ b/third_party/glslang-spirv/SpvBuilder.h @@ -93,6 +93,8 @@ public: return id; } + Module* getModule() { return &module; } + // For creating new types (will return old type if the requested one was already made). 
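// (The operand asserts added to createUnaryOp/createBinOp/createTriOp in
// SpvBuilder.cpp above fire whenever a translator path passes a zero Id,
// i.e. a value that was never produced; failing fast there is far easier
// to debug than the invalid SPIR-V that would otherwise be emitted.)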
Id makeVoidType(); Id makeBoolType(); @@ -517,6 +519,7 @@ public: void createBranch(Block* block); void createConditionalBranch(Id condition, Block* thenBlock, Block* elseBlock); void createLoopMerge(Block* mergeBlock, Block* continueBlock, unsigned int control); + void createSelectionMerge(Block* mergeBlock, unsigned int control); protected: Id makeIntConstant(Id typeId, unsigned value, bool specConstant); @@ -527,7 +530,6 @@ public: void transferAccessChainSwizzle(bool dynamic); void simplifyAccessChainSwizzle(); void createAndSetNoPredecessorBlock(const char*); - void createSelectionMerge(Block* mergeBlock, unsigned int control); void dumpInstructions(std::vector&, const std::vector >&) const; SourceLanguage source; diff --git a/third_party/glslang-spirv/spvIR.h b/third_party/glslang-spirv/spvIR.h index 98f4971b4..63e460ebb 100644 --- a/third_party/glslang-spirv/spvIR.h +++ b/third_party/glslang-spirv/spvIR.h @@ -180,6 +180,11 @@ public: void addInstruction(std::unique_ptr inst); void addPredecessor(Block* pred) { predecessors.push_back(pred); pred->successors.push_back(this);} void addLocalVariable(std::unique_ptr inst) { localVariables.push_back(std::move(inst)); } + void insertInstruction(size_t pos, std::unique_ptr inst); + + size_t getInstructionCount() { return instructions.size(); } + Instruction* getInstruction(size_t i) { return instructions[i].get(); } + void removeInstruction(size_t i) { instructions.erase(instructions.begin() + i); } const std::vector& getPredecessors() const { return predecessors; } const std::vector& getSuccessors() const { return successors; } void setUnreachable() { unreachable = true; } @@ -200,6 +205,10 @@ public: bool isTerminated() const { + if (instructions.size() == 0) { + return false; + } + switch (instructions.back()->getOpCode()) { case OpBranch: case OpBranchConditional: @@ -215,6 +224,7 @@ public: void dump(std::vector& out) const { + // OpLabel instructions[0]->dump(out); for (int i = 0; i < (int)localVariables.size(); ++i) localVariables[i]->dump(out); @@ -222,7 +232,51 @@ public: instructions[i]->dump(out); } -protected: + // Moves all instructions from a target block into this block, and removes + // the target block from our list of successors. + // This function assumes this block unconditionally branches to the target + // block directly. + void merge(Block* target_block) { + if (isTerminated()) { + instructions.erase(instructions.end() - 1); + } + + // Find the target block in our successors first. + for (auto it = successors.begin(); it != successors.end(); ++it) { + if (*it == target_block) { + it = successors.erase(it); + break; + } + } + + // Add target block's successors to our successors. + successors.insert(successors.end(), target_block->successors.begin(), + target_block->successors.end()); + + // For each successor, replace the target block in their predecessors with + // us. + for (auto block : successors) { + std::replace(block->predecessors.begin(), block->predecessors.end(), + target_block, this); + } + + // Move instructions from target block into this block. 
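// Every block's first instruction is its OpLabel, which identifies the
// block and must remain unique, so the loop below skips it while
// splicing; the remaining instructions keep their source order.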
+ for (auto it = target_block->instructions.begin(); + it != target_block->instructions.end();) { + if ((*it)->getOpCode() == spv::Op::OpLabel) { + ++it; + continue; + } + + instructions.push_back(std::move(*it)); + it = target_block->instructions.erase(it); + } + + target_block->predecessors.clear(); + target_block->successors.clear(); + } + + protected: Block(const Block&); Block& operator=(Block&); @@ -275,6 +329,17 @@ public: Module& getParent() const { return parent; } Block* getEntryBlock() const { return blocks.front(); } Block* getLastBlock() const { return blocks.back(); } + Block* findBlockById(Id id) + { + for (auto block : blocks) { + if (block->getId() == id) { + return block; + } + } + + return nullptr; + } + std::vector& getBlocks() { return blocks; } void addLocalVariable(std::unique_ptr inst); Id getReturnType() const { return functionInstruction.getTypeId(); } void dump(std::vector& out) const @@ -315,6 +380,8 @@ public: } void addFunction(Function *fun) { functions.push_back(fun); } + const std::vector& getFunctions() const { return functions; } + std::vector& getFunctions() { return functions; } void mapInstruction(Instruction *instruction) { @@ -398,6 +465,14 @@ __inline void Block::addInstruction(std::unique_ptr inst) parent.getParent().mapInstruction(raw_instruction); } +__inline void Block::insertInstruction(size_t pos, std::unique_ptr inst) { + Instruction* raw_instruction = inst.get(); + instructions.insert(instructions.begin() + pos, std::move(inst)); + raw_instruction->setBlock(this); + if (raw_instruction->getResultId()) + parent.getParent().mapInstruction(raw_instruction); +} + }; // end spv namespace #endif // spvIR_H From b025790207ff15757d745a07a7ffa0a130fcb937 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Tue, 17 May 2016 05:58:52 -0500 Subject: [PATCH 57/77] Disable Vulkan native MSAA by default for now. --- src/xenia/gpu/vulkan/vulkan_gpu_flags.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xenia/gpu/vulkan/vulkan_gpu_flags.cc b/src/xenia/gpu/vulkan/vulkan_gpu_flags.cc index 52bc10c84..fd2fe7789 100644 --- a/src/xenia/gpu/vulkan/vulkan_gpu_flags.cc +++ b/src/xenia/gpu/vulkan/vulkan_gpu_flags.cc @@ -11,6 +11,6 @@ DEFINE_bool(vulkan_renderdoc_capture_all, false, "Capture everything with RenderDoc."); -DEFINE_bool(vulkan_native_msaa, true, "Use native MSAA"); +DEFINE_bool(vulkan_native_msaa, false, "Use native MSAA"); DEFINE_bool(vulkan_dump_disasm, false, "Dump shader disassembly. NVIDIA only supported."); From 1faf5a813aa5be69227c1103da48b9811d57fd3c Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Sun, 22 May 2016 19:57:05 -0500 Subject: [PATCH 58/77] Fix ALU scalar swizzles (Possibly) --- src/xenia/gpu/shader_translator.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index f6bfbdd65..79381d909 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -1047,9 +1047,8 @@ void ParseAluInstructionOperand(const AluInstruction& op, int i, uint32_t a = swizzle & 0x3; out_op->components[0] = GetSwizzleFromComponentIndex(a); } else if (swizzle_component_count == 2) { - swizzle >>= 4; - uint32_t a = ((swizzle >> 2) + 3) & 0x3; - uint32_t b = (swizzle + 2) & 0x3; + uint32_t a = ((swizzle >> 6) + 3) & 0x3; + uint32_t b = ((swizzle >> 0) + 0) & 0x3; out_op->components[0] = GetSwizzleFromComponentIndex(a); out_op->components[1] = GetSwizzleFromComponentIndex(b); } else { @@ -1129,6 +1128,10 @@ void ShaderTranslator::ParseAluVectorInstruction( } else { // Unimplemented. // assert_always(); + XELOGE( + "ShaderTranslator::ParseAluVectorInstruction: Unsupported write " + "to export %d", + dest_num); i.result.storage_target = InstructionStorageTarget::kNone; i.result.storage_index = 0; } From d94ff6eb2510a9826a741ca92de7b0992211ad80 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sun, 22 May 2016 19:58:50 -0500 Subject: [PATCH 59/77] Shaders: Track the register count from the program control register (if available) --- src/xenia/gpu/shader.h | 4 ++++ src/xenia/gpu/shader_translator.cc | 14 ++++++++++++++ src/xenia/gpu/shader_translator.h | 7 +++++++ 3 files changed, 25 insertions(+) diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 95abe4dfa..7e0cd3ab2 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -558,6 +558,9 @@ class Shader { // True if the shader was translated and prepared without error. bool is_valid() const { return is_valid_; } + // True if the shader has already been translated. + bool is_translated() const { return is_translated_; } + // Errors that occurred during translation. const std::vector& errors() const { return errors_; } @@ -602,6 +605,7 @@ class Shader { bool writes_color_targets_[4] = {false, false, false, false}; bool is_valid_ = false; + bool is_translated_ = false; std::vector errors_; std::string ucode_disassembly_; diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index 79381d909..5bb9ba016 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -51,6 +51,7 @@ void ShaderTranslator::Reset() { ucode_disasm_buffer_.Reset(); ucode_disasm_line_number_ = 0; previous_ucode_disasm_scan_offset_ = 0; + register_count_ = 64; total_attrib_count_ = 0; vertex_bindings_.clear(); texture_bindings_.clear(); @@ -95,9 +96,21 @@ bool ShaderTranslator::GatherAllBindingInformation(Shader* shader) { return true; } +bool ShaderTranslator::Translate(Shader* shader, + xenos::xe_gpu_program_cntl_t cntl) { + Reset(); + register_count_ = shader->type() == ShaderType::kVertex ? 
cntl.vs_regs + 1 + : cntl.ps_regs + 1; + + return TranslateInternal(shader); +} + bool ShaderTranslator::Translate(Shader* shader) { Reset(); + return TranslateInternal(shader); +} +bool ShaderTranslator::TranslateInternal(Shader* shader) { shader_type_ = shader->type(); ucode_dwords_ = shader->ucode_dwords(); ucode_dword_count_ = shader->ucode_dword_count(); @@ -155,6 +168,7 @@ bool ShaderTranslator::Translate(Shader* shader) { } shader->is_valid_ = true; + shader->is_translated_ = true; for (const auto& error : shader->errors_) { if (error.is_fatal) { shader->is_valid_ = false; diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h index 5df53bc0a..9801cb2d6 100644 --- a/src/xenia/gpu/shader_translator.h +++ b/src/xenia/gpu/shader_translator.h @@ -30,6 +30,7 @@ class ShaderTranslator { // DEPRECATED(benvanik): remove this when shader cache is removed. bool GatherAllBindingInformation(Shader* shader); + bool Translate(Shader* shader, xenos::xe_gpu_program_cntl_t cntl); bool Translate(Shader* shader); protected: @@ -38,6 +39,8 @@ class ShaderTranslator { // Resets translator state before beginning translation. virtual void Reset(); + // Register count. + uint32_t register_count() const { return register_count_; } // True if the current shader is a vertex shader. bool is_vertex_shader() const { return shader_type_ == ShaderType::kVertex; } // True if the current shader is a pixel shader. @@ -132,6 +135,8 @@ class ShaderTranslator { int src_swizzle_component_count; }; + bool TranslateInternal(Shader* shader); + void MarkUcodeInstruction(uint32_t dword_offset); void AppendUcodeDisasm(char c); void AppendUcodeDisasm(const char* value); @@ -184,6 +189,8 @@ class ShaderTranslator { ShaderType shader_type_; const uint32_t* ucode_dwords_; size_t ucode_dword_count_; + xenos::xe_gpu_program_cntl_t program_cntl_; + uint32_t register_count_; // Accumulated translation errors. std::vector errors_; From d1b4d61b52f94ab831cda6fb8fccfdf4af6fd8f0 Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Sun, 22 May 2016 20:01:42 -0500 Subject: [PATCH 60/77] SPIR-V: Use the register count from the program control register Workaround for broken OpBitFieldUExtract on NVIDIA drivers kRcpc/kRcpf/kRsqc/kRsqf Fix broken ps_ usage --- src/xenia/gpu/spirv_shader_translator.cc | 83 +++++++++++++++++++----- 1 file changed, 66 insertions(+), 17 deletions(-) diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 86bddcd80..229951c8e 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -19,8 +19,8 @@ namespace xe { namespace gpu { using namespace ucode; -constexpr int kMaxInterpolators = 16; -constexpr int kMaxTemporaryRegisters = 64; +constexpr uint32_t kMaxInterpolators = 16; +constexpr uint32_t kMaxTemporaryRegisters = 64; using spv::GLSLstd450; using spv::Id; @@ -47,6 +47,7 @@ void SpirvShaderTranslator::StartTranslation() { spv::MemoryModel::MemoryModelGLSL450); b.addCapability(spv::Capability::CapabilityShader); b.addCapability(spv::Capability::CapabilityGenericPointer); + if (is_vertex_shader()) { b.addCapability(spv::Capability::CapabilityClipDistance); b.addCapability(spv::Capability::CapabilityCullDistance); @@ -79,8 +80,8 @@ void SpirvShaderTranslator::StartTranslation() { std::vector({b.makeFloatConstant(0.f), b.makeFloatConstant(0.f), b.makeFloatConstant(0.f), b.makeFloatConstant(0.f)})); - registers_type_ = - b.makeArrayType(vec4_float_type_, b.makeUintConstant(64), 0); + registers_type_ = b.makeArrayType(vec4_float_type_, + b.makeUintConstant(register_count()), 0); registers_ptr_ = b.createVariable(spv::StorageClass::StorageClassFunction, registers_type_, "r"); @@ -197,8 +198,8 @@ void SpirvShaderTranslator::StartTranslation() { } // Interpolators. - Id interpolators_type = - b.makeArrayType(vec4_float_type_, b.makeUintConstant(16), 0); + Id interpolators_type = b.makeArrayType( + vec4_float_type_, b.makeUintConstant(kMaxInterpolators), 0); if (is_vertex_shader()) { // Vertex inputs/outputs. for (const auto& binding : vertex_bindings()) { @@ -248,7 +249,8 @@ void SpirvShaderTranslator::StartTranslation() { interpolators_ = b.createVariable(spv::StorageClass::StorageClassOutput, interpolators_type, "interpolators"); b.addDecoration(interpolators_, spv::Decoration::DecorationLocation, 0); - for (uint32_t i = 0; i < 16; i++) { + for (uint32_t i = 0; i < std::min(register_count(), kMaxInterpolators); + i++) { // Zero interpolators. auto ptr = b.createAccessChain(spv::StorageClass::StorageClassOutput, interpolators_, @@ -300,7 +302,8 @@ void SpirvShaderTranslator::StartTranslation() { // b.createNoResultOp(spv::Op::OpCopyMemorySized, // {registers_ptr_, interpolators_, // b.makeUintConstant(16 * 4 * sizeof(float))}); - for (int i = 0; i < 16; i++) { + for (uint32_t i = 0; i < std::min(register_count(), kMaxInterpolators); + i++) { // For now, copy interpolators register-by-register :/ auto idx = b.makeUintConstant(i); auto i_a = b.createAccessChain(spv::StorageClass::StorageClassInput, @@ -341,7 +344,8 @@ void SpirvShaderTranslator::StartTranslation() { // FYI: We do this instead of r[ps_param_gen_idx] because that causes // nvidia to move all registers into local memory (slow!) 
- for (uint32_t i = 0; i < kMaxInterpolators; i++) { + for (uint32_t i = 0; i < std::min(register_count(), kMaxInterpolators); + i++) { auto reg_ptr = b.createAccessChain( spv::StorageClass::StorageClassFunction, registers_ptr_, std::vector({b.makeUintConstant(i)})); @@ -586,7 +590,6 @@ void SpirvShaderTranslator::ProcessExecInstructionBegin( } break; case ParsedExecInstruction::Type::kConditional: { // Based off of bool_consts - // FIXME: Nvidia compiler is complaining about this. std::vector offsets; offsets.push_back(b.makeUintConstant(2)); // bool_consts offsets.push_back(b.makeUintConstant(instr.bool_constant_index / 32)); @@ -595,15 +598,25 @@ void SpirvShaderTranslator::ProcessExecInstructionBegin( v = b.createLoad(v); // Bitfield extract the bool constant. + // FIXME: NVidia's compiler seems to be broken on this instruction? + /* v = b.createTriOp(spv::Op::OpBitFieldUExtract, uint_type_, v, b.makeUintConstant(instr.bool_constant_index % 32), b.makeUintConstant(1)); + auto cond = b.createBinOp(spv::Op::OpIEqual, bool_type_, v, + b.makeUintConstant(instr.condition ? 1 : 0)); + */ + v = b.createBinOp( + spv::Op::OpBitwiseAnd, uint_type_, v, + b.makeUintConstant(1 << (instr.bool_constant_index % 32))); + auto cond = b.createBinOp( + instr.condition ? spv::Op::OpINotEqual : spv::Op::OpIEqual, + bool_type_, v, b.makeUintConstant(0)); + // Conditional branch assert_true(cf_blocks_.size() > instr.dword_index + 1); body = &b.makeNewBlock(); - auto cond = b.createBinOp(spv::Op::OpIEqual, bool_type_, v, - b.makeUintConstant(instr.condition ? 1 : 0)); auto next_block = cf_blocks_[instr.dword_index + 1]; if (next_block.prev_dominates) { @@ -731,6 +744,8 @@ void SpirvShaderTranslator::ProcessJumpInstruction( consts_, offsets); v = b.createLoad(v); + // FIXME: NVidia's compiler seems to be broken on this instruction? + /* // Bitfield extract the bool constant. v = b.createTriOp(spv::Op::OpBitFieldUExtract, uint_type_, v, b.makeUintConstant(instr.bool_constant_index % 32), @@ -739,6 +754,14 @@ void SpirvShaderTranslator::ProcessJumpInstruction( // Conditional branch auto cond = b.createBinOp(spv::Op::OpIEqual, bool_type_, v, b.makeUintConstant(instr.condition ? 1 : 0)); + */ + v = b.createBinOp( + spv::Op::OpBitwiseAnd, uint_type_, v, + b.makeUintConstant(1 << (instr.bool_constant_index % 32))); + auto cond = b.createBinOp( + instr.condition ? spv::Op::OpINotEqual : spv::Op::OpIEqual, + bool_type_, v, b.makeUintConstant(0)); + b.createConditionalBranch(cond, cf_blocks_[instr.target_address].block, cf_blocks_[instr.dword_index + 1].block); } break; @@ -1473,7 +1496,8 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( case AluScalarOpcode::kAddsPrev: { // dest = src0 + ps - dest = b.createBinOp(spv::Op::OpFAdd, float_type_, sources[0], ps_); + dest = b.createBinOp(spv::Op::OpFAdd, float_type_, sources[0], + b.createLoad(ps_)); } break; case AluScalarOpcode::kCos: { @@ -1636,7 +1660,8 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( case AluScalarOpcode::kMulsPrev: { // dest = src0 * ps - dest = b.createBinOp(spv::Op::OpFMul, float_type_, sources[0], ps_); + dest = b.createBinOp(spv::Op::OpFMul, float_type_, sources[0], + b.createLoad(ps_)); } break; case AluScalarOpcode::kMulsPrev2: { @@ -1644,11 +1669,22 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( } break; case AluScalarOpcode::kRcpc: { - // TODO: dest = src0 != 0.0 ? 
1.0 / src0 : FLT_MAX; + dest = b.createBinOp(spv::Op::OpFDiv, float_type_, + b.makeFloatConstant(1.f), sources[0]); + dest = CreateGlslStd450InstructionCall( + spv::NoPrecision, float_type_, spv::GLSLstd450::kFClamp, + {dest, b.makeFloatConstant(-FLT_MAX), b.makeFloatConstant(FLT_MAX)}); } break; - case AluScalarOpcode::kRcp: case AluScalarOpcode::kRcpf: { + dest = b.createBinOp(spv::Op::OpFDiv, float_type_, + b.makeFloatConstant(1.f), sources[0]); + auto c = b.createUnaryOp(spv::Op::OpIsInf, bool_type_, dest); + dest = b.createTriOp(spv::Op::OpSelect, float_type_, c, + b.makeFloatConstant(0.f), dest); + } break; + + case AluScalarOpcode::kRcp: { // dest = src0 != 0.0 ? 1.0 / src0 : 0.0; auto c = b.createBinOp(spv::Op::OpFOrdEqual, float_type_, sources[0], b.makeFloatConstant(0.f)); @@ -1659,9 +1695,21 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( } break; case AluScalarOpcode::kRsqc: { + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_, + spv::GLSLstd450::kInverseSqrt, + {sources[0]}); + dest = CreateGlslStd450InstructionCall( + spv::NoPrecision, float_type_, spv::GLSLstd450::kFClamp, + {dest, b.makeFloatConstant(-FLT_MAX), b.makeFloatConstant(FLT_MAX)}); } break; case AluScalarOpcode::kRsqf: { + dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_, + spv::GLSLstd450::kInverseSqrt, + {sources[0]}); + auto c = b.createUnaryOp(spv::Op::OpIsInf, bool_type_, dest); + dest = b.createTriOp(spv::Op::OpSelect, float_type_, c, + b.makeFloatConstant(0.f), dest); } break; case AluScalarOpcode::kRsq: { @@ -1817,7 +1865,8 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction( } break; case AluScalarOpcode::kSubsPrev: { - dest = b.createBinOp(spv::Op::OpFSub, float_type_, sources[0], ps_); + dest = b.createBinOp(spv::Op::OpFSub, float_type_, sources[0], + b.createLoad(ps_)); } break; case AluScalarOpcode::kTruncs: { From bd27835a3df64cd1a3f7b3b1937c33bdadbee086 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sun, 22 May 2016 20:03:13 -0500 Subject: [PATCH 61/77] Pipeline Cache: Translate shaders when program cntl register is available --- src/xenia/gpu/vulkan/pipeline_cache.cc | 96 +++++++++++++++----------- src/xenia/gpu/vulkan/pipeline_cache.h | 2 + 2 files changed, 59 insertions(+), 39 deletions(-) diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc index 70054f5e2..b790b7cc1 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/pipeline_cache.cc @@ -157,32 +157,6 @@ VulkanShader* PipelineCache::LoadShader(ShaderType shader_type, host_address, dword_count); shader_map_.insert({data_hash, shader}); - // Perform translation. - // If this fails the shader will be marked as invalid and ignored later. - if (!shader_translator_.Translate(shader)) { - XELOGE("Shader translation failed; marking shader as ignored"); - return shader; - } - - // Prepare the shader for use (creates our VkShaderModule). - // It could still fail at this point. - if (!shader->Prepare()) { - XELOGE("Shader preparation failed; marking shader as ignored"); - return shader; - } - - if (shader->is_valid()) { - XELOGGPU("Generated %s shader at 0x%.8X (%db) - hash %.16" PRIX64 ":\n%s\n", - shader_type == ShaderType::kVertex ? "vertex" : "pixel", - guest_address, dword_count * 4, shader->ucode_data_hash(), - shader->ucode_disassembly().c_str()); - } - - // Dump shader files if desired. 
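// (Translation is being removed from LoadShader because the temporary
// register count now comes from SQ_PROGRAM_CNTL, which is only known at
// draw time; the work moves into TranslateShader() below, invoked from
// UpdateShaderStages on first use of each shader.)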
- if (!FLAGS_dump_shaders.empty()) { - shader->Dump(FLAGS_dump_shaders, "vk"); - } - return shader; } @@ -302,6 +276,37 @@ VkPipeline PipelineCache::GetPipeline(const RenderState* render_state, return pipeline; } +bool PipelineCache::TranslateShader(VulkanShader* shader, + xenos::xe_gpu_program_cntl_t cntl) { + // Perform translation. + // If this fails the shader will be marked as invalid and ignored later. + if (!shader_translator_.Translate(shader, cntl)) { + XELOGE("Shader translation failed; marking shader as ignored"); + return false; + } + + // Prepare the shader for use (creates our VkShaderModule). + // It could still fail at this point. + if (!shader->Prepare()) { + XELOGE("Shader preparation failed; marking shader as ignored"); + return false; + } + + if (shader->is_valid()) { + XELOGGPU("Generated %s shader (%db) - hash %.16" PRIX64 ":\n%s\n", + shader->type() == ShaderType::kVertex ? "vertex" : "pixel", + shader->ucode_dword_count() * 4, shader->ucode_data_hash(), + shader->ucode_disassembly().c_str()); + } + + // Dump shader files if desired. + if (!FLAGS_dump_shaders.empty()) { + shader->Dump(FLAGS_dump_shaders, "vk"); + } + + return shader->is_valid(); +} + void PipelineCache::DumpShaderDisasmNV( const VkGraphicsPipelineCreateInfo& pipeline_info) { // !! HACK !!: This only works on NVidia drivers. Dumps shader disasm. @@ -510,8 +515,6 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, break; } - // window_width_scalar = window_height_scalar = 1; - // Whether each of the viewport settings are enabled. // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf bool vport_xscale_enable = (regs.pa_cl_vte_cntl & (1 << 0)) > 0; @@ -525,10 +528,7 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, vport_yoffset_enable == vport_zoffset_enable); VkViewport viewport_rect; - viewport_rect.x = 0; - viewport_rect.y = 0; - viewport_rect.width = 100; - viewport_rect.height = 100; + std::memset(&viewport_rect, 0, sizeof(VkViewport)); viewport_rect.minDepth = 0; viewport_rect.maxDepth = 1; @@ -655,7 +655,7 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, push_constants.vtx_fmt[3] = vtx_w0_fmt; // Alpha testing -- ALPHAREF, ALPHAFUNC, ALPHATESTENABLE - // Deprecated in Vulkan, implemented in shader. + // Emulated in shader. 
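// The three floats written here line up with the push-constant reads in
// SpirvShaderTranslator::CompleteTranslation(), which pulls member 2,
// components 0/1/2: the enable flag (compared against 1.0f), the Xenos
// compare function (0..7, converted to uint in the shader), and the
// reference value.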
// if(ALPHATESTENABLE && frag_out.a [<=/ALPHAFUNC] ALPHAREF) discard; // ALPHATESTENABLE push_constants.alpha_test[0] = @@ -754,6 +754,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( bool dirty = false; dirty |= SetShadowRegister(®s.pa_su_sc_mode_cntl, XE_GPU_REG_PA_SU_SC_MODE_CNTL); + dirty |= SetShadowRegister(®s.sq_program_cntl, XE_GPU_REG_SQ_PROGRAM_CNTL); dirty |= regs.vertex_shader != vertex_shader; dirty |= regs.pixel_shader != pixel_shader; dirty |= regs.primitive_type != primitive_type; @@ -765,6 +766,21 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( return UpdateStatus::kCompatible; } + xenos::xe_gpu_program_cntl_t sq_program_cntl; + sq_program_cntl.dword_0 = regs.sq_program_cntl; + + if (!vertex_shader->is_translated() && + !TranslateShader(vertex_shader, sq_program_cntl)) { + XELOGE("Failed to translate the vertex shader!"); + return UpdateStatus::kError; + } + + if (!pixel_shader->is_translated() && + !TranslateShader(pixel_shader, sq_program_cntl)) { + XELOGE("Failed to translate the pixel shader!"); + return UpdateStatus::kError; + } + update_shader_stages_stage_count_ = 0; auto& vertex_pipeline_stage = @@ -868,6 +884,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateVertexInputState( case VertexFormat::k_11_11_10: // Converted in-shader. // TODO(DrChat) + assert_always(); // vertex_attrib_descr.format = VK_FORMAT_R32_UINT; vertex_attrib_descr.format = VK_FORMAT_B10G11R11_UFLOAT_PACK32; break; @@ -901,19 +918,19 @@ PipelineCache::UpdateStatus PipelineCache::UpdateVertexInputState( is_signed ? VK_FORMAT_R32G32B32A32_SINT : VK_FORMAT_R32_UINT; break; case VertexFormat::k_32_FLOAT: - assert_true(is_signed); + // assert_true(is_signed); vertex_attrib_descr.format = VK_FORMAT_R32_SFLOAT; break; case VertexFormat::k_32_32_FLOAT: - assert_true(is_signed); + // assert_true(is_signed); vertex_attrib_descr.format = VK_FORMAT_R32G32_SFLOAT; break; case VertexFormat::k_32_32_32_FLOAT: - assert_true(is_signed); + // assert_true(is_signed); vertex_attrib_descr.format = VK_FORMAT_R32G32B32_SFLOAT; break; case VertexFormat::k_32_32_32_32_FLOAT: - assert_true(is_signed); + // assert_true(is_signed); vertex_attrib_descr.format = VK_FORMAT_R32G32B32A32_SFLOAT; break; default: @@ -1060,8 +1077,9 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( // Discard rasterizer output in depth-only mode. // TODO(DrChat): Figure out how to make this work properly. auto enable_mode = static_cast(regs.rb_modecontrol & 0x7); - state_info.rasterizerDiscardEnable = - enable_mode == xenos::ModeControl::kColorDepth ? VK_FALSE : VK_TRUE; + state_info.rasterizerDiscardEnable = VK_FALSE; + // state_info.rasterizerDiscardEnable = + // enable_mode == xenos::ModeControl::kColorDepth ? VK_FALSE : VK_TRUE; // KILL_PIX_POST_EARLY_Z if (regs.pa_sc_viz_query & 0x80) { diff --git a/src/xenia/gpu/vulkan/pipeline_cache.h b/src/xenia/gpu/vulkan/pipeline_cache.h index e5645f638..c2335028f 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.h +++ b/src/xenia/gpu/vulkan/pipeline_cache.h @@ -75,6 +75,7 @@ class PipelineCache { // state. VkPipeline GetPipeline(const RenderState* render_state, uint64_t hash_key); + bool TranslateShader(VulkanShader* shader, xenos::xe_gpu_program_cntl_t cntl); void DumpShaderDisasmNV(const VkGraphicsPipelineCreateInfo& info); // Gets a geometry shader used to emulate the given primitive type. 
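// (TranslateShader() above pairs with Shader::is_translated() from patch
// 59: UpdateShaderStages() rebuilds xe_gpu_program_cntl_t from the
// shadowed SQ_PROGRAM_CNTL register and translates each shader exactly
// once, on the first draw that references it.)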
@@ -157,6 +158,7 @@ class PipelineCache { struct UpdateShaderStagesRegisters { PrimitiveType primitive_type; uint32_t pa_su_sc_mode_cntl; + uint32_t sq_program_cntl; VulkanShader* vertex_shader; VulkanShader* pixel_shader; From 7004f836657773c1a34a2d9adff03992225459d3 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sun, 22 May 2016 20:05:47 -0500 Subject: [PATCH 62/77] CP: Don't check for shader validity here Fix a lousy typo in PrepareTextureSet --- src/xenia/gpu/vulkan/vulkan_command_processor.cc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 17f83f82c..d29c6e8bf 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -95,7 +95,7 @@ void VulkanCommandProcessor::ShutdownContext() { // Free all pools. This must come after all of our caches clean up. command_buffer_pool_.reset(); - // Release queue, if were using an acquired one. + // Release queue, if we were using an acquired one. if (!queue_mutex_) { device_->ReleaseQueue(queue_); queue_ = nullptr; @@ -185,6 +185,7 @@ void VulkanCommandProcessor::CreateSwapImages(VkCommandBuffer setup_buffer, // Transition both images to general layout. VkImageMemoryBarrier barrier; std::memset(&barrier, 0, sizeof(VkImageMemoryBarrier)); + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barrier.srcAccessMask = 0; barrier.dstAccessMask = 0; barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; @@ -268,6 +269,7 @@ void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr, // Insert a barrier so the GPU finishes writing to the image. VkImageMemoryBarrier barrier; std::memset(&barrier, 0, sizeof(VkImageMemoryBarrier)); + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT; barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; @@ -315,10 +317,10 @@ void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr, current_render_state_ = nullptr; } - status = vkEndCommandBuffer(current_command_buffer_); - CheckResult(status, "vkEndCommandBuffer"); status = vkEndCommandBuffer(current_setup_buffer_); CheckResult(status, "vkEndCommandBuffer"); + status = vkEndCommandBuffer(current_command_buffer_); + CheckResult(status, "vkEndCommandBuffer"); // TODO(DrChat): If the setup buffer is empty, don't bother queueing it up. submit_buffers.push_back(current_setup_buffer_); @@ -417,7 +419,7 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, // We need them to do just about anything so validate here. auto vertex_shader = static_cast(active_vertex_shader()); auto pixel_shader = static_cast(active_pixel_shader()); - if (!vertex_shader || !vertex_shader->is_valid()) { + if (!vertex_shader) { // Always need a vertex shader. return true; } @@ -426,7 +428,7 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, // Use a dummy pixel shader when required. // TODO(benvanik): dummy pixel shader. assert_not_null(pixel_shader); - } else if (!pixel_shader || !pixel_shader->is_valid()) { + } else if (!pixel_shader) { // Need a pixel shader in normal color mode. 
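// (is_valid() is no longer checked here: shaders are now translated
// lazily by the pipeline cache, so at this point a shader may simply not
// have been translated yet; validity failures are reported from
// TranslateShader instead.)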
return true; } @@ -742,7 +744,7 @@ bool VulkanCommandProcessor::PopulateSamplers(VkCommandBuffer command_buffer, #endif // FINE_GRAINED_DRAW_SCOPES auto descriptor_set = texture_cache_->PrepareTextureSet( - command_buffer, current_batch_fence_, vertex_shader->texture_bindings(), + setup_buffer, current_batch_fence_, vertex_shader->texture_bindings(), pixel_shader->texture_bindings()); if (!descriptor_set) { // Unable to bind set. From c6e905db2fb1736a55fca6996246d49ce4820c12 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sun, 22 May 2016 22:14:45 -0500 Subject: [PATCH 63/77] Fix a memory leak in fenced pools. --- src/xenia/ui/vulkan/fenced_pools.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xenia/ui/vulkan/fenced_pools.h b/src/xenia/ui/vulkan/fenced_pools.h index a481edf10..d62ad7452 100644 --- a/src/xenia/ui/vulkan/fenced_pools.h +++ b/src/xenia/ui/vulkan/fenced_pools.h @@ -158,6 +158,7 @@ class BaseFencedPool { } if (pending_batch_list_tail_) { pending_batch_list_tail_->next = batch; + pending_batch_list_tail_ = batch; } else { pending_batch_list_tail_ = batch; } From 6e21d882501fef826d0f96fb993b449e150d7221 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Mon, 23 May 2016 09:58:46 -0500 Subject: [PATCH 64/77] Fixup circular buffers for full rotation --- src/xenia/ui/vulkan/circular_buffer.cc | 42 ++++++-------------------- 1 file changed, 10 insertions(+), 32 deletions(-) diff --git a/src/xenia/ui/vulkan/circular_buffer.cc b/src/xenia/ui/vulkan/circular_buffer.cc index 404f7a503..883e4d98d 100644 --- a/src/xenia/ui/vulkan/circular_buffer.cc +++ b/src/xenia/ui/vulkan/circular_buffer.cc @@ -103,23 +103,22 @@ bool CircularBuffer::CanAcquire(VkDeviceSize length) { length = xe::round_up(length, alignment_); if (allocations_.empty()) { // Read head has caught up to write head (entire buffer available for write) - assert(read_head_ == write_head_); - return capacity_ > length; + return capacity_ >= length; } else if (write_head_ < read_head_) { // Write head wrapped around and is behind read head. // | write |---- read ----| - return (read_head_ - write_head_) > length; - } else { + return (read_head_ - write_head_) >= length; + } else if (write_head_ > read_head_) { // Read head behind write head. // 1. Check if there's enough room from write -> capacity // | |---- read ----| write | - if ((capacity_ - write_head_) > length) { + if ((capacity_ - write_head_) >= length) { return true; } // 2. Check if there's enough room from 0 -> read // | write |---- read ----| | - if ((read_head_) > length) { + if ((read_head_ - 0) >= length) { return true; } } @@ -129,29 +128,13 @@ bool CircularBuffer::CanAcquire(VkDeviceSize length) { CircularBuffer::Allocation* CircularBuffer::Acquire( VkDeviceSize length, std::shared_ptr fence) { - if (!CanAcquire(length)) { + VkDeviceSize aligned_length = xe::round_up(length, alignment_); + if (!CanAcquire(aligned_length)) { return nullptr; } - VkDeviceSize aligned_length = xe::round_up(length, alignment_); assert_true(write_head_ % alignment_ == 0); - if (allocations_.empty()) { - // Entire buffer available. 
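// (This special case is being removed as redundant: with the length
// aligned before CanAcquire() and the comparisons relaxed from > to >=,
// an empty allocation list simply falls through to the
// write_head_ >= read_head_ path, and since Scavenge() no longer resets
// the heads on drain, the buffer can wrap around indefinitely.)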
- assert(read_head_ == write_head_); - assert(capacity_ > aligned_length); - - write_head_ = aligned_length; - - auto alloc = new Allocation(); - alloc->host_ptr = host_base_ + 0; - alloc->gpu_memory = gpu_memory_; - alloc->offset = gpu_base_ + 0; - alloc->length = length; - alloc->aligned_length = aligned_length; - alloc->fence = fence; - allocations_.push_back(alloc); - return alloc; - } else if (write_head_ < read_head_) { + if (write_head_ < read_head_) { // Write head behind read head. assert_true(read_head_ - write_head_ >= aligned_length); @@ -167,7 +150,7 @@ CircularBuffer::Allocation* CircularBuffer::Acquire( return alloc; } else { - // Write head after read head + // Write head equal to/after read head if (capacity_ - write_head_ >= aligned_length) { // Free space from write -> capacity auto alloc = new Allocation(); @@ -181,7 +164,7 @@ CircularBuffer::Allocation* CircularBuffer::Acquire( allocations_.push_back(alloc); return alloc; - } else if ((read_head_ - 0) > aligned_length) { + } else if ((read_head_ - 0) >= aligned_length) { // Free space from begin -> read auto alloc = new Allocation(); alloc->host_ptr = host_base_ + write_head_; @@ -236,11 +219,6 @@ void CircularBuffer::Scavenge() { delete *it; it = allocations_.erase(it); } - - if (allocations_.empty()) { - // Reset R/W heads. - read_head_ = write_head_ = 0; - } } } // namespace vulkan From 729152a58ba116fc7669275f8486ca9f2a2ee81d Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Mon, 23 May 2016 09:59:37 -0500 Subject: [PATCH 65/77] VK: Enable independentBlend feature --- src/xenia/ui/vulkan/vulkan_device.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xenia/ui/vulkan/vulkan_device.cc b/src/xenia/ui/vulkan/vulkan_device.cc index 42077ca82..0b3a6c2ff 100644 --- a/src/xenia/ui/vulkan/vulkan_device.cc +++ b/src/xenia/ui/vulkan/vulkan_device.cc @@ -95,6 +95,7 @@ bool VulkanDevice::Initialize(DeviceInfo device_info) { ENABLE_AND_EXPECT(depthClamp); ENABLE_AND_EXPECT(alphaToOne); ENABLE_AND_EXPECT(multiViewport); + ENABLE_AND_EXPECT(independentBlend); // TODO(benvanik): add other features. if (any_features_missing) { XELOGE( From 5f764730ae8d6dcc2012ac38313a7c3edb2b6a0c Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Mon, 23 May 2016 13:16:13 -0500 Subject: [PATCH 66/77] Vulkan CP: Override frame traces if renderdoc is attached --- src/xenia/gpu/command_processor.h | 6 ++--- .../gpu/vulkan/vulkan_command_processor.cc | 23 ++++++++++++------- .../gpu/vulkan/vulkan_command_processor.h | 2 ++ 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h index f2fbb6c54..c2784480b 100644 --- a/src/xenia/gpu/command_processor.h +++ b/src/xenia/gpu/command_processor.h @@ -84,9 +84,9 @@ class CommandProcessor { swap_request_handler_ = fn; } - void RequestFrameTrace(const std::wstring& root_path); - void BeginTracing(const std::wstring& root_path); - void EndTracing(); + virtual void RequestFrameTrace(const std::wstring& root_path); + virtual void BeginTracing(const std::wstring& root_path); + virtual void EndTracing(); void InitializeRingBuffer(uint32_t ptr, uint32_t page_count); void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size); diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index d29c6e8bf..a213ea6e5 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -37,9 +37,22 @@ VulkanCommandProcessor::VulkanCommandProcessor( VulkanCommandProcessor::~VulkanCommandProcessor() = default; +void VulkanCommandProcessor::RequestFrameTrace(const std::wstring& root_path) { + // Override traces if renderdoc is attached. + if (device_->is_renderdoc_attached()) { + trace_requested_ = true; + return; + } + + return CommandProcessor::RequestFrameTrace(root_path); +} + void VulkanCommandProcessor::ClearCaches() { CommandProcessor::ClearCaches(); + auto status = vkQueueWaitIdle(queue_); + CheckResult(status, "vkQueueWaitIdle"); + buffer_cache_->ClearCache(); pipeline_cache_->ClearCache(); render_cache_->ClearCache(); @@ -349,12 +362,6 @@ void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr, if (device_->is_renderdoc_attached() && capturing_) { device_->EndRenderDocFrameCapture(); capturing_ = false; - - // HACK(DrChat): Used b/c I disabled trace saving code in the CP. - // Remove later. - if (!trace_writer_.is_open()) { - trace_state_ = TraceState::kDisabled; - } } if (queue_mutex_) { queue_mutex_->unlock(); @@ -459,13 +466,13 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, static uint32_t frame = 0; if (device_->is_renderdoc_attached() && !capturing_ && - (FLAGS_vulkan_renderdoc_capture_all || - trace_state_ == TraceState::kSingleFrame)) { + (FLAGS_vulkan_renderdoc_capture_all || trace_requested_)) { if (queue_mutex_) { queue_mutex_->lock(); } capturing_ = true; + trace_requested_ = false; device_->BeginRenderDocFrameCapture(); if (queue_mutex_) { diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 4a7788e09..f58e2319b 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -49,6 +49,7 @@ class VulkanCommandProcessor : public CommandProcessor { kernel::KernelState* kernel_state); ~VulkanCommandProcessor() override; + virtual void RequestFrameTrace(const std::wstring& root_path) override; void ClearCaches() override; RenderCache* render_cache() { return render_cache_.get(); } @@ -103,6 +104,7 @@ class VulkanCommandProcessor : public CommandProcessor { // Last copy base address, for debugging only. 
uint32_t last_copy_base_ = 0; bool capturing_ = false; + bool trace_requested_ = false; std::unique_ptr buffer_cache_; std::unique_ptr pipeline_cache_; From ad83a1994dd5e0e37c6166761db1f21cb2cee7ab Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Tue, 24 May 2016 12:53:25 -0500 Subject: [PATCH 67/77] VK: Remove alphaToOne feature requirement --- src/xenia/ui/vulkan/vulkan_device.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/xenia/ui/vulkan/vulkan_device.cc b/src/xenia/ui/vulkan/vulkan_device.cc index 0b3a6c2ff..7b1dc7f8d 100644 --- a/src/xenia/ui/vulkan/vulkan_device.cc +++ b/src/xenia/ui/vulkan/vulkan_device.cc @@ -93,7 +93,6 @@ bool VulkanDevice::Initialize(DeviceInfo device_info) { } ENABLE_AND_EXPECT(geometryShader); ENABLE_AND_EXPECT(depthClamp); - ENABLE_AND_EXPECT(alphaToOne); ENABLE_AND_EXPECT(multiViewport); ENABLE_AND_EXPECT(independentBlend); // TODO(benvanik): add other features. From 9d1e66ab16ac4cff94027c549fb57e3ebb706dc3 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Tue, 24 May 2016 21:58:02 -0500 Subject: [PATCH 68/77] Don't write to color targets in depth-only mode. --- src/xenia/gpu/vulkan/pipeline_cache.cc | 16 ++++++---------- src/xenia/gpu/vulkan/pipeline_cache.h | 2 +- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc index b790b7cc1..eecad03d9 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/pipeline_cache.cc @@ -388,7 +388,7 @@ void PipelineCache::DumpShaderDisasmNV( disasm_fp = std::string("Shader disassembly not available."); } - XELOGI("%s\n=====================================\n%s", disasm_vp.c_str(), + XELOGI("%s\n=====================================\n%s\n", disasm_vp.c_str(), disasm_fp.c_str()); } @@ -1060,7 +1060,6 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( dirty |= SetShadowRegister(®s.pa_sc_viz_query, XE_GPU_REG_PA_SC_VIZ_QUERY); dirty |= SetShadowRegister(®s.multi_prim_ib_reset_index, XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX); - dirty |= SetShadowRegister(®s.rb_modecontrol, XE_GPU_REG_RB_MODECONTROL); regs.primitive_type = primitive_type; XXH64_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { @@ -1074,13 +1073,6 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( // TODO(benvanik): right setting? state_info.depthClampEnable = VK_FALSE; - // Discard rasterizer output in depth-only mode. - // TODO(DrChat): Figure out how to make this work properly. - auto enable_mode = static_cast(regs.rb_modecontrol & 0x7); - state_info.rasterizerDiscardEnable = VK_FALSE; - // state_info.rasterizerDiscardEnable = - // enable_mode == xenos::ModeControl::kColorDepth ? 
VK_FALSE : VK_TRUE; - // KILL_PIX_POST_EARLY_Z if (regs.pa_sc_viz_query & 0x80) { state_info.rasterizerDiscardEnable = VK_TRUE; @@ -1298,6 +1290,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateColorBlendState() { SetShadowRegister(®s.rb_blendcontrol[2], XE_GPU_REG_RB_BLENDCONTROL_2); dirty |= SetShadowRegister(®s.rb_blendcontrol[3], XE_GPU_REG_RB_BLENDCONTROL_3); + dirty |= SetShadowRegister(®s.rb_modecontrol, XE_GPU_REG_RB_MODECONTROL); XXH64_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; @@ -1310,6 +1303,8 @@ PipelineCache::UpdateStatus PipelineCache::UpdateColorBlendState() { state_info.logicOpEnable = VK_FALSE; state_info.logicOp = VK_LOGIC_OP_NO_OP; + auto enable_mode = static_cast(regs.rb_modecontrol & 0x7); + static const VkBlendFactor kBlendFactorMap[] = { /* 0 */ VK_BLEND_FACTOR_ZERO, /* 1 */ VK_BLEND_FACTOR_ONE, @@ -1362,7 +1357,8 @@ PipelineCache::UpdateStatus PipelineCache::UpdateColorBlendState() { // A2XX_RB_COLOR_MASK_WRITE_* == D3DRS_COLORWRITEENABLE // Lines up with VkColorComponentFlagBits, where R=bit 1, G=bit 2, etc.. uint32_t write_mask = (regs.rb_color_mask >> (i * 4)) & 0xF; - attachment_state.colorWriteMask = write_mask; + attachment_state.colorWriteMask = + enable_mode == xenos::ModeControl::kColorDepth ? write_mask : 0; } state_info.attachmentCount = 4; diff --git a/src/xenia/gpu/vulkan/pipeline_cache.h b/src/xenia/gpu/vulkan/pipeline_cache.h index c2335028f..49144f50f 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.h +++ b/src/xenia/gpu/vulkan/pipeline_cache.h @@ -216,7 +216,6 @@ class PipelineCache { uint32_t pa_sc_screen_scissor_br; uint32_t pa_sc_viz_query; uint32_t multi_prim_ib_reset_index; - uint32_t rb_modecontrol; UpdateRasterizationStateRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } @@ -246,6 +245,7 @@ class PipelineCache { uint32_t rb_colorcontrol; uint32_t rb_color_mask; uint32_t rb_blendcontrol[4]; + uint32_t rb_modecontrol; UpdateColorBlendStateRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } From a187a4931a966ffbb80a23c63ff39c43f15264b3 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Wed, 25 May 2016 13:14:03 -0500 Subject: [PATCH 69/77] Whoops - fix setting the wrong host base for some graphics allocations. --- src/xenia/ui/vulkan/circular_buffer.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/xenia/ui/vulkan/circular_buffer.cc b/src/xenia/ui/vulkan/circular_buffer.cc index 883e4d98d..94d2996ce 100644 --- a/src/xenia/ui/vulkan/circular_buffer.cc +++ b/src/xenia/ui/vulkan/circular_buffer.cc @@ -103,6 +103,7 @@ bool CircularBuffer::CanAcquire(VkDeviceSize length) { length = xe::round_up(length, alignment_); if (allocations_.empty()) { // Read head has caught up to write head (entire buffer available for write) + assert_true(read_head_ == write_head_); return capacity_ >= length; } else if (write_head_ < read_head_) { // Write head wrapped around and is behind read head. @@ -167,7 +168,7 @@ CircularBuffer::Allocation* CircularBuffer::Acquire( } else if ((read_head_ - 0) >= aligned_length) { // Free space from begin -> read auto alloc = new Allocation(); - alloc->host_ptr = host_base_ + write_head_; + alloc->host_ptr = host_base_ + 0; alloc->gpu_memory = gpu_memory_; alloc->offset = gpu_base_ + 0; alloc->length = length; From 008167fa6670d0a4851ca8e11483342446d63bcd Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Wed, 25 May 2016 13:49:36 -0500 Subject: [PATCH 70/77] VK: Enable full rotation / Set render_state to nullptr on failure / Fix format info in texture resolves --- src/xenia/gpu/vulkan/vulkan_command_processor.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index a213ea6e5..9c8e268a5 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -370,10 +370,6 @@ void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr, command_buffer_pool_->EndBatch(current_batch_fence_); - // TODO(DrChat): Remove this. - VkFence fences[] = {*current_batch_fence_}; - vkWaitForFences(*device_, 1, fences, true, -1); - // Scavenging. { #if FINE_GRAINED_DRAW_SCOPES @@ -525,6 +521,7 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, current_command_buffer_ = nullptr; current_setup_buffer_ = nullptr; current_batch_fence_ = nullptr; + current_render_state_ = nullptr; return false; } pipeline_cache_->SetDynamicState(command_buffer, started_command_buffer); @@ -536,6 +533,7 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, current_command_buffer_ = nullptr; current_setup_buffer_ = nullptr; current_batch_fence_ = nullptr; + current_render_state_ = nullptr; return false; } @@ -546,6 +544,7 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, current_command_buffer_ = nullptr; current_setup_buffer_ = nullptr; current_batch_fence_ = nullptr; + current_render_state_ = nullptr; return false; } @@ -556,6 +555,7 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, current_command_buffer_ = nullptr; current_setup_buffer_ = nullptr; current_batch_fence_ = nullptr; + current_render_state_ = nullptr; return false; } @@ -569,6 +569,7 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, current_command_buffer_ = nullptr; current_setup_buffer_ = nullptr; current_batch_fence_ = nullptr; + current_render_state_ = nullptr; return false; } @@ -924,6 +925,8 @@ bool VulkanCommandProcessor::IssueCopy() { tex_info.height = dest_logical_height - 1; tex_info.dimension = gpu::Dimension::k2D; tex_info.input_length = copy_dest_pitch * copy_dest_height * 4; + tex_info.format_info = + FormatInfo::Get(uint32_t(ColorFormatToTextureFormat(copy_dest_format))); tex_info.size_2d.logical_width = dest_logical_width; tex_info.size_2d.logical_height = dest_logical_height; tex_info.size_2d.block_width = dest_block_width; From 861141721992b5c54b1a6ce5510020066cc83e29 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Wed, 25 May 2016 17:45:38 -0500 Subject: [PATCH 71/77] Initialize rasterDiscardEnable to VK_FALSE --- src/xenia/gpu/vulkan/pipeline_cache.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc index eecad03d9..e80cb4675 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/pipeline_cache.cc @@ -1072,6 +1072,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( // TODO(benvanik): right setting? state_info.depthClampEnable = VK_FALSE; + state_info.rasterizerDiscardEnable = VK_FALSE; // KILL_PIX_POST_EARLY_Z if (regs.pa_sc_viz_query & 0x80) { From c85756981b381308b745e849a40325d27738eb32 Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Wed, 25 May 2016 19:49:56 -0500 Subject: [PATCH 72/77] TextureCache: Fix a few null pointer bugs Ordering of in-flight descriptor sets Change staging buffer size Free all samplers on exit --- src/xenia/gpu/vulkan/texture_cache.cc | 82 ++++++++++++++------------- src/xenia/gpu/vulkan/texture_cache.h | 4 +- 2 files changed, 45 insertions(+), 41 deletions(-) diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index ee82cb74a..0108f6100 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -25,6 +25,7 @@ namespace vulkan { using xe::ui::vulkan::CheckResult; constexpr uint32_t kMaxTextureSamplers = 32; +constexpr VkDeviceSize kStagingBufferSize = 64 * 1024 * 1024; struct TextureConfig { TextureFormat guest_format; @@ -85,9 +86,9 @@ static const TextureConfig texture_configs[64] = { // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf {TextureFormat::k_DXN, VK_FORMAT_BC5_UNORM_BLOCK}, // ? {TextureFormat::k_8_8_8_8_AS_16_16_16_16, VK_FORMAT_R8G8B8A8_UNORM}, - {TextureFormat::k_DXT1_AS_16_16_16_16, VK_FORMAT_BC1_RGB_SRGB_BLOCK}, - {TextureFormat::k_DXT2_3_AS_16_16_16_16, VK_FORMAT_BC2_SRGB_BLOCK}, - {TextureFormat::k_DXT4_5_AS_16_16_16_16, VK_FORMAT_BC3_SRGB_BLOCK}, + {TextureFormat::k_DXT1_AS_16_16_16_16, VK_FORMAT_BC1_RGB_UNORM_BLOCK}, + {TextureFormat::k_DXT2_3_AS_16_16_16_16, VK_FORMAT_BC2_UNORM_BLOCK}, + {TextureFormat::k_DXT4_5_AS_16_16_16_16, VK_FORMAT_BC3_UNORM_BLOCK}, {TextureFormat::k_2_10_10_10_AS_16_16_16_16, VK_FORMAT_A2R10G10B10_UNORM_PACK32}, {TextureFormat::k_10_11_11_AS_16_16_16_16, @@ -151,28 +152,23 @@ TextureCache::TextureCache(Memory* memory, RegisterFile* register_file, nullptr, &texture_descriptor_set_layout_); CheckResult(err, "vkCreateDescriptorSetLayout"); - int width = 4096; - int height = 4096; - if (!staging_buffer_.Initialize(width * height * 4, + if (!staging_buffer_.Initialize(kStagingBufferSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT)) { assert_always(); } - // Upload a grid into the staging buffer. - auto gpu_data = reinterpret_cast(staging_buffer_.host_base()); - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - gpu_data[y * width + x] = - ((y % 32 < 16) ^ (x % 32 >= 16)) ? 0xFF0000FF : 0xFFFFFFFF; - } - } - invalidated_textures_sets_[0].reserve(64); invalidated_textures_sets_[1].reserve(64); invalidated_textures_ = &invalidated_textures_sets_[0]; } TextureCache::~TextureCache() { + for (auto it = samplers_.begin(); it != samplers_.end(); ++it) { + vkDestroySampler(*device_, it->second->sampler, nullptr); + delete it->second; + } + samplers_.clear(); + vkDestroyDescriptorSetLayout(*device_, texture_descriptor_set_layout_, nullptr); vkDestroyDescriptorPool(*device_, descriptor_pool_, nullptr); @@ -202,15 +198,11 @@ TextureCache::Texture* TextureCache::AllocateTexture( return nullptr; } - VkFormat format = VK_FORMAT_UNDEFINED; - if (texture_info.format_info) { - auto& config = texture_configs[int(texture_info.format_info->format)]; - format = config.host_format != VK_FORMAT_UNDEFINED - ? config.host_format - : VK_FORMAT_R8G8B8A8_UNORM; - } else { - format = VK_FORMAT_R8G8B8A8_UNORM; - } + assert_not_null(texture_info.format_info); + auto& config = texture_configs[int(texture_info.format_info->format)]; + VkFormat format = config.host_format != VK_FORMAT_UNDEFINED + ? 
config.host_format : VK_FORMAT_R8G8B8A8_UNORM; VkFormatProperties props; uint32_t required_flags = VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | @@ -298,7 +290,8 @@ TextureCache::Texture* TextureCache::AllocateTexture( } bool TextureCache::FreeTexture(Texture* texture) { - if (texture->in_flight_fence->status() != VK_SUCCESS) { + if (texture->in_flight_fence && + texture->in_flight_fence->status() != VK_SUCCESS) { // Texture still in flight. return false; } @@ -388,7 +381,10 @@ TextureCache::Texture* TextureCache::Demand( texture->is_full_texture = true; texture->texture_info = texture_info; - memory_->CancelAccessWatch(texture->access_watch_handle); + if (texture->access_watch_handle) { + memory_->CancelAccessWatch(texture->access_watch_handle); + } + texture->access_watch_handle = memory_->AddPhysicalAccessWatch( texture_info.guest_address, texture_info.input_length, cpu::MMIOHandler::kWatchWrite, @@ -443,7 +439,6 @@ TextureCache::Texture* TextureCache::Demand( } if (!uploaded) { - // TODO: Destroy the texture. FreeTexture(texture); return nullptr; } @@ -777,7 +772,10 @@ bool TextureCache::UploadTexture2D( VkCommandBuffer command_buffer, std::shared_ptr<ui::vulkan::Fence> completion_fence, Texture* dest, TextureInfo src) { +#if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + assert_true(src.dimension == Dimension::k2D); if (!staging_buffer_.CanAcquire(src.input_length)) { @@ -959,6 +957,10 @@ VkDescriptorSet TextureCache::PrepareTextureSet( vkAllocateDescriptorSets(*device_, &set_alloc_info, &descriptor_set); CheckResult(err, "vkAllocateDescriptorSets"); + if (err != VK_SUCCESS) { + return nullptr; + } + // Write all updated descriptors. // TODO(benvanik): optimize? split into multiple sets? set per type? // First: Reorganize and pool image update infos. @@ -1029,7 +1031,7 @@ VkDescriptorSet TextureCache::PrepareTextureSet( descriptor_writes.data(), 0, nullptr); } - in_flight_sets_[descriptor_set] = completion_fence; + in_flight_sets_.push_back({descriptor_set, completion_fence}); return descriptor_set; } @@ -1056,6 +1058,10 @@ bool TextureCache::SetupTextureBinding( VkCommandBuffer command_buffer, std::shared_ptr<ui::vulkan::Fence> completion_fence, UpdateSetInfo* update_set_info, const Shader::TextureBinding& binding) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + auto& regs = *register_file_; int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + binding.fetch_constant * 6; auto group = @@ -1106,7 +1112,7 @@ bool TextureCache::SetupTextureBinding( } void TextureCache::ClearCache() { - // TODO(benvanik): caching. + // TODO(DrChat): Nuke everything. } void TextureCache::Scavenge() { @@ -1119,7 +1125,9 @@ void TextureCache::Scavenge() { continue; } - ++it; + // We've hit an item whose fence is still pending; sets are appended in + // submission order, so everything after this one is still in flight too. + break; } staging_buffer_.Scavenge(); @@ -1148,25 +1156,21 @@ void TextureCache::Scavenge() { if (!invalidated_textures.empty()) { for (auto it = invalidated_textures.begin(); it != invalidated_textures.end(); ++it) { - if (!FreeTexture(*it)) { - // Texture wasn't deleted because it's still in use. - pending_delete_textures_.push_back(*it); - } - + pending_delete_textures_.push_back(*it); textures_.erase((*it)->texture_info.hash()); } invalidated_textures.clear(); } + // Invalidated resolve textures.
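// The early break above is only sound because in_flight_sets_ becomes an
// ordered list (see the texture_cache.h hunk just below): sets are appended
// in submission order, so the first still-pending fence means everything
// after it is pending too. The pattern in miniature (hypothetical names;
// signaled() is an assumed accessor):
#include <list>
#include <utility>

template <typename Set, typename FencePtr>
void ScavengeInFlight(std::list<std::pair<Set, FencePtr>>& in_flight) {
  while (!in_flight.empty() && in_flight.front().second->signaled()) {
    // Fence done: this set can be recycled.
    in_flight.pop_front();
  }
  // Anything left was submitted later and is still in flight.
}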
invalidated_resolve_textures_mutex_.lock(); if (!invalidated_resolve_textures_.empty()) { for (auto it = invalidated_resolve_textures_.begin(); it != invalidated_resolve_textures_.end(); ++it) { - if (!FreeTexture(*it)) { - // Texture wasn't deleted because it's still in use. - pending_delete_textures_.push_back(*it); - } + pending_delete_textures_.push_back(*it); + resolve_textures_.erase( + std::find(resolve_textures_.begin(), resolve_textures_.end(), *it)); } invalidated_resolve_textures_.clear(); diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h index a78be6ed6..8f47f33df 100644 --- a/src/xenia/gpu/vulkan/texture_cache.h +++ b/src/xenia/gpu/vulkan/texture_cache.h @@ -171,14 +171,14 @@ class TextureCache { VkDescriptorPool descriptor_pool_ = nullptr; VkDescriptorSetLayout texture_descriptor_set_layout_ = nullptr; - std::unordered_map<VkDescriptorSet, std::shared_ptr<ui::vulkan::Fence>> in_flight_sets_; + std::list<std::pair<VkDescriptorSet, std::shared_ptr<ui::vulkan::Fence>>> in_flight_sets_; ui::vulkan::CircularBuffer staging_buffer_; std::unordered_map<uint64_t, Texture*> textures_; std::unordered_map<uint64_t, Sampler*> samplers_; std::vector<Texture*> resolve_textures_; - std::vector<Texture*> pending_delete_textures_; + std::list<Texture*> pending_delete_textures_; std::mutex invalidated_textures_mutex_; std::vector<Texture*>* invalidated_textures_; From f8d9472872027a62417f81ea359d030744b66618 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Thu, 26 May 2016 14:46:18 -0500 Subject: [PATCH 73/77] TextureCache: Fix trying to erase a resolve texture that isn't in the resolve textures list. --- src/xenia/gpu/vulkan/texture_cache.cc | 30 ++++++--------------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index 0108f6100..a6f6dab17 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -791,9 +791,6 @@ bool TextureCache::UploadTexture2D( auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence); assert_not_null(alloc); - // TODO: Support compression. - // assert_false(src.is_compressed()); - // Upload texture into GPU memory. // TODO: If the GPU supports it, we can submit a compute batch to convert the // texture and copy it to its destination. Otherwise, fallback to conversion @@ -856,25 +853,6 @@ bool TextureCache::UploadTexture2D( staging_buffer_.Flush(alloc); - // Insert a memory barrier into the command buffer to ensure the upload has - // finished before we copy it into the destination texture. - /* - VkBufferMemoryBarrier upload_barrier = { - VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, - NULL, - VK_ACCESS_HOST_WRITE_BIT, - VK_ACCESS_TRANSFER_READ_BIT, - VK_QUEUE_FAMILY_IGNORED, - VK_QUEUE_FAMILY_IGNORED, - staging_buffer_.gpu_buffer(), - alloc->offset, - alloc->aligned_length, - }; - vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, - &upload_barrier, 0, nullptr); - //*/ - // Transition the texture into a transfer destination layout.
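// Once the barrier below has moved the image into a transfer-destination
// layout, the staging-buffer copy it guards is recorded. Roughly (a sketch,
// not Xenia's exact code; parameter names are illustrative):
#include <vulkan/vulkan.h>

void RecordUploadCopy(VkCommandBuffer cmd, VkBuffer staging, VkImage image,
                      VkDeviceSize staging_offset, uint32_t width,
                      uint32_t height) {
  VkBufferImageCopy region = {};
  region.bufferOffset = staging_offset;  // where Acquire() placed the texels
  region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
  region.imageSubresource.layerCount = 1;
  region.imageExtent = {width, height, 1};
  vkCmdCopyBufferToImage(cmd, staging, image,
                         VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region);
}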
VkImageMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; @@ -1169,8 +1147,12 @@ void TextureCache::Scavenge() { for (auto it = invalidated_resolve_textures_.begin(); it != invalidated_resolve_textures_.end(); ++it) { pending_delete_textures_.push_back(*it); - resolve_textures_.erase( - std::find(resolve_textures_.begin(), resolve_textures_.end(), *it)); + + auto tex = + std::find(resolve_textures_.begin(), resolve_textures_.end(), *it); + if (tex != resolve_textures_.end()) { + resolve_textures_.erase(tex); + } } invalidated_resolve_textures_.clear(); From fd37112db84c7b639d06d56f9cfccad4044236df Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sat, 28 May 2016 10:25:18 -0500 Subject: [PATCH 74/77] VK Immediate Drawer: Properly transition texture layouts Support wrapping of textures not created here --- .../ui/vulkan/vulkan_immediate_drawer.cc | 97 +++++++++++++++++-- src/xenia/ui/vulkan/vulkan_immediate_drawer.h | 4 + 2 files changed, 92 insertions(+), 9 deletions(-) diff --git a/src/xenia/ui/vulkan/vulkan_immediate_drawer.cc b/src/xenia/ui/vulkan/vulkan_immediate_drawer.cc index a68b44c5f..49b0cbc4d 100644 --- a/src/xenia/ui/vulkan/vulkan_immediate_drawer.cc +++ b/src/xenia/ui/vulkan/vulkan_immediate_drawer.cc @@ -136,6 +136,46 @@ class LightweightCircularBuffer { class VulkanImmediateTexture : public ImmediateTexture { public: + VulkanImmediateTexture(VulkanDevice* device, VkDescriptorPool descriptor_pool, + VkDescriptorSetLayout descriptor_set_layout, + VkImageView image_view, VkSampler sampler, + uint32_t width, uint32_t height) + : ImmediateTexture(width, height), + device_(*device), + descriptor_pool_(descriptor_pool), + image_view_(image_view), + sampler_(sampler) { + handle = reinterpret_cast<uintptr_t>(this); + + // Create descriptor set used just for this texture. + // It never changes, so we can reuse it and not worry about updates. + VkDescriptorSetAllocateInfo set_alloc_info; + set_alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + set_alloc_info.pNext = nullptr; + set_alloc_info.descriptorPool = descriptor_pool_; + set_alloc_info.descriptorSetCount = 1; + set_alloc_info.pSetLayouts = &descriptor_set_layout; + auto err = + vkAllocateDescriptorSets(device_, &set_alloc_info, &descriptor_set_); + CheckResult(err, "vkAllocateDescriptorSets"); + + // Initialize descriptor with our texture.
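// (GENERAL is the layout recorded in the descriptor below because Draw()
// transitions wrapped images to VK_IMAGE_LAYOUT_GENERAL before sampling;
// keeping the two in sync avoids descriptor/layout validation errors.)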
+ VkDescriptorImageInfo texture_info; + texture_info.sampler = sampler_; + texture_info.imageView = image_view_; + texture_info.imageLayout = VK_IMAGE_LAYOUT_GENERAL; + VkWriteDescriptorSet descriptor_write; + descriptor_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptor_write.pNext = nullptr; + descriptor_write.dstSet = descriptor_set_; + descriptor_write.dstBinding = 0; + descriptor_write.dstArrayElement = 0; + descriptor_write.descriptorCount = 1; + descriptor_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + descriptor_write.pImageInfo = &texture_info; + vkUpdateDescriptorSets(device_, 1, &descriptor_write, 0, nullptr); + } + VulkanImmediateTexture(VulkanDevice* device, VkDescriptorPool descriptor_pool, VkDescriptorSetLayout descriptor_set_layout, VkSampler sampler, uint32_t width, uint32_t height) @@ -161,7 +201,7 @@ class VulkanImmediateTexture : public ImmediateTexture { image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; image_info.queueFamilyIndexCount = 0; image_info.pQueueFamilyIndices = nullptr; - image_info.initialLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + image_info.initialLayout = VK_IMAGE_LAYOUT_PREINITIALIZED; auto err = vkCreateImage(device_, &image_info, nullptr, &image_); CheckResult(err, "vkCreateImage"); @@ -221,9 +261,12 @@ class VulkanImmediateTexture : public ImmediateTexture { ~VulkanImmediateTexture() override { vkFreeDescriptorSets(device_, descriptor_pool_, 1, &descriptor_set_); - vkDestroyImageView(device_, image_view_, nullptr); - vkDestroyImage(device_, image_, nullptr); - vkFreeMemory(device_, device_memory_, nullptr); + + if (device_memory_) { + vkDestroyImageView(device_, image_view_, nullptr); + vkDestroyImage(device_, image_, nullptr); + vkFreeMemory(device_, device_memory_, nullptr); + } } void Upload(const uint8_t* src_data) { @@ -238,25 +281,49 @@ class VulkanImmediateTexture : public ImmediateTexture { vkGetImageSubresourceLayout(device_, image_, &subresource, &layout); // Map memory for upload. - void* gpu_data = nullptr; - auto err = - vkMapMemory(device_, device_memory_, 0, layout.size, 0, &gpu_data); + uint8_t* gpu_data = nullptr; + auto err = vkMapMemory(device_, device_memory_, 0, layout.size, 0, + reinterpret_cast(&gpu_data)); CheckResult(err, "vkMapMemory"); // Copy the entire texture, hoping its layout matches what we expect. - std::memcpy(gpu_data, src_data, layout.size); + std::memcpy(gpu_data + layout.offset, src_data, layout.size); vkUnmapMemory(device_, device_memory_); } + // Queues a command to transition this texture to a new layout. This assumes + // the command buffer WILL be queued and executed by the device. 
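// (If the commands were recorded but never executed, image_layout_, which is
// updated below at record time rather than at execution time, would no
// longer match the image's actual layout.)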
+ void TransitionLayout(VkCommandBuffer command_buffer, + VkImageLayout new_layout) { + VkImageMemoryBarrier image_barrier; + image_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + image_barrier.pNext = nullptr; + image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.srcAccessMask = 0; + image_barrier.dstAccessMask = 0; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + image_barrier.newLayout = new_layout; + image_barrier.image = image_; + image_barrier.subresourceRange = {0, 0, 1, 0, 1}; + image_barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + image_layout_ = new_layout; + + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &image_barrier); + } + VkDescriptorSet descriptor_set() const { return descriptor_set_; } + VkImageLayout layout() const { return image_layout_; } private: VkDevice device_ = nullptr; VkDescriptorPool descriptor_pool_ = nullptr; VkSampler sampler_ = nullptr; // Not owned. VkImage image_ = nullptr; - VkImageLayout image_layout_ = VK_IMAGE_LAYOUT_UNDEFINED; + VkImageLayout image_layout_ = VK_IMAGE_LAYOUT_PREINITIALIZED; VkDeviceMemory device_memory_ = nullptr; VkImageView image_view_ = nullptr; VkDescriptorSet descriptor_set_ = nullptr; @@ -604,6 +671,14 @@ std::unique_ptr<ImmediateTexture> VulkanImmediateDrawer::CreateTexture( return std::unique_ptr<ImmediateTexture>(texture.release()); } +std::unique_ptr<ImmediateTexture> VulkanImmediateDrawer::WrapTexture( + VkImageView image_view, VkSampler sampler, uint32_t width, + uint32_t height) { + return std::make_unique<VulkanImmediateTexture>( + context_->device(), descriptor_pool_, texture_set_layout_, image_view, + sampler, width, height); +} + void VulkanImmediateDrawer::UpdateTexture(ImmediateTexture* texture, const uint8_t* data) { static_cast<VulkanImmediateTexture*>(texture)->Upload(data); @@ -686,6 +761,10 @@ void VulkanImmediateDrawer::Draw(const ImmediateDraw& draw) { // Setup texture binding. auto texture = reinterpret_cast<VulkanImmediateTexture*>(draw.texture_handle); if (texture) { + if (texture->layout() != VK_IMAGE_LAYOUT_GENERAL) { + texture->TransitionLayout(current_cmd_buffer_, VK_IMAGE_LAYOUT_GENERAL); + } + auto texture_set = texture->descriptor_set(); vkCmdBindDescriptorSets(current_cmd_buffer_, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout_, diff --git a/src/xenia/ui/vulkan/vulkan_immediate_drawer.h b/src/xenia/ui/vulkan/vulkan_immediate_drawer.h index d14a6eb7c..1db47f0d8 100644 --- a/src/xenia/ui/vulkan/vulkan_immediate_drawer.h +++ b/src/xenia/ui/vulkan/vulkan_immediate_drawer.h @@ -32,6 +32,10 @@ class VulkanImmediateDrawer : public ImmediateDrawer { ImmediateTextureFilter filter, bool repeat, const uint8_t* data) override; + std::unique_ptr<ImmediateTexture> WrapTexture(VkImageView image_view, + VkSampler sampler, + uint32_t width, + uint32_t height); void UpdateTexture(ImmediateTexture* texture, const uint8_t* data) override; void Begin(int render_target_width, int render_target_height) override; From 6dab81d0cd18b9caf1dde266fc5e8b82793595bb Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 3 Jun 2016 13:10:57 -0500 Subject: [PATCH 75/77] Find the Vulkan SDK from the %VULKAN_SDK% environment variable. --- xenia-build | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xenia-build b/xenia-build index 4587374c4..98330b6a5 100755 --- a/xenia-build +++ b/xenia-build @@ -642,8 +642,7 @@ class GenSpirvCommand(Command): print('Generating SPIR-V binaries...') print('') - # TODO(benvanik): actually find vulkan SDK.
Env var? etc? - vulkan_sdk_path = 'C:\\VulkanSDK\\1.0.3.1' + vulkan_sdk_path = os.environ['VULKAN_SDK'] vulkan_bin_path = os.path.join(vulkan_sdk_path, 'bin') glslang = os.path.join(vulkan_bin_path, 'glslangValidator') spirv_dis = os.path.join(vulkan_bin_path, 'spirv-dis') From 2a924d2b05b769e52d5fa15df6b0091702b99251 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 3 Jun 2016 20:00:28 -0500 Subject: [PATCH 76/77] Pass vertex buffer endianness into the BufferCache IssueCopy: Actually issue the pipeline barrier to transition the image --- src/xenia/gpu/vulkan/buffer_cache.cc | 14 ++--- src/xenia/gpu/vulkan/buffer_cache.h | 2 +- .../gpu/vulkan/vulkan_command_processor.cc | 53 ++++++++++--------- 3 files changed, 35 insertions(+), 34 deletions(-) diff --git a/src/xenia/gpu/vulkan/buffer_cache.cc b/src/xenia/gpu/vulkan/buffer_cache.cc index 4ae98c864..02bd88a83 100644 --- a/src/xenia/gpu/vulkan/buffer_cache.cc +++ b/src/xenia/gpu/vulkan/buffer_cache.cc @@ -22,9 +22,6 @@ namespace vulkan { using xe::ui::vulkan::CheckResult; -// Space kept between tail and head when wrapping. -constexpr VkDeviceSize kDeadZone = 4 * 1024; constexpr VkDeviceSize kConstantRegisterUniformRange = 512 * 4 * 4 + 8 * 4 + 32 * 4; @@ -250,7 +247,7 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer( } std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer( - const void* source_ptr, size_t source_length, + const void* source_ptr, size_t source_length, Endian endian, std::shared_ptr<ui::vulkan::Fence> fence) { // TODO(benvanik): check cache. @@ -263,9 +260,12 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer( // Copy data into the buffer. // TODO(benvanik): memcpy then use compute shaders to swap? - // Endian::k8in32, swap words. - xe::copy_and_swap_32_aligned(transient_buffer_->host_base() + offset, - source_ptr, source_length / 4); + assert_true(endian == Endian::k8in32); + if (endian == Endian::k8in32) { + // Endian::k8in32, swap words. + xe::copy_and_swap_32_aligned(transient_buffer_->host_base() + offset, + source_ptr, source_length / 4); + } return {transient_buffer_->gpu_buffer(), offset}; } diff --git a/src/xenia/gpu/vulkan/buffer_cache.h b/src/xenia/gpu/vulkan/buffer_cache.h index ee09585b5..8695fc36d 100644 --- a/src/xenia/gpu/vulkan/buffer_cache.h +++ b/src/xenia/gpu/vulkan/buffer_cache.h @@ -67,7 +67,7 @@ class BufferCache { // Returns a buffer and offset that can be used with vkCmdBindVertexBuffers. // Size will be VK_WHOLE_SIZE if the data could not be uploaded (OOM). std::pair<VkBuffer, VkDeviceSize> UploadVertexBuffer( - const void* source_ptr, size_t source_length, + const void* source_ptr, size_t source_length, Endian endian, std::shared_ptr<ui::vulkan::Fence> fence); // Flushes all pending data to the GPU. diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 9c8e268a5..f31b28142 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -29,7 +29,7 @@ namespace vulkan { using namespace xe::gpu::xenos; using xe::ui::vulkan::CheckResult; -constexpr size_t kDefaultBufferCacheCapacity = 128 * 1024 * 1024; +constexpr size_t kDefaultBufferCacheCapacity = 256 * 1024 * 1024; VulkanCommandProcessor::VulkanCommandProcessor( VulkanGraphicsSystem* graphics_system, kernel::KernelState* kernel_state) @@ -501,9 +501,6 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, } } - // Update the render cache's tracking state. - render_cache_->UpdateState(); - // Configure the pipeline for drawing.
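// Back in the buffer-cache hunk above: Endian::k8in32 means every 32-bit
// word is byte-reversed in guest memory. A scalar sketch of what
// xe::copy_and_swap_32_aligned does (hypothetical helper; the real routine
// may be vectorized):
#include <cstddef>
#include <cstdint>

void CopyAndSwap32(uint32_t* dest, const uint32_t* src, size_t count) {
  for (size_t i = 0; i < count; ++i) {
    uint32_t v = src[i];
    dest[i] = (v >> 24) | ((v >> 8) & 0x0000FF00u) |
              ((v << 8) & 0x00FF0000u) | (v << 24);
  }
}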
// This encodes all render state (blend, depth, etc), our shader stages, // and our vertex input layout. @@ -711,7 +708,6 @@ bool VulkanCommandProcessor::PopulateVertexBuffers( fetch = &group->vertex_fetch_2; break; } - assert_true(fetch->endian == 2); // TODO(benvanik): compute based on indices or vertex count. // THIS CAN BE MASSIVELY INCORRECT (too large). @@ -724,7 +720,8 @@ bool VulkanCommandProcessor::PopulateVertexBuffers( memory_->TranslatePhysical(fetch->address << 2); size_t source_length = valid_range; auto buffer_ref = buffer_cache_->UploadVertexBuffer( - source_ptr, source_length, current_batch_fence_); + source_ptr, source_length, static_cast(fetch->endian), + current_batch_fence_); if (buffer_ref.second == VK_WHOLE_SIZE) { // Failed to upload buffer. return false; @@ -939,26 +936,6 @@ bool VulkanCommandProcessor::IssueCopy() { assert_not_null(texture); texture->in_flight_fence = current_batch_fence_; - if (texture->image_layout == VK_IMAGE_LAYOUT_UNDEFINED) { - // Transition the image to a general layout. - VkImageMemoryBarrier image_barrier; - image_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - image_barrier.pNext = nullptr; - image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - image_barrier.srcAccessMask = 0; - image_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; - image_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; - image_barrier.image = texture->image; - image_barrier.subresourceRange = {0, 0, 1, 0, 1}; - image_barrier.subresourceRange.aspectMask = - copy_src_select <= 3 - ? VK_IMAGE_ASPECT_COLOR_BIT - : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; - texture->image_layout = VK_IMAGE_LAYOUT_GENERAL; - } - // For debugging purposes only (trace viewer) last_copy_base_ = texture->texture_info.guest_address; @@ -988,6 +965,30 @@ bool VulkanCommandProcessor::IssueCopy() { } auto command_buffer = current_command_buffer_; + if (texture->image_layout == VK_IMAGE_LAYOUT_UNDEFINED) { + // Transition the image to a general layout. + VkImageMemoryBarrier image_barrier; + image_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + image_barrier.pNext = nullptr; + image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.srcAccessMask = 0; + image_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + image_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; + image_barrier.image = texture->image; + image_barrier.subresourceRange = {0, 0, 1, 0, 1}; + image_barrier.subresourceRange.aspectMask = + copy_src_select <= 3 + ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + texture->image_layout = VK_IMAGE_LAYOUT_GENERAL; + + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &image_barrier); + } + VkOffset3D resolve_offset = {dest_min_x, dest_min_y, 0}; VkExtent3D resolve_extent = {uint32_t(dest_max_x - dest_min_x), uint32_t(dest_max_y - dest_min_y), 1}; From 254acf2a67d1beb46872eb6fd881691d73ac611d Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Fri, 3 Jun 2016 20:01:49 -0500 Subject: [PATCH 77/77] RenderCache: Hardcode surface height to 2560 Fix a couple of other things --- src/xenia/gpu/vulkan/render_cache.cc | 108 ++++++++++++++------------- src/xenia/gpu/vulkan/render_cache.h | 4 - 2 files changed, 58 insertions(+), 54 deletions(-) diff --git a/src/xenia/gpu/vulkan/render_cache.cc b/src/xenia/gpu/vulkan/render_cache.cc index 7d73951b5..f3d3288a7 100644 --- a/src/xenia/gpu/vulkan/render_cache.cc +++ b/src/xenia/gpu/vulkan/render_cache.cc @@ -309,8 +309,15 @@ bool CachedFramebuffer::IsCompatible( const RenderConfiguration& desired_config) const { // We already know all render pass things line up, so let's verify dimensions, // edram offsets, etc. We need an exact match. - if (desired_config.surface_pitch_px != width || - desired_config.surface_height_px != height) { + uint32_t surface_pitch_px = desired_config.surface_msaa != MsaaSamples::k4X + ? desired_config.surface_pitch_px + : desired_config.surface_pitch_px * 2; + uint32_t surface_height_px = desired_config.surface_msaa == MsaaSamples::k1X + ? desired_config.surface_height_px + : desired_config.surface_height_px * 2; + surface_pitch_px = std::min(surface_pitch_px, 2560u); + surface_height_px = std::min(surface_height_px, 2560u); + if (surface_pitch_px != width || surface_height_px != height) { return false; } // TODO(benvanik): separate image views from images in tiles and store in fb? @@ -445,7 +452,8 @@ CachedRenderPass::~CachedRenderPass() { bool CachedRenderPass::IsCompatible( const RenderConfiguration& desired_config) const { - if (config.surface_msaa != desired_config.surface_msaa) { + if (config.surface_msaa != desired_config.surface_msaa && + FLAGS_vulkan_native_msaa) { return false; } @@ -548,8 +556,6 @@ bool RenderCache::dirty() const { regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; dirty |= cur_regs.pa_sc_window_scissor_br != regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; - dirty |= (cur_regs.rb_depthcontrol & (0x4 | 0x2)) < - (regs[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2)); return dirty; } @@ -580,11 +586,6 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL); dirty |= SetShadowRegister(®s.pa_sc_window_scissor_br, XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR); - dirty |= - (regs.rb_depthcontrol & (0x4 | 0x2)) < - (register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2)); - regs.rb_depthcontrol = - register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2); if (!dirty && current_state_.render_pass) { // No registers have changed so we can reuse the previous render pass - // just begin with what we had. @@ -602,18 +603,17 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, return nullptr; } - // Initial state update. - UpdateState(); - current_state_.render_pass = render_pass; current_state_.render_pass_handle = render_pass->handle; current_state_.framebuffer = framebuffer; current_state_.framebuffer_handle = framebuffer->handle; + // TODO(DrChat): Determine if we actually need an EDRAM buffer. 
+ /* // Depth auto depth_target = current_state_.framebuffer->depth_stencil_attachment; if (depth_target && current_state_.config.depth_stencil.used) { - // UpdateTileView(command_buffer, depth_target, true); + UpdateTileView(command_buffer, depth_target, true); } // Color @@ -623,8 +623,9 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, continue; } - // UpdateTileView(command_buffer, target, true); + UpdateTileView(command_buffer, target, true); } + */ } if (!render_pass) { return nullptr; @@ -647,6 +648,15 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, render_pass_begin_info.renderArea.extent.width = config->surface_pitch_px; render_pass_begin_info.renderArea.extent.height = config->surface_height_px; + if (config->surface_msaa == MsaaSamples::k2X) { + render_pass_begin_info.renderArea.extent.height = + std::min(config->surface_height_px * 2, 2560u); + } else if (config->surface_msaa == MsaaSamples::k4X) { + render_pass_begin_info.renderArea.extent.width *= 2; + render_pass_begin_info.renderArea.extent.height = + std::min(config->surface_height_px * 2, 2560u); + } + // Configure clear color, if clearing. // TODO(benvanik): enable clearing here during resolve? render_pass_begin_info.clearValueCount = 0; @@ -677,9 +687,15 @@ bool RenderCache::ParseConfiguration(RenderConfiguration* config) { // Guess the height from the scissor height. // It's wildly inaccurate, but I've never seen it be bigger than the // EDRAM tiling. + /* uint32_t ws_y = (regs.pa_sc_window_scissor_tl >> 16) & 0x7FFF; uint32_t ws_h = ((regs.pa_sc_window_scissor_br >> 16) & 0x7FFF) - ws_y; config->surface_height_px = std::min(2560u, xe::round_up(ws_h, 16)); + */ + + // TODO(DrChat): Find an accurate way to get the surface height. Until we do, + // we're going to hardcode it to 2560, as that's the absolute maximum. + config->surface_height_px = 2560; // Color attachment configuration. if (config->mode_control == ModeControl::kColorDepth) { @@ -781,9 +797,9 @@ bool RenderCache::ConfigureRenderPass(VkCommandBuffer command_buffer, color_key.tile_offset = config->color[i].edram_base; color_key.tile_width = xe::round_up(config->surface_pitch_px, tile_width) / tile_width; - color_key.tile_height = std::min( - 2560 / tile_height, 160u); // xe::round_up(config->surface_height_px, - // tile_height) / tile_height; + // color_key.tile_height = + // xe::round_up(config->surface_height_px, tile_height) / tile_height; + color_key.tile_height = 160; color_key.color_or_depth = 1; color_key.msaa_samples = 0; // static_cast(config->surface_msaa); @@ -800,9 +816,9 @@ bool RenderCache::ConfigureRenderPass(VkCommandBuffer command_buffer, depth_stencil_key.tile_offset = config->depth_stencil.edram_base; depth_stencil_key.tile_width = xe::round_up(config->surface_pitch_px, tile_width) / tile_width; - depth_stencil_key.tile_height = std::min( - 2560 / tile_height, 160u); // xe::round_up(config->surface_height_px, - // tile_height) / tile_height; + // depth_stencil_key.tile_height = + // xe::round_up(config->surface_height_px, tile_height) / tile_height; + depth_stencil_key.tile_height = 160; depth_stencil_key.color_or_depth = 0; depth_stencil_key.msaa_samples = 0; // static_cast(config->surface_msaa); @@ -815,10 +831,17 @@ bool RenderCache::ConfigureRenderPass(VkCommandBuffer command_buffer, return false; } + uint32_t surface_pitch_px = config->surface_msaa != MsaaSamples::k4X + ? 
config->surface_pitch_px + : config->surface_pitch_px * 2; + uint32_t surface_height_px = config->surface_msaa == MsaaSamples::k1X + ? config->surface_height_px + : config->surface_height_px * 2; + surface_pitch_px = std::min(surface_pitch_px, 2560u); + surface_height_px = std::min(surface_height_px, 2560u); framebuffer = new CachedFramebuffer( - *device_, render_pass->handle, config->surface_pitch_px, - config->surface_height_px, target_color_attachments, - target_depth_stencil_attachment); + *device_, render_pass->handle, surface_pitch_px, surface_height_px, + target_color_attachments, target_depth_stencil_attachment); render_pass->cached_framebuffers.push_back(framebuffer); } @@ -923,6 +946,8 @@ void RenderCache::EndRenderPass() { // contents of another render target by mistake! Need to reorder copy commands // to avoid this. + // TODO(DrChat): Determine if we actually need an EDRAM buffer. + /* std::vector cached_views; // Depth @@ -946,27 +971,13 @@ void RenderCache::EndRenderPass() { [](CachedTileView const* a, CachedTileView const* b) { return *a < *b; }); for (auto view : cached_views) { - // UpdateTileView(current_command_buffer_, view, false, false); + UpdateTileView(current_command_buffer_, view, false, false); } + */ current_command_buffer_ = nullptr; } -void RenderCache::UpdateState() { - // Keep track of whether color attachments were used or not in this pass. - uint32_t rb_color_mask = register_file_->values[XE_GPU_REG_RB_COLOR_MASK].u32; - uint32_t rb_depthcontrol = - register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32; - for (int i = 0; i < 4; i++) { - uint32_t color_mask = (rb_color_mask >> (i * 4)) & 0xF; - current_state_.config.color[i].used |= - current_state_.config.mode_control == xenos::ModeControl::kColorDepth && - color_mask != 0; - } - - current_state_.config.depth_stencil.used |= !!(rb_depthcontrol & (0x4 | 0x2)); -} - void RenderCache::ClearCache() { // TODO(benvanik): caching. } @@ -1073,9 +1084,8 @@ void RenderCache::BlitToImage(VkCommandBuffer command_buffer, key.edram_format = format; key.tile_offset = edram_base; key.tile_width = xe::round_up(pitch, tile_width) / tile_width; - key.tile_height = - std::min(2560 / tile_height, - 160u); // xe::round_up(height, tile_height) / tile_height; + // key.tile_height = xe::round_up(height, tile_height) / tile_height; + key.tile_height = 160; auto tile_view = FindOrCreateTileView(command_buffer, key); assert_not_null(tile_view); @@ -1115,7 +1125,7 @@ void RenderCache::BlitToImage(VkCommandBuffer command_buffer, color_or_depth ? 
VK_IMAGE_ASPECT_COLOR_BIT : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; - image_blit.srcOffsets[0] = {0, 0, 0}; + image_blit.srcOffsets[0] = {0, 0, offset.z}; image_blit.srcOffsets[1] = {int32_t(extents.width), int32_t(extents.height), int32_t(extents.depth)}; @@ -1191,9 +1201,8 @@ void RenderCache::ClearEDRAMColor(VkCommandBuffer command_buffer, key.edram_format = static_cast(format); key.tile_offset = edram_base; key.tile_width = xe::round_up(pitch, tile_width) / tile_width; - key.tile_height = - std::min(2560 / tile_height, - 160u); // xe::round_up(height, tile_height) / tile_height; + // key.tile_height = xe::round_up(height, tile_height) / tile_height; + key.tile_height = 160; auto tile_view = FindOrCreateTileView(command_buffer, key); assert_not_null(tile_view); @@ -1228,9 +1237,8 @@ void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, key.edram_format = static_cast(format); key.tile_offset = edram_base; key.tile_width = xe::round_up(pitch, tile_width) / tile_width; - key.tile_height = - std::min(2560 / tile_height, - 160u); // xe::round_up(height, tile_height) / tile_height; + // key.tile_height = xe::round_up(height, tile_height) / tile_height; + key.tile_height = 160; auto tile_view = FindOrCreateTileView(command_buffer, key); assert_not_null(tile_view); diff --git a/src/xenia/gpu/vulkan/render_cache.h b/src/xenia/gpu/vulkan/render_cache.h index 4eeca42bf..c9f0adf98 100644 --- a/src/xenia/gpu/vulkan/render_cache.h +++ b/src/xenia/gpu/vulkan/render_cache.h @@ -278,9 +278,6 @@ class RenderCache { // The command buffer will be transitioned out of the render pass phase. void EndRenderPass(); - // Updates current render state. Call this every draw with an open render pass - void UpdateState(); - // Clears all cached content. void ClearCache(); @@ -363,7 +360,6 @@ class RenderCache { uint32_t rb_color2_info; uint32_t rb_color3_info; uint32_t rb_depth_info; - uint32_t rb_depthcontrol; uint32_t pa_sc_window_scissor_tl; uint32_t pa_sc_window_scissor_br;
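// The shadow-register members above drive the dirty tracking used throughout
// these caches: each cached value is compared against the live register and
// refreshed, and the render pass is only rebuilt when something actually
// changed. The pattern in miniature (hypothetical wrapper; the real
// SetShadowRegister helpers read from the RegisterFile):
#include <cstdint>

bool SetShadowRegister(uint32_t* shadow, uint32_t live_value) {
  if (*shadow == live_value) {
    return false;  // unchanged: nothing to dirty
  }
  *shadow = live_value;
  return true;  // caller ORs this into its dirty flag
}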