From ef523823d556554677efb95c659a9d78c9314193 Mon Sep 17 00:00:00 2001
From: Triang3l <triang3l@yandex.ru>
Date: Fri, 11 Jan 2019 17:07:33 +0300
Subject: [PATCH] [D3D12] Force early Z with DSV, fix blend disabled flag in
 rb_colorcontrol ignored

---
 .../gpu/d3d12/d3d12_command_processor.cc      |   4 +-
 src/xenia/gpu/d3d12/d3d12_shader.h            |  14 ++
 src/xenia/gpu/d3d12/pipeline_cache.cc         |  38 +++-
 src/xenia/gpu/d3d12/pipeline_cache.h          |   1 +
 src/xenia/gpu/dxbc_shader_translator.cc       |  52 +++++-
 src/xenia/gpu/dxbc_shader_translator.h        |   4 +
 src/xenia/gpu/shader.h                        |   5 +
 src/xenia/gpu/shader_translator.cc            | 168 +++++++++---------
 src/xenia/gpu/shader_translator.h             |   5 +
 9 files changed, 202 insertions(+), 89 deletions(-)

diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index 1a2e07475..93750b2b3 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -2199,8 +2199,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
 
   // Alpha test.
   int32_t alpha_test;
-  if (rb_colorcontrol & 0x8) {
-    uint32_t alpha_test_function = rb_colorcontrol & 0x7;
+  uint32_t alpha_test_function = rb_colorcontrol & 0x7;
+  if ((rb_colorcontrol & 0x8) && alpha_test_function != 0x7) {
     // 0: Never - fail in [-inf, +inf].
     // 1: Less - fail in [ref, +inf].
     // 2: Equal - pass in [ref, ref].
diff --git a/src/xenia/gpu/d3d12/d3d12_shader.h b/src/xenia/gpu/d3d12/d3d12_shader.h
index 8a9a31a54..cbed15f9e 100644
--- a/src/xenia/gpu/d3d12/d3d12_shader.h
+++ b/src/xenia/gpu/d3d12/d3d12_shader.h
@@ -40,6 +40,17 @@ class D3D12Shader : public Shader {
       const DxbcShaderTranslator::SamplerBinding* sampler_bindings,
       uint32_t sampler_binding_count);
 
+  void SetForcedEarlyZShaderObject(const std::vector<uint8_t>& shader_object) {
+    forced_early_z_shader_ = shader_object;
+  }
+  // Returns the forced early depth/stencil version of the shader stored with
+  // SetForcedEarlyZShaderObject after translation. If there's none (for
+  // example, if the shader discards pixels or writes to the depth buffer), an
+  // empty vector is returned.
+  const std::vector<uint8_t>& GetForcedEarlyZShaderObject() const {
+    return forced_early_z_shader_;
+  }
+
   bool DisassembleDxbc(const ui::d3d12::D3D12Provider* provider);
 
   static constexpr uint32_t kMaxTextureSRVIndexBits =
@@ -78,9 +89,12 @@ class D3D12Shader : public Shader {
 
  private:
   PrimitiveType domain_shader_primitive_type_ = PrimitiveType::kNone;
+
   std::vector<TextureSRV> texture_srvs_;
   uint32_t used_texture_mask_ = 0;
   std::vector<SamplerBinding> sampler_bindings_;
+
+  std::vector<uint8_t> forced_early_z_shader_;
 };
 
 }  // namespace d3d12
diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc
index 51564a16f..695cbc039 100644
--- a/src/xenia/gpu/d3d12/pipeline_cache.cc
+++ b/src/xenia/gpu/d3d12/pipeline_cache.cc
@@ -331,6 +331,15 @@ bool PipelineCache::TranslateShader(D3D12Shader* shader,
              shader->ucode_disassembly().c_str());
   }
 
+  // If it may be useful, create a version of the shader with early
+  // depth/stencil forced.
+  if (shader->type() == ShaderType::kPixel && !edram_rov_used_ &&
+      shader->early_z_allowed()) {
+    shader->SetForcedEarlyZShaderObject(
+        std::move(DxbcShaderTranslator::ForceEarlyDepthStencil(
+            shader->translated_binary().data())));
+  }
+
   // Disassemble the shader for dumping.
   if (FLAGS_d3d12_dxbc_disasm) {
     auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
@@ -569,6 +578,8 @@ bool PipelineCache::GetCurrentStateDescription(
   }
 
   if (!edram_rov_used_) {
+    uint32_t rb_colorcontrol = regs[XE_GPU_REG_RB_COLORCONTROL].u32;
+
     // Depth/stencil. No stencil, always passing depth test and no depth writing
     // means depth disabled.
     if (render_targets[4].format != DXGI_FORMAT_UNKNOWN) {
@@ -616,6 +627,16 @@ bool PipelineCache::GetCurrentStateDescription(
       description_out.depth_func = 0b111;
     }
 
+    // Forced early Z if the shader allows that and alpha testing is disabled.
+    // TODO(Triang3l): For memexporting shaders, possibly choose this according
+    // to the early Z toggle in RB_DEPTHCONTROL (the correct behavior is still
+    // unknown).
+    if (pixel_shader != nullptr &&
+        pixel_shader->GetForcedEarlyZShaderObject().size() != 0 &&
+        (!(rb_colorcontrol & 0x8) || (rb_colorcontrol & 0x7) == 0x7)) {
+      description_out.force_early_z = 1;
+    }
+
     // Render targets and blending state. 32 because of 0x1F mask, for safety
     // (all unknown to zero).
     uint32_t color_mask = command_processor_->GetCurrentColorMask(pixel_shader);
@@ -695,7 +716,7 @@ bool PipelineCache::GetCurrentStateDescription(
       rt.format = RenderTargetCache::GetBaseColorFormat(
           ColorRenderTargetFormat((color_info >> 16) & 0xF));
       rt.write_mask = (color_mask >> (guest_rt_index * 4)) & 0xF;
-      if (rt.write_mask) {
+      if (!(rb_colorcontrol & 0x20) && rt.write_mask) {
         rt.src_blend = kBlendFactorMap[blendcontrol & 0x1F];
         rt.dest_blend = kBlendFactorMap[(blendcontrol >> 8) & 0x1F];
         rt.blend_op = BlendOp((blendcontrol >> 5) & 0x7);
@@ -874,10 +895,17 @@ ID3D12PipelineState* PipelineCache::CreatePipelineState(
       assert_always();
       return nullptr;
     }
-    state_desc.PS.pShaderBytecode =
-        description.pixel_shader->translated_binary().data();
-    state_desc.PS.BytecodeLength =
-        description.pixel_shader->translated_binary().size();
+    const auto& forced_early_z_shader =
+        description.pixel_shader->GetForcedEarlyZShaderObject();
+    if (description.force_early_z && forced_early_z_shader.size() != 0) {
+      state_desc.PS.pShaderBytecode = forced_early_z_shader.data();
+      state_desc.PS.BytecodeLength = forced_early_z_shader.size();
+    } else {
+      state_desc.PS.pShaderBytecode =
+          description.pixel_shader->translated_binary().data();
+      state_desc.PS.BytecodeLength =
+          description.pixel_shader->translated_binary().size();
+    }
   } else if (edram_rov_used_) {
     state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data();
     state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size();
diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h
index 6615400d3..f09b518d1 100644
--- a/src/xenia/gpu/d3d12/pipeline_cache.h
+++ b/src/xenia/gpu/d3d12/pipeline_cache.h
@@ -155,6 +155,7 @@ class PipelineCache {
     uint32_t depth_write : 1;                                   // 21
     uint32_t stencil_enable : 1;                                // 22
     uint32_t stencil_read_mask : 8;                             // 30
+    uint32_t force_early_z : 1;                                 // 31
 
     uint32_t stencil_write_mask : 8;           // 8
     uint32_t stencil_front_fail_op : 3;        // 11
diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc
index b153fae79..5d4204f89 100644
--- a/src/xenia/gpu/dxbc_shader_translator.cc
+++ b/src/xenia/gpu/dxbc_shader_translator.cc
@@ -89,6 +89,55 @@ DxbcShaderTranslator::DxbcShaderTranslator(uint32_t vendor_id,
 }
 DxbcShaderTranslator::~DxbcShaderTranslator() = default;
 
+std::vector<uint8_t> DxbcShaderTranslator::ForceEarlyDepthStencil(
+    const uint8_t* shader) {
+  const uint32_t* old_shader = reinterpret_cast<const uint32_t*>(shader);
+
+  // Start from a full copy (the total size in bytes is read from header dword
+  // 6) so that something is returned anyway even if patching fails.
+  uint32_t shader_size_bytes = old_shader[6];
+  std::vector<uint8_t> new_shader(shader, shader + shader_size_bytes);
+
+  // Find the SHEX chunk (header dword 7 is the chunk count, followed by the
+  // byte offset of each chunk).
+  uint32_t chunk_count = old_shader[7];
+  for (uint32_t i = 0; i < chunk_count; ++i) {
+    uint32_t chunk_offset_bytes = old_shader[8 + i];
+    const uint32_t* chunk = old_shader + chunk_offset_bytes / sizeof(uint32_t);
+    if (chunk[0] != 'XEHS') {
+      continue;
+    }
+    // Find dcl_globalFlags and patch it.
+    uint32_t code_size_dwords = chunk[3];
+    chunk += 4;
+    for (uint32_t j = 0; j < code_size_dwords;) {
+      uint32_t opcode_token = chunk[j];
+      uint32_t opcode = DECODE_D3D10_SB_OPCODE_TYPE(opcode_token);
+      if (opcode == D3D10_SB_OPCODE_DCL_GLOBAL_FLAGS) {
+        opcode_token |= D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL;
+        std::memcpy(new_shader.data() +
+                        (chunk_offset_bytes + (4 + j) * sizeof(uint32_t)),
+                    &opcode_token, sizeof(uint32_t));
+        // Recalculate the checksum since the shader was modified.
+        CalculateDXBCChecksum(
+            reinterpret_cast<unsigned char*>(new_shader.data()),
+            shader_size_bytes,
+            reinterpret_cast<unsigned int*>(new_shader.data() +
+                                            sizeof(uint32_t)));
+        break;
+      }
+      if (opcode == D3D10_SB_OPCODE_CUSTOMDATA) {
+        j += chunk[j + 1];
+      } else {
+        j += DECODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(opcode_token);
+      }
+    }
+    break;
+  }
+
+  return new_shader;
+}
+
 std::vector<uint8_t> DxbcShaderTranslator::CreateDepthOnlyPixelShader() {
   Reset();
   is_depth_only_pixel_shader_ = true;
@@ -4034,7 +4083,8 @@ void DxbcShaderTranslator::WriteShaderCode() {
   }
 
   // Don't allow refactoring when converting to native code to maintain position
-  // invariance (needed even in pixel shaders for oDepth invariance).
+  // invariance (needed even in pixel shaders for oDepth invariance). Also this
+  // dcl will be modified by ForceEarlyDepthStencil.
   shader_object_.push_back(
       ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_GLOBAL_FLAGS) |
       ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1));
diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h
index 0c893c567..c5d97c1fe 100644
--- a/src/xenia/gpu/dxbc_shader_translator.h
+++ b/src/xenia/gpu/dxbc_shader_translator.h
@@ -491,6 +491,10 @@ class DxbcShaderTranslator : public ShaderTranslator {
     kEDRAM,
   };
 
+  // Creates a copy of the shader with early depth/stencil testing forced,
+  // overriding the fact that alpha testing is used in the shader.
+  static std::vector<uint8_t> ForceEarlyDepthStencil(const uint8_t* shader);
+
   // Returns the bits that need to be added to the RT flags constant - needs to
   // be done externally, not in SetColorFormatConstants, because the flags
   // contain other state.
diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h
index 78505e6a1..1ed7ac23e 100644
--- a/src/xenia/gpu/shader.h
+++ b/src/xenia/gpu/shader.h
@@ -607,6 +607,10 @@ class Shader {
   // Returns true if the given color target index [0-3].
   bool writes_color_target(int i) const { return writes_color_targets_[i]; }
 
+  // Returns true if the pixel shader can potentially have early depth/stencil
+  // testing enabled, provided alpha testing is disabled.
+  bool early_z_allowed() const { return early_z_allowed_; }
+
   // True if the shader was translated and prepared without error.
   bool is_valid() const { return is_valid_; }
 
@@ -655,6 +659,7 @@ class Shader {
   std::vector<TextureBinding> texture_bindings_;
   ConstantRegisterMap constant_register_map_ = {0};
   bool writes_color_targets_[4] = {false, false, false, false};
+  bool early_z_allowed_ = true;
   std::vector<uint32_t> memexport_stream_constants_;
 
   bool is_valid_ = false;
diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc
index e829e665f..e22a8efea 100644
--- a/src/xenia/gpu/shader_translator.cc
+++ b/src/xenia/gpu/shader_translator.cc
@@ -65,6 +65,7 @@ void ShaderTranslator::Reset() {
     writes_color_targets_[i] = false;
   }
   writes_depth_ = false;
+  early_z_allowed_ = true;
   memexport_alloc_count_ = 0;
   memexport_eA_written_ = 0;
   std::memset(&memexport_eM_written_, 0, sizeof(memexport_eM_written_));
@@ -189,6 +190,7 @@ bool ShaderTranslator::TranslateInternal(Shader* shader) {
   for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) {
     shader->writes_color_targets_[i] = writes_color_targets_[i];
   }
+  shader->early_z_allowed_ = early_z_allowed_;
   shader->memexport_stream_constants_.clear();
   for (uint32_t memexport_stream_constant : memexport_stream_constants_) {
     shader->memexport_stream_constants_.push_back(memexport_stream_constant);
@@ -288,6 +290,7 @@ void ShaderTranslator::GatherInstructionInformation(
           if (op.has_vector_op()) {
             const auto& opcode_info =
                 alu_vector_opcode_infos_[static_cast<int>(op.vector_opcode())];
+            early_z_allowed_ &= !opcode_info.disable_early_z;
             for (size_t i = 0; i < opcode_info.argument_count; ++i) {
               if (op.src_is_temp(i + 1) && (op.src_reg(i + 1) & 0x40)) {
                 uses_register_dynamic_addressing_ = true;
@@ -299,6 +302,7 @@ void ShaderTranslator::GatherInstructionInformation(
                   writes_color_targets_[op.vector_dest()] = true;
                 } else if (op.vector_dest() == 61) {
                   writes_depth_ = true;
+                  early_z_allowed_ = false;
                 }
               }
               if (memexport_alloc_count_ > 0 &&
@@ -335,6 +339,7 @@ void ShaderTranslator::GatherInstructionInformation(
           if (op.has_scalar_op()) {
             const auto& opcode_info =
                 alu_scalar_opcode_infos_[static_cast<int>(op.scalar_opcode())];
+            early_z_allowed_ &= !opcode_info.disable_early_z;
             if (opcode_info.argument_count == 1 && op.src_is_temp(3) &&
                 (op.src_reg(3) & 0x40)) {
               uses_register_dynamic_addressing_ = true;
@@ -345,6 +350,7 @@ void ShaderTranslator::GatherInstructionInformation(
                   writes_color_targets_[op.scalar_dest()] = true;
                 } else if (op.scalar_dest() == 61) {
                   writes_depth_ = true;
+                  early_z_allowed_ = false;
                 }
               }
               if (memexport_alloc_count_ > 0 &&
@@ -1030,91 +1036,91 @@ void ShaderTranslator::ParseTextureFetchInstruction(
 
 const ShaderTranslator::AluOpcodeInfo
     ShaderTranslator::alu_vector_opcode_infos_[0x20] = {
-        {"add", 2, 4},           // 0
-        {"mul", 2, 4},           // 1
-        {"max", 2, 4},           // 2
-        {"min", 2, 4},           // 3
-        {"seq", 2, 4},           // 4
-        {"sgt", 2, 4},           // 5
-        {"sge", 2, 4},           // 6
-        {"sne", 2, 4},           // 7
-        {"frc", 1, 4},           // 8
-        {"trunc", 1, 4},         // 9
-        {"floor", 1, 4},         // 10
-        {"mad", 3, 4},           // 11
-        {"cndeq", 3, 4},         // 12
-        {"cndge", 3, 4},         // 13
-        {"cndgt", 3, 4},         // 14
-        {"dp4", 2, 4},           // 15
-        {"dp3", 2, 4},           // 16
-        {"dp2add", 3, 4},        // 17
-        {"cube", 2, 4},          // 18
-        {"max4", 1, 4},          // 19
-        {"setp_eq_push", 2, 4},  // 20
-        {"setp_ne_push", 2, 4},  // 21
-        {"setp_gt_push", 2, 4},  // 22
-        {"setp_ge_push", 2, 4},  // 23
-        {"kill_eq", 2, 4},       // 24
-        {"kill_gt", 2, 4},       // 25
-        {"kill_ge", 2, 4},       // 26
-        {"kill_ne", 2, 4},       // 27
-        {"dst", 2, 4},           // 28
-        {"maxa", 2, 4},          // 29
+        {"add", 2, 4, false},           // 0
+        {"mul", 2, 4, false},           // 1
+        {"max", 2, 4, false},           // 2
+        {"min", 2, 4, false},           // 3
+        {"seq", 2, 4, false},           // 4
+        {"sgt", 2, 4, false},           // 5
+        {"sge", 2, 4, false},           // 6
+        {"sne", 2, 4, false},           // 7
+        {"frc", 1, 4, false},           // 8
+        {"trunc", 1, 4, false},         // 9
+        {"floor", 1, 4, false},         // 10
+        {"mad", 3, 4, false},           // 11
+        {"cndeq", 3, 4, false},         // 12
+        {"cndge", 3, 4, false},         // 13
+        {"cndgt", 3, 4, false},         // 14
+        {"dp4", 2, 4, false},           // 15
+        {"dp3", 2, 4, false},           // 16
+        {"dp2add", 3, 4, false},        // 17
+        {"cube", 2, 4, false},          // 18
+        {"max4", 1, 4, false},          // 19
+        {"setp_eq_push", 2, 4, false},  // 20
+        {"setp_ne_push", 2, 4, false},  // 21
+        {"setp_gt_push", 2, 4, false},  // 22
+        {"setp_ge_push", 2, 4, false},  // 23
+        {"kill_eq", 2, 4, true},        // 24
+        {"kill_gt", 2, 4, true},        // 25
+        {"kill_ge", 2, 4, true},        // 26
+        {"kill_ne", 2, 4, true},        // 27
+        {"dst", 2, 4, false},           // 28
+        {"maxa", 2, 4, false},          // 29
 };
 
 const ShaderTranslator::AluOpcodeInfo
     ShaderTranslator::alu_scalar_opcode_infos_[0x40] = {
-        {"adds", 1, 2},         // 0
-        {"adds_prev", 1, 1},    // 1
-        {"muls", 1, 2},         // 2
-        {"muls_prev", 1, 1},    // 3
-        {"muls_prev2", 1, 2},   // 4
-        {"maxs", 1, 2},         // 5
-        {"mins", 1, 2},         // 6
-        {"seqs", 1, 1},         // 7
-        {"sgts", 1, 1},         // 8
-        {"sges", 1, 1},         // 9
-        {"snes", 1, 1},         // 10
-        {"frcs", 1, 1},         // 11
-        {"truncs", 1, 1},       // 12
-        {"floors", 1, 1},       // 13
-        {"exp", 1, 1},          // 14
-        {"logc", 1, 1},         // 15
-        {"log", 1, 1},          // 16
-        {"rcpc", 1, 1},         // 17
-        {"rcpf", 1, 1},         // 18
-        {"rcp", 1, 1},          // 19
-        {"rsqc", 1, 1},         // 20
-        {"rsqf", 1, 1},         // 21
-        {"rsq", 1, 1},          // 22
-        {"maxas", 1, 2},        // 23
-        {"maxasf", 1, 2},       // 24
-        {"subs", 1, 2},         // 25
-        {"subs_prev", 1, 1},    // 26
-        {"setp_eq", 1, 1},      // 27
-        {"setp_ne", 1, 1},      // 28
-        {"setp_gt", 1, 1},      // 29
-        {"setp_ge", 1, 1},      // 30
-        {"setp_inv", 1, 1},     // 31
-        {"setp_pop", 1, 1},     // 32
-        {"setp_clr", 1, 1},     // 33
-        {"setp_rstr", 1, 1},    // 34
-        {"kills_eq", 1, 1},     // 35
-        {"kills_gt", 1, 1},     // 36
-        {"kills_ge", 1, 1},     // 37
-        {"kills_ne", 1, 1},     // 38
-        {"kills_one", 1, 1},    // 39
-        {"sqrt", 1, 1},         // 40
-        {"UNKNOWN", 0, 0},      // 41
-        {"mulsc", 2, 1},        // 42
-        {"mulsc", 2, 1},        // 43
-        {"addsc", 2, 1},        // 44
-        {"addsc", 2, 1},        // 45
-        {"subsc", 2, 1},        // 46
-        {"subsc", 2, 1},        // 47
-        {"sin", 1, 1},          // 48
-        {"cos", 1, 1},          // 49
-        {"retain_prev", 1, 1},  // 50
+        {"adds", 1, 2, false},         // 0
+        {"adds_prev", 1, 1, false},    // 1
+        {"muls", 1, 2, false},         // 2
+        {"muls_prev", 1, 1, false},    // 3
+        {"muls_prev2", 1, 2, false},   // 4
+        {"maxs", 1, 2, false},         // 5
+        {"mins", 1, 2, false},         // 6
+        {"seqs", 1, 1, false},         // 7
+        {"sgts", 1, 1, false},         // 8
+        {"sges", 1, 1, false},         // 9
+        {"snes", 1, 1, false},         // 10
+        {"frcs", 1, 1, false},         // 11
+        {"truncs", 1, 1, false},       // 12
+        {"floors", 1, 1, false},       // 13
+        {"exp", 1, 1, false},          // 14
+        {"logc", 1, 1, false},         // 15
+        {"log", 1, 1, false},          // 16
+        {"rcpc", 1, 1, false},         // 17
+        {"rcpf", 1, 1, false},         // 18
+        {"rcp", 1, 1, false},          // 19
+        {"rsqc", 1, 1, false},         // 20
+        {"rsqf", 1, 1, false},         // 21
+        {"rsq", 1, 1, false},          // 22
+        {"maxas", 1, 2, false},        // 23
+        {"maxasf", 1, 2, false},       // 24
+        {"subs", 1, 2, false},         // 25
+        {"subs_prev", 1, 1, false},    // 26
+        {"setp_eq", 1, 1, false},      // 27
+        {"setp_ne", 1, 1, false},      // 28
+        {"setp_gt", 1, 1, false},      // 29
+        {"setp_ge", 1, 1, false},      // 30
+        {"setp_inv", 1, 1, false},     // 31
+        {"setp_pop", 1, 1, false},     // 32
+        {"setp_clr", 1, 1, false},     // 33
+        {"setp_rstr", 1, 1, false},    // 34
+        {"kills_eq", 1, 1, true},      // 35
+        {"kills_gt", 1, 1, true},      // 36
+        {"kills_ge", 1, 1, true},      // 37
+        {"kills_ne", 1, 1, true},      // 38
+        {"kills_one", 1, 1, true},     // 39
+        {"sqrt", 1, 1, false},         // 40
+        {"UNKNOWN", 0, 0, false},      // 41
+        {"mulsc", 2, 1, false},        // 42
+        {"mulsc", 2, 1, false},        // 43
+        {"addsc", 2, 1, false},        // 44
+        {"addsc", 2, 1, false},        // 45
+        {"subsc", 2, 1, false},        // 46
+        {"subsc", 2, 1, false},        // 47
+        {"sin", 1, 1, false},          // 48
+        {"cos", 1, 1, false},          // 49
+        {"retain_prev", 1, 1, false},  // 50
 };
 
 void ShaderTranslator::TranslateAluInstruction(const AluInstruction& op) {
diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h
index 9aacba3bb..48775ade9 100644
--- a/src/xenia/gpu/shader_translator.h
+++ b/src/xenia/gpu/shader_translator.h
@@ -58,6 +58,9 @@ class ShaderTranslator {
   bool writes_color_target(int i) const { return writes_color_targets_[i]; }
   // True if the current shader overrides the pixel depth.
   bool writes_depth() const { return writes_depth_; }
+  // True if the pixel shader can potentially have early depth/stencil testing
+  // enabled, provided alpha testing is disabled.
+  bool early_z_allowed() const { return early_z_allowed_; }
   // A list of all vertex bindings, populated before translation occurs.
   const std::vector<Shader::VertexBinding>& vertex_bindings() const {
     return vertex_bindings_;
@@ -160,6 +163,7 @@ class ShaderTranslator {
     const char* name;
     size_t argument_count;
     int src_swizzle_component_count;
+    bool disable_early_z;
   };
 
   bool TranslateInternal(Shader* shader);
@@ -245,6 +249,7 @@ class ShaderTranslator {
   bool uses_register_dynamic_addressing_ = false;
   bool writes_color_targets_[4] = {false, false, false, false};
   bool writes_depth_ = false;
+  bool early_z_allowed_ = true;
 
   uint32_t memexport_alloc_count_ = 0;
   // For register allocation in implementations - what was used after each