diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc
index 74faf6e13..b2d24f89b 100644
--- a/src/xenia/gpu/dxbc_shader_translator_alu.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc
@@ -68,32 +68,34 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
       break;
     case AluVectorOpcode::kMul:
     case AluVectorOpcode::kMad: {
-      bool is_mad = instr.vector_opcode == AluVectorOpcode::kMad;
-      if (is_mad) {
-        DxbcOpMAd(per_component_dest, operands[0], operands[1], operands[2]);
-      } else {
-        DxbcOpMul(per_component_dest, operands[0], operands[1]);
-      }
-      // Shader Model 3: 0 or denormal * anything = 0.
-      // FIXME(Triang3l): Signed zero needs research and handling.
-      uint32_t absolute_different =
+      // Not using DXBC mad to prevent fused multiply-add (mul followed by add
+      // may be optimized into non-fused mad by the driver in the identical
+      // operands case also).
+      DxbcOpMul(per_component_dest, operands[0], operands[1]);
+      uint32_t multiplicands_different =
           used_result_components &
-          ~instr.vector_operands[0].GetAbsoluteIdenticalComponents(
+          ~instr.vector_operands[0].GetIdenticalComponents(
               instr.vector_operands[1]);
-      if (absolute_different) {
+      if (multiplicands_different) {
+        // Shader Model 3: +-0 or denormal * anything = +0.
         uint32_t is_zero_temp = PushSystemTemp();
-        DxbcOpMin(DxbcDest::R(is_zero_temp, absolute_different),
+        DxbcOpMin(DxbcDest::R(is_zero_temp, multiplicands_different),
                   operands[0].Abs(), operands[1].Abs());
         // min isn't required to flush denormals, eq is.
-        DxbcOpEq(DxbcDest::R(is_zero_temp, absolute_different),
+        DxbcOpEq(DxbcDest::R(is_zero_temp, multiplicands_different),
                  DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f));
-        DxbcOpMovC(DxbcDest::R(system_temp_result_, absolute_different),
-                   DxbcSrc::R(is_zero_temp),
-                   is_mad ? operands[2] : DxbcSrc::LF(0.0f),
+        // Not replacing true `0 + term` with movc of the term because +0 + -0
+        // should result in +0, not -0.
+        DxbcOpMovC(DxbcDest::R(system_temp_result_, multiplicands_different),
+                   DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f),
                    DxbcSrc::R(system_temp_result_));
         // Release is_zero_temp.
         PopSystemTemp();
       }
+      if (instr.vector_opcode == AluVectorOpcode::kMad) {
+        DxbcOpAdd(per_component_dest, DxbcSrc::R(system_temp_result_),
+                  operands[2]);
+      }
     } break;
 
     case AluVectorOpcode::kMax:
@@ -179,69 +181,40 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
         component_count = 4;
       }
       result_swizzle = DxbcSrc::kXXXX;
-      uint32_t absolute_different =
-          uint32_t((1 << component_count) - 1) &
-          ~instr.vector_operands[0].GetAbsoluteIdenticalComponents(
-              instr.vector_operands[1]);
-      if (absolute_different) {
-        // Shader Model 3: 0 or denormal * anything = 0.
-        // FIXME(Triang3l): Signed zero needs research and handling.
-        // Add component products only if non-zero. For dp4, 16 scalar
-        // operations in the worst case (as opposed to always 20 for
-        // eq/movc/eq/movc/dp4 or min/eq/movc/movc/dp4 for preparing operands
-        // for dp4).
-        DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0001),
-                  operands[0].SelectFromSwizzled(0),
-                  operands[1].SelectFromSwizzled(0));
-        if (absolute_different & 0b0001) {
-          DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0010),
-                    operands[0].SelectFromSwizzled(0).Abs(),
-                    operands[1].SelectFromSwizzled(0).Abs());
-          DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0010),
-                   DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY),
+      uint32_t different = uint32_t((1 << component_count) - 1) &
+                           ~instr.vector_operands[0].GetIdenticalComponents(
+                               instr.vector_operands[1]);
+      for (uint32_t i = 0; i < component_count; ++i) {
+        DxbcOpMul(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001),
+                  operands[0].SelectFromSwizzled(i),
+                  operands[1].SelectFromSwizzled(i));
+        if ((different & (1 << i)) != 0) {
+          // Shader Model 3: +-0 or denormal * anything = +0 (also not replacing
+          // true `0 + term` with movc of the term because +0 + -0 should result
+          // in +0, not -0).
+          DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100),
+                    operands[0].SelectFromSwizzled(i).Abs(),
+                    operands[1].SelectFromSwizzled(i).Abs());
+          DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100),
+                   DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ),
                    DxbcSrc::LF(0.0f));
-          DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001),
-                     DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY),
-                     DxbcSrc::LF(0.0f),
-                     DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX));
-        }
-        for (uint32_t i = 1; i < component_count; ++i) {
-          bool component_different = (absolute_different & (1 << i)) != 0;
-          DxbcOpMAd(DxbcDest::R(system_temp_result_,
-                                component_different ? 0b0010 : 0b0001),
-                    operands[0].SelectFromSwizzled(i),
-                    operands[1].SelectFromSwizzled(i),
-                    DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX));
-          if (component_different) {
-            DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100),
-                      operands[0].SelectFromSwizzled(i).Abs(),
-                      operands[1].SelectFromSwizzled(i).Abs());
-            DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100),
+          DxbcOpMovC(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001),
                      DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ),
-                     DxbcSrc::LF(0.0f));
-            DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001),
-                       DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ),
-                       DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX),
-                       DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY));
-          }
+                     DxbcSrc::LF(0.0f),
+                     DxbcSrc::R(system_temp_result_,
+                                i ? DxbcSrc::kYYYY : DxbcSrc::kXXXX));
         }
-      } else {
-        if (component_count == 2) {
-          DxbcOpDP2(DxbcDest::R(system_temp_result_, 0b0001), operands[0],
-                    operands[1]);
-        } else if (component_count == 3) {
-          DxbcOpDP3(DxbcDest::R(system_temp_result_, 0b0001), operands[0],
-                    operands[1]);
-        } else {
-          assert_true(component_count == 4);
-          DxbcOpDP4(DxbcDest::R(system_temp_result_, 0b0001), operands[0],
-                    operands[1]);
+        if (i) {
+          // Not using DXBC dp# to avoid fused multiply-add, PC GPUs are scalar
+          // as of 2020 anyway, and not using mad for the same reason (mul
+          // followed by add may be optimized into non-fused mad by the driver
+          // in the identical operands case also).
+          DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001),
+                    DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX),
+                    DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY));
         }
       }
       if (component_count == 2) {
-        // Add the third operand. Since floating-point addition isn't
-        // associative, even though adding this in multiply-add for the first
-        // component would be faster, it's safer to add here, in the end.
         DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001),
                   DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX),
                   operands[2].SelectFromSwizzled(0));
@@ -592,14 +565,13 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
         DxbcOpMov(DxbcDest::R(system_temp_result_, 0b0001), DxbcSrc::LF(1.0f));
       }
       if (used_result_components & 0b0010) {
-        // Shader Model 3: 0 or denormal * anything = 0.
-        // FIXME(Triang3l): Signed zero needs research and handling.
         DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0010),
                   operands[0].SelectFromSwizzled(1),
                   operands[1].SelectFromSwizzled(1));
-        if (!(instr.vector_operands[0].GetAbsoluteIdenticalComponents(
+        if (!(instr.vector_operands[0].GetIdenticalComponents(
                   instr.vector_operands[1]) &
               0b0010)) {
+          // Shader Model 3: +-0 or denormal * anything = +0.
           DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100),
                     operands[0].SelectFromSwizzled(1).Abs(),
                     operands[1].SelectFromSwizzled(1).Abs());
@@ -700,8 +672,7 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
       DxbcOpMul(ps_dest, operand_0_a, operand_0_b);
       if (instr.scalar_operands[0].components[0] !=
           instr.scalar_operands[0].components[1]) {
-        // Shader Model 3: 0 or denormal * anything = 0.
-        // FIXME(Triang3l): Signed zero needs research and handling.
+        // Shader Model 3: +-0 or denormal * anything = +0.
         uint32_t is_zero_temp = PushSystemTemp();
         DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(),
                   operand_0_b.Abs());
@@ -714,58 +685,50 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
         PopSystemTemp();
       }
       break;
-    case AluScalarOpcode::kMulsPrev: {
-      // Shader Model 3: 0 or denormal * anything = 0.
-      // FIXME(Triang3l): Signed zero needs research and handling.
-      uint32_t is_zero_temp = PushSystemTemp();
-      DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(),
-                ps_src.Abs());
-      // min isn't required to flush denormals, eq is.
-      DxbcOpEq(DxbcDest::R(is_zero_temp, 0b0001),
-               DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f));
-      DxbcOpMul(ps_dest, operand_0_a, ps_src);
-      DxbcOpMovC(ps_dest, DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX),
-                 DxbcSrc::LF(0.0f), ps_src);
-      // Release is_zero_temp.
-      PopSystemTemp();
-    } break;
+    case AluScalarOpcode::kMulsPrev:
     case AluScalarOpcode::kMulsPrev2: {
       uint32_t test_temp = PushSystemTemp();
-      // Check if need to select the src0.a * ps case.
-      // ps != -FLT_MAX.
-      DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX));
-      // isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since -FLT_MAX
-      // is already loaded to an SGPR, this is also false if it's NaN.
-      DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(),
-               DxbcSrc::LF(-FLT_MAX));
-      DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
-                DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
-                DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
-      // isfinite(src0.b).
-      DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -operand_0_b.Abs(),
-               DxbcSrc::LF(-FLT_MAX));
-      DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
-                DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
-                DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
-      // src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked
-      // for NaN).
-      DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f), operand_0_b);
-      DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
-                DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
-                DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
-      DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX));
-      // Shader Model 3: 0 or denormal * anything = 0.
-      // ps is already known to be not NaN or Infinity, so multiplying it by 0
-      // will result in 0. However, src0.a can be anything, so the result should
-      // be zero if ps is zero.
-      // FIXME(Triang3l): Signed zero needs research and handling.
-      DxbcOpEq(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(0.0f));
+      if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) {
+        // Check if need to select the src0.a * ps case.
+        // ps != -FLT_MAX.
+        DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX));
+        // isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since
+        // -FLT_MAX is already loaded to an SGPR, this is also false if it's
+        // NaN.
+        DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(),
+                 DxbcSrc::LF(-FLT_MAX));
+        DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
+                  DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
+                  DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
+        // isfinite(src0.b).
+        DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -operand_0_b.Abs(),
+                 DxbcSrc::LF(-FLT_MAX));
+        DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
+                  DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
+                  DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
+        // src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked
+        // for NaN).
+        DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f),
+                 operand_0_b);
+        DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
+                  DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
+                  DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
+        DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX));
+      }
+      // Shader Model 3: +-0 or denormal * anything = +0.
+      DxbcOpMin(DxbcDest::R(test_temp, 0b0001), operand_0_a.Abs(),
+                ps_src.Abs());
+      // min isn't required to flush denormals, eq is.
+      DxbcOpEq(DxbcDest::R(test_temp, 0b0001),
+               DxbcSrc::R(test_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f));
       DxbcOpMul(ps_dest, operand_0_a, ps_src);
       DxbcOpMovC(ps_dest, DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
                  DxbcSrc::LF(0.0f), ps_src);
-      DxbcOpElse();
-      DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX));
-      DxbcOpEndIf();
+      if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) {
+        DxbcOpElse();
+        DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX));
+        DxbcOpEndIf();
+      }
       // Release test_temp.
       PopSystemTemp();
     } break;
@@ -1023,11 +986,10 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
     case AluScalarOpcode::kMulsc0:
     case AluScalarOpcode::kMulsc1:
       DxbcOpMul(ps_dest, operand_0_a, operand_1);
-      if (!(instr.scalar_operands[0].GetAbsoluteIdenticalComponents(
+      if (!(instr.scalar_operands[0].GetIdenticalComponents(
                 instr.scalar_operands[1]) &
             0b0001)) {
-        // Shader Model 3: 0 or denormal * anything = 0.
-        // FIXME(Triang3l): Signed zero needs research and handling.
+        // Shader Model 3: +-0 or denormal * anything = +0.
         uint32_t is_zero_temp = PushSystemTemp();
         DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(),
                   operand_1.Abs());
diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h
index ab1ab32c9..23998c307 100644
--- a/src/xenia/gpu/shader.h
+++ b/src/xenia/gpu/shader.h
@@ -235,14 +235,18 @@ struct InstructionOperand {
     return false;
   }
 
-  // Returns which components of two operands are identical, but may have
-  // different signs (for simplicity of usage with GetComponent, treating the
-  // rightmost component as replicated).
-  uint32_t GetAbsoluteIdenticalComponents(
-      const InstructionOperand& other) const {
+  // Returns which components of two operands will always be bitwise equal
+  // (disregarding component_count for simplicity of usage with GetComponent,
+  // treating the rightmost component as replicated). This, strictly with all
+  // conditions, must be used when emulating Shader Model 3 +-0 * x = +0
+  // multiplication behavior with IEEE-compliant multiplication (because
+  // -0 * |-0|, or -0 * +0, is -0, while the result must be +0).
+  uint32_t GetIdenticalComponents(const InstructionOperand& other) const {
     if (storage_source != other.storage_source ||
         storage_index != other.storage_index ||
-        storage_addressing_mode != other.storage_addressing_mode) {
+        storage_addressing_mode != other.storage_addressing_mode ||
+        is_negated != other.is_negated ||
+        is_absolute_value != other.is_absolute_value) {
       return 0;
     }
     uint32_t identical_components = 0;
@@ -252,16 +256,6 @@ struct InstructionOperand {
     }
     return identical_components;
   }
-  // Returns which components of two operands will always be bitwise equal, but
-  // may have different signs (disregarding component_count for simplicity of
-  // usage with GetComponent, treating the rightmost component as replicated).
-  uint32_t GetIdenticalComponents(const InstructionOperand& other) const {
-    if (is_negated != other.is_negated ||
-        is_absolute_value != other.is_absolute_value) {
-      return 0;
-    }
-    return GetAbsoluteIdenticalComponents(other);
-  }
 };
 
 struct ParsedExecInstruction {
diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h
index c0c035167..21ccbaff9 100644
--- a/src/xenia/gpu/ucode.h
+++ b/src/xenia/gpu/ucode.h
@@ -800,13 +800,26 @@ static_assert_size(TextureFetchInstruction, 12);
 //   Both are valid only within the current ALU clause. They are not modified
 //   when the instruction that would write them fails its predication check.
 // - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for
-//   multiplication (0 or denormal * anything = 0) wherever it's present (mul,
-//   mad, dp, etc.) and for NaN in min/max. It's very important to respect this
-//   rule for multiplication, as games often rely on it in vector normalization
-//   (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of things in
-//   games - causes white screen in Halo 3, white specular on characters in GTA
-//   IV.
-// TODO(Triang3l): Investigate signed zero handling in multiplication.
+//   multiplication (+-0 or denormal * anything = +0) wherever it's present
+//   (mul, mad, dp, etc.) and for NaN in min/max. It's very important to respect
+//   this rule for multiplication, as games often rely on it in vector
+//   normalization (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of
+//   things in games - causes white screen in Halo 3, white specular on
+//   characters in GTA IV. The result is always positive zero in this case, no
+//   matter what the signs of the other operands are, according to R5xx
+//   Acceleration section 8.7.5 "Legacy multiply behavior" and testing on
+//   Adreno 200. This means that the following need to be taken into account
+//   (according to 8.7.2 "ALU Non-Transcendental Floating Point"):
+//   - +0 * -0 is -0 with IEEE conformance, however, with this legacy SM3
+//     handling, it should result in +0.
+//   - +0 + -0 is +0, so multiply-add should not be replaced with conditional
+//     move of the third operand in case of zero multiplicands, because the term
+//     may be -0, while the result should be +0 in this case.
+//   http://developer.amd.com/wordpress/media/2013/10/R5xx_Acceleration_v1.5.pdf
+//   Multiply-add also appears to be not fused (the SM3 behavior instruction on
+//   GCN is called v_mad_legacy_f32, not v_fma_legacy_f32) - shader translators
+//   should not use instructions that may be interpreted by the host GPU as
+//   fused multiply-add.
 
 enum class AluScalarOpcode : uint32_t {
   // Floating-Point Add
diff --git a/tools/build/premake b/tools/build/premake
index 14e3d5ebc..9113958a5 100644
--- a/tools/build/premake
+++ b/tools/build/premake
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3 
+#!/usr/bin/env python3
 
 # Copyright 2015 Ben Vanik. All Rights Reserved.
 
@@ -107,13 +107,14 @@ def has_bin(bin):
   return None
 
 
-def shell_call(command, throw_on_error=True, stdout_path=None):
+def shell_call(command, throw_on_error=True, stdout_path=None, stderr_path=None, shell=False):
   """Executes a shell command.
 
   Args:
     command: Command to execute, as a list of parameters.
     throw_on_error: Whether to throw an error or return the status code.
     stdout_path: File path to write stdout output to.
+    stderr_path: File path to write stderr output to.
 
   Returns:
     If throw_on_error is False the status code of the call will be returned.
@@ -121,17 +122,22 @@ def shell_call(command, throw_on_error=True, stdout_path=None):
   stdout_file = None
   if stdout_path:
     stdout_file = open(stdout_path, 'w')
+  stderr_file = None
+  if stderr_path:
+    stderr_file = open(stderr_path, 'w')
   result = 0
   try:
     if throw_on_error:
       result = 1
-      subprocess.check_call(command, shell=False, stdout=stdout_file)
+      subprocess.check_call(command, shell=shell, stdout=stdout_file, stderr=stderr_file)
       result = 0
     else:
-      result = subprocess.call(command, shell=False, stdout=stdout_file)
+      result = subprocess.call(command, shell=shell, stdout=stdout_file, stderr=stderr_file)
   finally:
     if stdout_file:
       stdout_file.close()
+    if stderr_file:
+      stderr_file.close()
   return result
 
 
@@ -196,42 +202,5 @@ def import_subprocess_environment(args):
         os.environ[var.upper()] = setting
         break
 
-def git_submodule_update():
-  """Runs a full recursive git submodule init and update.
-
-  Older versions of git do not support 'update --init --recursive'. We could
-  check and run it on versions that do support it and speed things up a bit.
-  """
-  if True:
-    shell_call([
-        'git',
-        'submodule',
-        'update',
-        '--init',
-        '--recursive',
-        ])
-  else:
-    shell_call([
-        'git',
-        'submodule',
-        'init',
-        ])
-    shell_call([
-        'git',
-        'submodule',
-        'foreach',
-        '--recursive',
-        'git',
-        'submodule',
-        'init',
-        ])
-    shell_call([
-        'git',
-        'submodule',
-        'update',
-        '--recursive',
-        ])
-
-
 if __name__ == '__main__':
   main()
diff --git a/xenia-build b/xenia-build
index 82b18b833..207ba20a0 100755
--- a/xenia-build
+++ b/xenia-build
@@ -34,8 +34,11 @@ def main():
 
     # Check git exists.
     if not has_bin('git'):
-        print('ERROR: git must be installed and on PATH.')
-        sys.exit(1)
+        print('WARNING: Git should be installed and on PATH. Version info will be omitted from all binaries!')
+        print('')
+    elif not git_is_repository():
+        print('WARNING: The source tree is unversioned. Version info will be omitted from all binaries!')
+        print('')
 
     # Check python version.
     if not sys.version_info[:2] >= (3, 6):
@@ -185,13 +188,14 @@ def get_bin(binary):
     return None
 
 
-def shell_call(command, throw_on_error=True, stdout_path=None, shell=False):
+def shell_call(command, throw_on_error=True, stdout_path=None, stderr_path=None, shell=False):
     """Executes a shell command.
 
     Args:
       command: Command to execute, as a list of parameters.
       throw_on_error: Whether to throw an error or return the status code.
       stdout_path: File path to write stdout output to.
+      stderr_path: File path to write stderr output to.
 
     Returns:
       If throw_on_error is False the status code of the call will be returned.
@@ -199,21 +203,49 @@ def shell_call(command, throw_on_error=True, stdout_path=None, shell=False):
     stdout_file = None
     if stdout_path:
         stdout_file = open(stdout_path, 'w')
+    stderr_file = None
+    if stderr_path:
+        stderr_file = open(stderr_path, 'w')
     result = 0
     try:
         if throw_on_error:
             result = 1
-            subprocess.check_call(command, shell=shell, stdout=stdout_file)
+            subprocess.check_call(command, shell=shell, stdout=stdout_file, stderr=stderr_file)
             result = 0
         else:
-            result = subprocess.call(command, shell=shell, stdout=stdout_file)
+            result = subprocess.call(command, shell=shell, stdout=stdout_file, stderr=stderr_file)
     finally:
         if stdout_file:
             stdout_file.close()
+        if stderr_file:
+            stderr_file.close()
     return result
 
 
-def get_git_head_info():
+def generate_version_h():
+    """Generates a build/version.h file that contains current git info.
+    """
+    if git_is_repository():
+        (branch_name, commit, commit_short) = git_get_head_info()
+    else:
+        branch_name = 'tarball'
+        commit = ':(-dont-do-this'
+        commit_short = ':('
+
+    contents = '''// Autogenerated by `xb premake`.
+  #ifndef GENERATED_VERSION_H_
+  #define GENERATED_VERSION_H_
+  #define XE_BUILD_BRANCH "%s"
+  #define XE_BUILD_COMMIT "%s"
+  #define XE_BUILD_COMMIT_SHORT "%s"
+  #define XE_BUILD_DATE __DATE__
+  #endif  // GENERATED_VERSION_H_
+  ''' % (branch_name, commit, commit_short)
+    with open('build/version.h', 'w') as f:
+        f.write(contents)
+
+
+def git_get_head_info():
     """Queries the current branch and commit checksum from git.
 
     Returns:
@@ -247,58 +279,28 @@ def get_git_head_info():
     return branch_name, commit, commit_short
 
 
-def generate_version_h():
-    """Generates a build/version.h file that contains current git info.
+def git_is_repository():
+    """Checks if git is available and this source tree is versioned.
     """
-    (branch_name, commit, commit_short) = get_git_head_info()
-    contents = '''// Autogenerated by `xb premake`.
-  #ifndef GENERATED_VERSION_H_
-  #define GENERATED_VERSION_H_
-  #define XE_BUILD_BRANCH "%s"
-  #define XE_BUILD_COMMIT "%s"
-  #define XE_BUILD_COMMIT_SHORT "%s"
-  #define XE_BUILD_DATE __DATE__
-  #endif  // GENERATED_VERSION_H_
-  ''' % (branch_name, commit, commit_short)
-    with open('build/version.h', 'w') as f:
-        f.write(contents)
+    if not has_bin('git'):
+        return False
+    return shell_call([
+        'git',
+        'rev-parse',
+        '--is-inside-work-tree',
+        ], throw_on_error=False, stdout_path=os.devnull, stderr_path=os.devnull) == 0
 
 
 def git_submodule_update():
     """Runs a full recursive git submodule init and update.
-
-    Older versions of git do not support 'update --init --recursive'. We could
-    check and run it on versions that do support it and speed things up a bit.
     """
-    if True:
-        shell_call([
-            'git',
-            'submodule',
-            'update',
-            '--init',
-            '--recursive',
-            ])
-    else:
-        shell_call([
-            'git',
-            'submodule',
-            'init',
-            ])
-        shell_call([
-            'git',
-            'submodule',
-            'foreach',
-            '--recursive',
-            'git',
-            'submodule',
-            'init',
-            ])
-        shell_call([
-            'git',
-            'submodule',
-            'update',
-            '--recursive',
-            ])
+    shell_call([
+        'git',
+        'submodule',
+        'update',
+        '--init',
+        '--recursive',
+        ])
 
 
 def get_clang_format_binary():
@@ -491,7 +493,10 @@ class SetupCommand(Command):
 
         # Setup submodules.
         print('- git submodule init / update...')
-        git_submodule_update()
+        if git_is_repository():
+            git_submodule_update()
+        else:
+            print('WARNING: Git not available or not a repository. Dependencies may be missing.')
         print('')
 
         print('- running premake...')