Merge pull request #57 from chrisps/canary_experimental

Add separate VMX/fpu mxcsr
2022-07-31 18:43:30 +02:00 · 2022-07-31 18:43:30 +02:00 · 332f69f36b
parent 3185b0ac9c 968f656d96
commit 332f69f36b
18 changed files with 687 additions and 611 deletions
--- a/src/xenia/cpu/backend/x64/x64_backend.cc
+++ b/src/xenia/cpu/backend/x64/x64_backend.cc
@ -692,6 +692,12 @@ void X64Backend::InitializeBackendContext(void* ctx) {
  X64BackendContext* bctx = reinterpret_cast<X64BackendContext*>(
      reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));
  bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
  bctx->mxcsr_fpu =
      DEFAULT_FPU_MXCSR;  // idk if this is right, check on rgh what the
                          // rounding on ppc is at startup
  bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR;
  bctx->flags = 0;
  // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
  bctx->Ox1000 = 0x1000;
 }
 }  // namespace x64
--- a/src/xenia/cpu/backend/x64/x64_backend.h
+++ b/src/xenia/cpu/backend/x64/x64_backend.h
@ -37,9 +37,17 @@ typedef void (*ResolveFunctionThunk)();
 // negatively index the membase reg)
 // Per-context backend state. InitializeBackendContext locates it at
 // (ctx - sizeof(X64BackendContext)), i.e. directly below the guest context
 // pointer, so emitted code reaches these fields with negative offsets.
 // Emitted code addresses the fields via offsetof(), so keep the layout in sync
 // with every user when changing it.
 struct X64BackendContext {
  // Cached pointer to ResolveFunction.
  void* ResolveFunction_Ptr;
  // MXCSR image loaded (vldmxcsr) when switching to scalar FPU mode.
  // NOTE(review): an earlier comment here said the rounding-mode
  // implementation affects both vmx and the fpu; confirm whether that still
  // holds now that the two images are stored separately.
  unsigned int mxcsr_fpu;
  // MXCSR image loaded (vldmxcsr) when switching to VMX (vector) mode.
  unsigned int mxcsr_vmx;
  // Bit 0 tracks which image is live: 0 = FPU mxcsr, 1 = VMX mxcsr.
  unsigned int flags;
  // Holds the constant 0x1000 so that tail-emitted code can add it from
  // memory, which encodes about 2 bytes shorter than an immediate add.
  unsigned int Ox1000;
 };
 // Initial MXCSR image for VMX (vector) code: flush-to-zero and
 // denormals-are-zero enabled, exception bits taken from _MM_MASK_MASK
 // (presumably "all exceptions masked"), rounding-control bits left at zero.
 constexpr unsigned int DEFAULT_VMX_MXCSR =
    0x8000 |                   // flush to zero (FTZ)
    0x0040 | (_MM_MASK_MASK);  // denormals are zero (DAZ) | exception masks
 // Initial MXCSR image for scalar FPU code: no FTZ/DAZ bits, rounding-control
 // bits zero. NOTE(review): 0x1F80 is presumably the same value as
 // _MM_MASK_MASK - confirm against the intrinsics header.
 constexpr unsigned int DEFAULT_FPU_MXCSR = 0x1F80;
 class X64Backend : public Backend {
 public:
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@ -320,6 +320,8 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
  // Body.
  auto block = builder->first_block();
  while (block) {
    ForgetMxcsrMode();  // at start of block, mxcsr mode is undefined
    // Mark block labels.
    auto label = block->label_head;
    while (label) {
@ -490,6 +492,7 @@ uint64_t ResolveFunction(void* raw_context, uint64_t target_address) {
 void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
  assert_not_null(function);
  ForgetMxcsrMode();
  auto fn = static_cast<X64Function*>(function);
  // Resolve address to the function to call and store in rax.
@ -564,6 +567,7 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
 void X64Emitter::CallIndirect(const hir::Instr* instr,
                              const Xbyak::Reg64& reg) {
  ForgetMxcsrMode();
  // Check if return.
  if (instr->flags & hir::CALL_POSSIBLE_RETURN) {
    cmp(reg.cvt32(), dword[rsp + StackLayout::GUEST_RET_ADDR]);
@ -617,6 +621,7 @@ uint64_t UndefinedCallExtern(void* raw_context, uint64_t function_ptr) {
  return 0;
 }
 void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) {
  ForgetMxcsrMode();
  bool undefined = true;
  if (function->behavior() == Function::Behavior::kBuiltin) {
    auto builtin_function = static_cast<const BuiltinFunction*>(function);
@ -696,11 +701,13 @@ Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) {
 }
 // Important: If you change these, you must update the thunks in x64_backend.cc!
-Xbyak::Reg64 X64Emitter::GetContextReg() { return rsi; }
+Xbyak::Reg64 X64Emitter::GetContextReg() const { return rsi; }
-Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdi; }
+Xbyak::Reg64 X64Emitter::GetMembaseReg() const { return rdi; }
 void X64Emitter::ReloadMembase() {
-  mov(GetMembaseReg(), qword[GetContextReg() + 8]);  // membase
+  mov(GetMembaseReg(),
      qword[GetContextReg() +
            offsetof(ppc::PPCContext, virtual_membase)]);  // membase
 }
 // Len Assembly                                   Byte Sequence
@ -917,7 +924,7 @@ static const vec128_t xmm_consts[] = {
    /* XMMQNaN                */ vec128i(0x7FC00000u),
    /* XMMInt127              */ vec128i(0x7Fu),
    /* XMM2To32               */ vec128f(0x1.0p32f),
-    /* xmminf */ vec128i(0x7f800000),
+    /* XMMFloatInf */ vec128i(0x7f800000),
    /* XMMIntsToBytes*/
    v128_setr_bytes(0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
@ -938,9 +945,7 @@ static const vec128_t xmm_consts[] = {
    /*XMMVSRShlByteshuf*/
    v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
    // XMMVSRMask
-    vec128b(1)
+    vec128b(1)};
 };
 void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
  for (auto& vec : xmm_consts) {
@ -1347,7 +1352,7 @@ SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) {
  return SimdDomain::DONTCARE;
 }
-Xbyak::Address X64Emitter::GetBackendCtxPtr(int offset_in_x64backendctx) {
+Xbyak::Address X64Emitter::GetBackendCtxPtr(int offset_in_x64backendctx) const {
  /*
    index context ptr negatively to get to backend ctx field
  */
@ -1368,6 +1373,93 @@ Xbyak::Label& X64Emitter::NewCachedLabel() {
  label_cache_.push_back(tmp);
  return *tmp;
 }
 // Emits a runtime-checked MXCSR switch for the case where the emitter does
 // not statically know which MXCSR image is live.
 //
 // The emitted sequence updates bit 0 of the backend flags word to the new
 // mode with btr/bts (which leave the previous bit value in the carry flag)
 // and, only if the previous value shows the other image was live, jumps to a
 // stub that loads the wanted image and jumps back.
 //
 // switching_to_fpu: true to end up with the FPU mxcsr, false for the VMX one.
 // Does not touch the emitter's mxcsr_mode_ tracking; the caller does that.
 template<bool switching_to_fpu>
 static void ChangeMxcsrModeDynamicHelper(X64Emitter& e) {
  auto flags = e.GetBackendFlagsPtr();
  if (switching_to_fpu) {
    e.btr(flags, 0);  // bit 0 set to 0 = is fpu mode
  } else {
    e.bts(flags, 0); // bit 0 set to 1 = is vmx mode
  }
  // Label the reload stub jumps back to once the mxcsr has been loaded.
  Xbyak::Label& come_back = e.NewCachedLabel();
  // Reload stub, registered through AddToTail (presumably emitted out of line
  // after the main body so the common no-reload path falls straight through).
  Xbyak::Label& reload_bailout =
      e.AddToTail([&come_back](X64Emitter& e, Xbyak::Label& thislabel) {
        e.L(thislabel);
        if (switching_to_fpu) {
          e.LoadFpuMxcsrDirect();
        } else {
          e.LoadVmxMxcsrDirect();
 		}
        e.jmp(come_back, X64Emitter::T_NEAR);
      });
  if (switching_to_fpu) {
    e.jc(reload_bailout,
         X64Emitter::T_NEAR);  // carry set: we were in VMX mxcsr mode, so the
                               // FPU mxcsr has to be loaded.
  } else {
    e.jnc(reload_bailout,
         X64Emitter::T_NEAR);  // carry clear: we were in FPU mxcsr mode, so
                               // the VMX mxcsr has to be loaded.
  }
  e.L(come_back);
 }
 // Makes sure the MXCSR image required by the next emitted instructions is
 // live and records the new mode in mxcsr_mode_.
 //
 // new_mode:    the mode to switch to; must not be MXCSRMode::Unknown.
 // already_set: the caller has already emitted the vldmxcsr for new_mode
 //              itself, so no load is emitted here. The backend flags bit is
 //              still updated so that later dynamic checks (after
 //              ForgetMxcsrMode) see the correct live mode.
 //
 // Returns true only when this call emitted an unconditional mxcsr load, i.e.
 // the previous mode was statically known, different, and !already_set.
 bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
  if (new_mode == mxcsr_mode_) {
    return false;
  }
  assert_true(new_mode != MXCSRMode::Unknown);

  const bool previous_mode_unknown = mxcsr_mode_ == MXCSRMode::Unknown;
  mxcsr_mode_ = new_mode;

  if (already_set) {
    // The caller loaded the mxcsr, but the flags bit must still reflect the
    // live mode. This has to happen whether or not the previous mode was
    // statically known: otherwise a known Vmx -> Fpu (or Fpu -> Vmx)
    // transition leaves a stale bit behind and the next dynamic check skips a
    // reload it actually needs.
    if (new_mode == MXCSRMode::Fpu) {
      btr(GetBackendFlagsPtr(), 0);
    } else if (new_mode == MXCSRMode::Vmx) {
      bts(GetBackendFlagsPtr(), 0);
    } else {
      assert_unhandled_case(new_mode);
    }
    return false;
  }

  if (previous_mode_unknown) {
    // Live mode is only known at runtime: test-and-update the flags bit and
    // reload out of line if the other image was live.
    if (new_mode == MXCSRMode::Fpu) {
      ChangeMxcsrModeDynamicHelper<true>(*this);
    } else if (new_mode == MXCSRMode::Vmx) {
      ChangeMxcsrModeDynamicHelper<false>(*this);
    } else {
      assert_unhandled_case(new_mode);
    }
    return false;
  }

  // Previous mode is statically known and differs: load unconditionally.
  if (new_mode == MXCSRMode::Fpu) {
    LoadFpuMxcsrDirect();
    btr(GetBackendFlagsPtr(), 0);
    return true;
  } else if (new_mode == MXCSRMode::Vmx) {
    LoadVmxMxcsrDirect();
    bts(GetBackendFlagsPtr(), 0);
    return true;
  } else {
    assert_unhandled_case(new_mode);
  }
  return false;
 }
 void X64Emitter::LoadFpuMxcsrDirect() {
  vldmxcsr(GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)));
 }
 void X64Emitter::LoadVmxMxcsrDirect() {
  vldmxcsr(GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_vmx)));
 }
 // Returns the address operand of X64BackendContext::flags, with its operand
 // size set to 32 bits to match the unsigned int field.
 Xbyak::Address X64Emitter::GetBackendFlagsPtr() const {
  Xbyak::Address flags_operand =
      GetBackendCtxPtr(offsetof(X64BackendContext, flags));
  flags_operand.setBit(32);
  return flags_operand;
 }
 }  // namespace x64
 }  // namespace backend
 }  // namespace cpu
--- a/src/xenia/cpu/backend/x64/x64_emitter.h
+++ b/src/xenia/cpu/backend/x64/x64_emitter.h
@ -65,6 +65,12 @@ enum class SimdDomain : uint32_t {
               // CONFLICTING means its used in multiple domains)
 };
 // Which MXCSR image the emitter believes is live at the current point of the
 // code being generated.
 enum class MXCSRMode : uint32_t {
  // Not statically known; the mode has to be checked at runtime.
  Unknown,
  // The FPU mxcsr image is live.
  Fpu,
  // The VMX mxcsr image is live.
  Vmx,
 };
 static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
  if (dom1 == dom2) {
    return dom1;
@ -283,8 +289,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
  Xbyak::Reg64 GetNativeParam(uint32_t param);
-  Xbyak::Reg64 GetContextReg();
+  Xbyak::Reg64 GetContextReg() const;
-  Xbyak::Reg64 GetMembaseReg();
+  Xbyak::Reg64 GetMembaseReg() const;
  bool CanUseMembaseLow32As0() const { return may_use_membase32_as_zero_reg_; }
  void ReloadMembase();
@ -295,7 +301,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
  void MovMem64(const Xbyak::RegExp& addr, uint64_t v);
  Xbyak::Address GetXmmConstPtr(XmmConst id);
-  Xbyak::Address GetBackendCtxPtr(int offset_in_x64backendctx);
+  Xbyak::Address GetBackendCtxPtr(int offset_in_x64backendctx) const;
  void LoadConstantXmm(Xbyak::Xmm dest, float v);
  void LoadConstantXmm(Xbyak::Xmm dest, double v);
@ -304,6 +310,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
  Xbyak::Address StashConstantXmm(int index, float v);
  Xbyak::Address StashConstantXmm(int index, double v);
  Xbyak::Address StashConstantXmm(int index, const vec128_t& v);
  Xbyak::Address GetBackendFlagsPtr() const;
  void* FindByteConstantOffset(unsigned bytevalue);
  void* FindWordConstantOffset(unsigned wordvalue);
  void* FindDwordConstantOffset(unsigned bytevalue);
@ -319,6 +326,16 @@ class X64Emitter : public Xbyak::CodeGenerator {
  size_t stack_size() const { return stack_size_; }
  SimdDomain DeduceSimdDomain(const hir::Value* for_value);
  // Drops the statically tracked MXCSR mode, so the next ChangeMxcsrMode call
  // sees MXCSRMode::Unknown as the previous mode.
  void ForgetMxcsrMode() { mxcsr_mode_ = MXCSRMode::Unknown; }
  /*
    Switches the tracked MXCSR mode to new_mode, emitting whatever load or
    runtime check is needed.
    Returns true if it had to load mxcsr. DOT_PRODUCT can use this to skip
    clearing the overflow flag, as it will never be set in the vmx mxcsr image
    (NOTE(review): the original comment said "vmx fpscr" - confirm).
    already_set means that the caller already did the vldmxcsr itself; used for
    SET_ROUNDING_MODE.
  */
  bool ChangeMxcsrMode(MXCSRMode new_mode, bool already_set=false);
  // Unsafe: emits the load only, does not change mxcsr_mode_.
  void LoadFpuMxcsrDirect();
  // Unsafe: emits the load only, does not change mxcsr_mode_.
  void LoadVmxMxcsrDirect();
 protected:
  void* Emplace(const EmitFunctionInfo& func_info,
                GuestFunction* function = nullptr);
@ -359,6 +376,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
  std::vector<Xbyak::Label*>
      label_cache_;  // for creating labels that need to be referenced much
                     // later by tail emitters
  MXCSRMode mxcsr_mode_ = MXCSRMode::Unknown;
 };
 }  // namespace x64
--- a/src/xenia/cpu/backend/x64/x64_op.h
+++ b/src/xenia/cpu/backend/x64/x64_op.h
@ -616,7 +616,31 @@ struct Sequence {
    }
  }
 };
 // Returns an Xmm register holding the value of `input`.
 //
 // A non-constant operand is returned as-is (converted to Xmm). A constant
 // operand is first materialized into xmm_to_use_if_const, which is then
 // returned: 4- and 8-byte integral constants are placed in the low lanes of
 // an otherwise zeroed vector, any other constant type is passed straight to
 // LoadConstantXmm.
 template <typename T>
 static Xmm GetInputRegOrConstant(X64Emitter& e, const T& input,
                                 Xmm xmm_to_use_if_const) {
  // Fast path: the operand already lives in a register.
  if (!input.is_constant) {
    return input;
  }

  using raw_constant_t = std::remove_reference_t<decltype(input.constant())>;
  if constexpr (!std::is_integral_v<raw_constant_t>) {
    e.LoadConstantXmm(xmm_to_use_if_const, input.constant());
  } else {
    vec128_t widened = vec128b(0);
    if constexpr (sizeof(raw_constant_t) == 4) {
      widened.i32[0] = input.constant();
    } else if constexpr (sizeof(raw_constant_t) == 8) {
      widened.low = input.constant();
    } else {
      // Only 32- and 64-bit integer constants are supported here.
      assert_unhandled_case(sizeof(raw_constant_t));
    }
    e.LoadConstantXmm(xmm_to_use_if_const, widened);
  }
  return xmm_to_use_if_const;
 }
 }  // namespace x64
 }  // namespace backend
 }  // namespace cpu
--- a/src/xenia/cpu/backend/x64/x64_seq_control.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_control.cc
@ -257,6 +257,7 @@ struct CALL_TRUE_I8
    e.jz(skip);
    e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
    e.L(skip);
    e.ForgetMxcsrMode();
  }
 };
 struct CALL_TRUE_I16
@ -268,6 +269,7 @@ struct CALL_TRUE_I16
    e.jz(skip);
    e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
    e.L(skip);
    e.ForgetMxcsrMode();
  }
 };
 struct CALL_TRUE_I32
@ -279,6 +281,7 @@ struct CALL_TRUE_I32
    e.jz(skip);
    e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
    e.L(skip);
    e.ForgetMxcsrMode();
  }
 };
 struct CALL_TRUE_I64
@ -290,6 +293,7 @@ struct CALL_TRUE_I64
    e.jz(skip);
    e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
    e.L(skip);
    e.ForgetMxcsrMode();
  }
 };
 struct CALL_TRUE_F32
@ -301,6 +305,7 @@ struct CALL_TRUE_F32
    e.jz(skip);
    e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
    e.L(skip);
    e.ForgetMxcsrMode();
  }
 };
@ -313,6 +318,7 @@ struct CALL_TRUE_F64
    e.jz(skip);
    e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
    e.L(skip);
    e.ForgetMxcsrMode();
  }
 };
 EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16,
@ -326,6 +332,7 @@ struct CALL_INDIRECT
    : Sequence<CALL_INDIRECT, I<OPCODE_CALL_INDIRECT, VoidOp, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    e.CallIndirect(i.instr, i.src1);
    e.ForgetMxcsrMode();
  }
 };
 EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT, CALL_INDIRECT);
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@ -16,7 +16,13 @@
 // For OPCODE_PACK/OPCODE_UNPACK
 #include "third_party/half/include/half.hpp"
 #include "xenia/base/cvar.h"
 #include "xenia/cpu/backend/x64/x64_stack_layout.h"
 DEFINE_bool(use_extended_range_half, true,
            "Emulate extended range half-precision, may be slower on games "
            "that use it heavily",
            "CPU");
 namespace xe {
 namespace cpu {
 namespace backend {
@ -31,6 +37,8 @@ struct VECTOR_CONVERT_I2F
    : Sequence<VECTOR_CONVERT_I2F,
               I<OPCODE_VECTOR_CONVERT_I2F, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    e.ChangeMxcsrMode(MXCSRMode::Vmx);
    Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3);
    // flags = ARITHMETIC_UNSIGNED
    if (i.instr->flags & ARITHMETIC_UNSIGNED) {
      // Round manually to (1.stored mantissa bits * 2^31) or to 2^32 to the
@ -46,8 +54,8 @@ struct VECTOR_CONVERT_I2F
      // be 4294967296.0f.
      // xmm0 = src + 0b01111111 + ((src >> 8) & 1)
      // (xmm1 also used to launch reg + mem early and to require it late)
-      e.vpaddd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMInt127));
+      e.vpaddd(e.xmm1, src1, e.GetXmmConstPtr(XMMInt127));
-      e.vpslld(e.xmm0, i.src1, 31 - 8);
+      e.vpslld(e.xmm0, src1, 31 - 8);
      e.vpsrld(e.xmm0, e.xmm0, 31);
      e.vpaddd(e.xmm0, e.xmm0, e.xmm1);
      // xmm0 = (0xFF800000 | 23 explicit mantissa bits), or 0 if overflowed
@ -63,13 +71,13 @@ struct VECTOR_CONVERT_I2F
      // Convert from signed integer to float.
      // xmm1 = [0x00000000, 0x7FFFFFFF] case result
-      e.vcvtdq2ps(e.xmm1, i.src1);
+      e.vcvtdq2ps(e.xmm1, src1);
      // Merge the two ways depending on whether the number is >= 0x80000000
      // (has high bit set).
-      e.vblendvps(i.dest, e.xmm1, e.xmm0, i.src1);
+      e.vblendvps(i.dest, e.xmm1, e.xmm0, src1);
    } else {
-      e.vcvtdq2ps(i.dest, i.src1);
+      e.vcvtdq2ps(i.dest, src1);
    }
  }
 };
@ -82,9 +90,11 @@ struct VECTOR_CONVERT_F2I
    : Sequence<VECTOR_CONVERT_F2I,
               I<OPCODE_VECTOR_CONVERT_F2I, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    e.ChangeMxcsrMode(MXCSRMode::Vmx);
    Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3);
    if (i.instr->flags & ARITHMETIC_UNSIGNED) {
      // clamp to min 0
-      e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero));
+      e.vmaxps(e.xmm0, src1, e.GetXmmConstPtr(XMMZero));
      // xmm1 = mask of values >= (unsigned)INT_MIN
      e.vcmpgeps(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS));
@ -108,14 +118,14 @@ struct VECTOR_CONVERT_F2I
      e.vpor(i.dest, i.dest, e.xmm0);
    } else {
      // xmm2 = NaN mask
-      e.vcmpunordps(e.xmm2, i.src1, i.src1);
+      e.vcmpunordps(e.xmm2, src1, src1);
      // convert packed floats to packed dwords
-      e.vcvttps2dq(e.xmm0, i.src1);
+      e.vcvttps2dq(e.xmm0, src1);
      // (high bit) xmm1 = dest is indeterminate and i.src1 >= 0
      e.vpcmpeqd(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMIntMin));
-      e.vpandn(e.xmm1, i.src1, e.xmm1);
+      e.vpandn(e.xmm1, src1, e.xmm1);
      // saturate positive values
      e.vblendvps(i.dest, e.xmm0, e.GetXmmConstPtr(XMMIntMax), e.xmm1);
@ -131,6 +141,7 @@ struct VECTOR_DENORMFLUSH
    : Sequence<VECTOR_DENORMFLUSH,
               I<OPCODE_VECTOR_DENORMFLUSH, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    e.ChangeMxcsrMode(MXCSRMode::Vmx);
    e.vxorps(e.xmm1, e.xmm1, e.xmm1);  // 0.25 P0123
    e.vandps(e.xmm0, i.src1,
@ -352,6 +363,7 @@ struct VECTOR_COMPARE_EQ_V128
              e.vpcmpeqd(dest, src1, src2);
              break;
            case FLOAT32_TYPE:
              e.ChangeMxcsrMode(MXCSRMode::Vmx);
              e.vcmpeqps(dest, src1, src2);
              break;
          }
@ -380,6 +392,7 @@ struct VECTOR_COMPARE_SGT_V128
              e.vpcmpgtd(dest, src1, src2);
              break;
            case FLOAT32_TYPE:
              e.ChangeMxcsrMode(MXCSRMode::Vmx);
              e.vcmpgtps(dest, src1, src2);
              break;
          }
@ -414,6 +427,7 @@ struct VECTOR_COMPARE_SGE_V128
              e.vpor(dest, e.xmm0);
              break;
            case FLOAT32_TYPE:
              e.ChangeMxcsrMode(MXCSRMode::Vmx);
              e.vcmpgeps(dest, src1, src2);
              break;
          }
@ -441,6 +455,7 @@ struct VECTOR_COMPARE_UGT_V128
        sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
        break;
      case FLOAT32_TYPE:
        e.ChangeMxcsrMode(MXCSRMode::Vmx);
        sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
        break;
      default:
@ -498,6 +513,7 @@ struct VECTOR_COMPARE_UGE_V128
        sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
        break;
      case FLOAT32_TYPE:
        e.ChangeMxcsrMode(MXCSRMode::Vmx);
        sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
        break;
    }
@ -620,6 +636,7 @@ struct VECTOR_ADD
            case FLOAT32_TYPE:
              assert_false(is_unsigned);
              assert_false(saturate);
              e.ChangeMxcsrMode(MXCSRMode::Vmx);
              e.vaddps(dest, src1, src2);
              break;
            default:
@ -711,6 +728,7 @@ struct VECTOR_SUB
              }
              break;
            case FLOAT32_TYPE:
              e.ChangeMxcsrMode(MXCSRMode::Vmx);
              e.vsubps(dest, src1, src2);
              break;
            default:
@ -2003,6 +2021,7 @@ EMITTER_OPCODE_TABLE(OPCODE_SWIZZLE, SWIZZLE);
 // ============================================================================
 struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    e.ChangeMxcsrMode(MXCSRMode::Vmx);
    switch (i.instr->flags & PACK_TYPE_MODE) {
      case PACK_TYPE_D3DCOLOR:
        EmitD3DCOLOR(e, i);
@ -2062,10 +2081,15 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
    alignas(16) uint16_t b[8];
    _mm_store_ps(a, src1);
    std::memset(b, 0, sizeof(b));
-
+    if (!cvars::use_extended_range_half) {
      for (int i = 0; i < 2; i++) {
        b[7 - i] = half_float::detail::float2half<std::round_toward_zero>(a[i]);
      }
    } else {
      for (int i = 0; i < 2; i++) {
        b[7 - i] = float_to_xenos_half(a[i]);
      }
    }
    return _mm_load_si128(reinterpret_cast<__m128i*>(b));
  }
@ -2074,7 +2098,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
    // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx
    // dest = [(src1.x | src1.y), 0, 0, 0]
-    if (e.IsFeatureEnabled(kX64EmitF16C)) {
+    if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
      Xmm src;
      if (i.src1.is_constant) {
        src = i.dest;
@ -2101,11 +2125,16 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
    alignas(16) uint16_t b[8];
    _mm_store_ps(a, src1);
    std::memset(b, 0, sizeof(b));
-
+    if (!cvars::use_extended_range_half) {
      for (int i = 0; i < 4; i++) {
        b[7 - (i ^ 2)] =
            half_float::detail::float2half<std::round_toward_zero>(a[i]);
      }
    } else {
      for (int i = 0; i < 4; i++) {
        b[7 - (i ^ 2)] = float_to_xenos_half(a[i]);
      }
    }
    return _mm_load_si128(reinterpret_cast<__m128i*>(b));
  }
@ -2113,7 +2142,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
    assert_true(i.src2.value->IsConstantZero());
    // dest = [(src1.z | src1.w), (src1.x | src1.y), 0, 0]
-    if (e.IsFeatureEnabled(kX64EmitF16C)) {
+    if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
      Xmm src;
      if (i.src1.is_constant) {
        src = i.dest;
@ -2420,6 +2449,7 @@ EMITTER_OPCODE_TABLE(OPCODE_PACK, PACK);
 // ============================================================================
 struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    e.ChangeMxcsrMode(MXCSRMode::Vmx);
    switch (i.instr->flags & PACK_TYPE_MODE) {
      case PACK_TYPE_D3DCOLOR:
        EmitD3DCOLOR(e, i);
@ -2478,10 +2508,15 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
    alignas(16) float b[4];
    _mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
    if (!cvars::use_extended_range_half) {
      for (int i = 0; i < 2; i++) {
        b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]);
      }
-
+    } else {
      for (int i = 0; i < 2; i++) {
        b[i] = xenos_half_to_float(a[VEC128_W(6 + i)]);
      }
    }
    // Constants, or something
    b[2] = 0.f;
    b[3] = 1.f;
@ -2501,7 +2536,9 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
    // Also zero out the high end.
    // TODO(benvanik): special case constant unpacks that just get 0/1/etc.
-    if (e.IsFeatureEnabled(kX64EmitF16C)) {
+    if (e.IsFeatureEnabled(kX64EmitF16C) &&
        !cvars::use_extended_range_half) {  // todo: can use cvtph and bit logic
                                            // to implement
      Xmm src;
      if (i.src1.is_constant) {
        src = i.dest;
@ -2534,16 +2571,21 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
    alignas(16) uint16_t a[8];
    alignas(16) float b[4];
    _mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
-
+    if (!cvars::use_extended_range_half) {
      for (int i = 0; i < 4; i++) {
        b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]);
      }
    } else {
      for (int i = 0; i < 4; i++) {
        b[i] = xenos_half_to_float(a[VEC128_W(4 + i)]);
      }
    }
    return _mm_load_ps(b);
  }
  static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) {
    // src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0]
-    if (e.IsFeatureEnabled(kX64EmitF16C)) {
+    if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
      Xmm src;
      if (i.src1.is_constant) {
        src = i.dest;
@ -2805,6 +2847,32 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
 };
 EMITTER_OPCODE_TABLE(OPCODE_UNPACK, UNPACK);
 // OPCODE_SET_NJM: rewrites the VMX mxcsr image in the backend context
 // according to src1 and makes it the live MXCSR.
 //   src1 == 0 : daz/flush-to-zero turned off (image = _MM_MASK_MASK)
 //   src1 != 0 : image = DEFAULT_VMX_MXCSR
 // The non-constant path clobbers eax and edx.
 struct SET_NJM_I8 : Sequence<SET_NJM_I8, I<OPCODE_SET_NJM, VoidOp, I8Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    auto addr_vmx = e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_vmx));
    addr_vmx.setBit(32);
    if (i.src1.is_constant) {
      if (i.src1.constant() == 0) {
        // turn off daz/flush2z
        e.mov(addr_vmx, _MM_MASK_MASK);
      } else {
        e.mov(addr_vmx, DEFAULT_VMX_MXCSR);
      }
    } else {
      // Select the image without a branch: the movs do not touch ZF, so the
      // cmove still sees the result of the test.
      e.test(i.src1, i.src1);
      e.mov(e.edx, DEFAULT_VMX_MXCSR);
      e.mov(e.eax, _MM_MASK_MASK);
      e.cmove(e.edx, e.eax);
      e.mov(addr_vmx, e.edx);
    }
    // The stored image just changed, so the hardware MXCSR must be reloaded
    // even if VMX mode is already live. A plain ChangeMxcsrMode(Vmx) would emit
    // nothing when the emitter already tracks Vmx, and would skip the reload
    // in the dynamic path when the flags bit says Vmx, leaving the old
    // setting active. Forget the tracked mode first so that the already_set
    // path below always rewrites the flags bit.
    e.ForgetMxcsrMode();
    e.LoadVmxMxcsrDirect();
    e.ChangeMxcsrMode(MXCSRMode::Vmx, true);
  }
 };
 EMITTER_OPCODE_TABLE(OPCODE_SET_NJM, SET_NJM_I8);
 }  // namespace x64
 }  // namespace backend
 }  // namespace cpu
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
--- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
+++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
@ -20,6 +20,9 @@
 DEFINE_bool(inline_mmio_access, true, "Inline constant MMIO loads and stores.",
            "CPU");
 DEFINE_bool(permit_float_constant_evaluation, false, "Allow float constant evaluation, may produce incorrect results and break games math",
            "CPU");
 namespace xe {
 namespace cpu {
 namespace compiler {
@ -68,8 +71,24 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
  result = false;
  auto block = builder->first_block();
  while (block) {
-    auto i = block->instr_head;
+    for (auto i = block->instr_head; i; i = i->next) {
-    while (i) {
+      if (((i->opcode->flags & OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING) != 0) &&
          !cvars::permit_float_constant_evaluation) {
        continue;
      }
      bool might_be_floatop = false;
      i->VisitValueOperands(
          [&might_be_floatop](Value* current_opnd, uint32_t opnd_index) {
            might_be_floatop |= current_opnd->MaybeFloaty();
          });
      if (i->dest) {
        might_be_floatop |= i->dest->MaybeFloaty();
      }
 	  bool should_skip_because_of_float =
          might_be_floatop && !cvars::permit_float_constant_evaluation;
      auto v = i->dest;
      switch (i->opcode->num) {
        case OPCODE_DEBUG_BREAK_TRUE:
@ -452,7 +471,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          break;
        case OPCODE_ADD:
-          if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
              !should_skip_because_of_float) {
            v->set_from(i->src1.value);
            v->Add(i->src2.value);
            i->Remove();
@ -481,7 +501,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          }
          break;
        case OPCODE_SUB:
-          if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
              !should_skip_because_of_float) {
            v->set_from(i->src1.value);
            v->Sub(i->src2.value);
            i->Remove();
@ -489,6 +510,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          }
          break;
        case OPCODE_MUL:
          if (!should_skip_because_of_float) {
            if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
              v->set_from(i->src1.value);
              v->Mul(i->src2.value);
@ -518,6 +540,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
                }
              }
            }
          }
          break;
        case OPCODE_MUL_HI:
          if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
@ -528,6 +551,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          }
          break;
        case OPCODE_DIV:
          if (!should_skip_because_of_float) {
            if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
              v->set_from(i->src1.value);
              v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
@ -551,50 +575,6 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
                }
              }
            }
          break;
        case OPCODE_MUL_ADD:
          if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
            if (i->src3.value->IsConstant()) {
              v->set_from(i->src1.value);
              Value::MulAdd(v, i->src1.value, i->src2.value, i->src3.value);
              i->Remove();
              result = true;
            } else {
              // Multiply part is constant.
              Value* mul = builder->AllocValue();
              mul->set_from(i->src1.value);
              mul->Mul(i->src2.value);
              Value* add = i->src3.value;
              i->Replace(&OPCODE_ADD_info, 0);
              i->set_src1(mul);
              i->set_src2(add);
              result = true;
            }
          }
          break;
        case OPCODE_MUL_SUB:
          if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
            // Multiply part is constant.
            if (i->src3.value->IsConstant()) {
              v->set_from(i->src1.value);
              Value::MulSub(v, i->src1.value, i->src2.value, i->src3.value);
              i->Remove();
              result = true;
            } else {
              // Multiply part is constant.
              Value* mul = builder->AllocValue();
              mul->set_from(i->src1.value);
              mul->Mul(i->src2.value);
              Value* add = i->src3.value;
              i->Replace(&OPCODE_SUB_info, 0);
              i->set_src1(mul);
              i->set_src2(add);
              result = true;
            }
          }
          break;
        case OPCODE_MAX:
@ -925,7 +905,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            result = true;
          }
          break;
-        case OPCODE_VECTOR_DENORMFLUSH:
+        case OPCODE_VECTOR_DENORMFLUSH:  // this one is okay to constant
                                         // evaluate, since it is just bit math
          if (i->src1.value->IsConstant()) {
            v->set_from(i->src1.value);
            v->DenormalFlush();
@ -933,19 +914,10 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            result = true;
          }
          break;
        case OPCODE_TO_SINGLE:
          if (i->src1.value->IsConstant()) {
            v->set_from(i->src1.value);
            v->ToSingle();
            i->Remove();
            result = true;
          }
          break;
        default:
          // Ignored.
          break;
      }
      i = i->next;
    }
    block = block->next;
--- a/src/xenia/cpu/hir/hir_builder.cc
+++ b/src/xenia/cpu/hir/hir_builder.cc
@ -1287,7 +1287,11 @@ void HIRBuilder::SetRoundingMode(Value* value) {
  Instr* i = AppendInstr(OPCODE_SET_ROUNDING_MODE_info, 0);
  i->set_src1(value);
 }
-
+void HIRBuilder::SetNJM(Value* value) {
  ASSERT_INTEGER_TYPE(value);
  Instr* i = AppendInstr(OPCODE_SET_NJM_info, 0);
  i->set_src1(value);
 }
 Value* HIRBuilder::Max(Value* value1, Value* value2) {
  ASSERT_TYPES_EQUAL(value1, value2);
@ -1632,7 +1636,7 @@ Value* HIRBuilder::Div(Value* value1, Value* value2,
 Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
  ASSERT_TYPES_EQUAL(value1, value2);
  ASSERT_TYPES_EQUAL(value1, value3);
-
+  #if 0
  bool c1 = value1->IsConstant();
  bool c2 = value2->IsConstant();
  if (c1 && c2) {
@ -1640,7 +1644,7 @@ Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
    dest->Mul(value2);
    return Add(dest, value3);
  }
-
+  #endif
  Instr* i = AppendInstr(OPCODE_MUL_ADD_info, 0, AllocValue(value1->type));
  i->set_src1(value1);
  i->set_src2(value2);
@ -1651,7 +1655,7 @@ Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
 Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
  ASSERT_TYPES_EQUAL(value1, value2);
  ASSERT_TYPES_EQUAL(value1, value3);
-
+  #if 0
  bool c1 = value1->IsConstant();
  bool c2 = value2->IsConstant();
  if (c1 && c2) {
@ -1659,7 +1663,7 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
    dest->Mul(value2);
    return Sub(dest, value3);
  }
-
+  #endif
  Instr* i = AppendInstr(OPCODE_MUL_SUB_info, 0, AllocValue(value1->type));
  i->set_src1(value1);
  i->set_src2(value2);
--- a/src/xenia/cpu/hir/hir_builder.h
+++ b/src/xenia/cpu/hir/hir_builder.h
@ -264,7 +264,7 @@ class HIRBuilder {
                               Value* new_value);
  Value* AtomicAdd(Value* address, Value* value);
  Value* AtomicSub(Value* address, Value* value);
-
+  void SetNJM(Value* value);
 protected:
  void DumpValue(StringBuffer* str, Value* value);
  void DumpOp(StringBuffer* str, OpcodeSignatureType sig_type, Instr::Op* op);
--- a/src/xenia/cpu/hir/opcodes.h
+++ b/src/xenia/cpu/hir/opcodes.h
@ -284,6 +284,7 @@ enum Opcode {
  OPCODE_TO_SINGLE,  // i could not find a decent name to assign to this opcode,
                     // as we already have OPCODE_ROUND. round double to float (
                     // ppc "single" fpu instruction result rounding behavior )
 	  OPCODE_SET_NJM, 
  __OPCODE_MAX_VALUE,  // Keep at end.
 };
@ -295,6 +296,7 @@ enum OpcodeFlags {
  OPCODE_FLAG_IGNORE = (1 << 5),
  OPCODE_FLAG_HIDE = (1 << 6),
  OPCODE_FLAG_PAIRED_PREV = (1 << 7),
  OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING = (1 << 8)
 };
 enum OpcodeSignatureType {
--- a/src/xenia/cpu/hir/opcodes.inl
+++ b/src/xenia/cpu/hir/opcodes.inl
@ -151,25 +151,25 @@ DEFINE_OPCODE(
    OPCODE_CONVERT,
    "convert",
    OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 DEFINE_OPCODE(
    OPCODE_ROUND,
    "round",
    OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 DEFINE_OPCODE(
    OPCODE_VECTOR_CONVERT_I2F,
    "vector_convert_i2f",
    OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 DEFINE_OPCODE(
    OPCODE_VECTOR_CONVERT_F2I,
    "vector_convert_f2i",
    OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 DEFINE_OPCODE(
    OPCODE_LOAD_VECTOR_SHL,
@ -456,13 +456,13 @@ DEFINE_OPCODE(
    OPCODE_MUL_ADD,
    "mul_add",
    OPCODE_SIG_V_V_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 DEFINE_OPCODE(
    OPCODE_MUL_SUB,
    "mul_sub",
    OPCODE_SIG_V_V_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 DEFINE_OPCODE(
    OPCODE_NEG,
@ -480,43 +480,43 @@ DEFINE_OPCODE(
    OPCODE_SQRT,
    "sqrt",
    OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 DEFINE_OPCODE(
    OPCODE_RSQRT,
    "rsqrt",
    OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 DEFINE_OPCODE(
    OPCODE_RECIP,
    "recip",
    OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 DEFINE_OPCODE(
    OPCODE_POW2,
    "pow2",
    OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 DEFINE_OPCODE(
    OPCODE_LOG2,
    "log2",
    OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 DEFINE_OPCODE(
    OPCODE_DOT_PRODUCT_3,
    "dot_product_3",
    OPCODE_SIG_V_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 DEFINE_OPCODE(
    OPCODE_DOT_PRODUCT_4,
    "dot_product_4",
    OPCODE_SIG_V_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 DEFINE_OPCODE(
    OPCODE_AND,
@ -685,5 +685,11 @@ DEFINE_OPCODE(
 	OPCODE_TO_SINGLE,
 	"to_single",
 	OPCODE_SIG_V_V,
 	OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING
 )
 // set_njm: takes a single value operand and defines no result.
 // HIRBuilder::SetNJM appends it with one integer source and no destination.
 // No opcode flags are set for it.
 DEFINE_OPCODE(
 	OPCODE_SET_NJM,
 	"set_njm",
 	OPCODE_SIG_X_V,
 	0
 )
--- a/src/xenia/cpu/hir/value.cc
+++ b/src/xenia/cpu/hir/value.cc
@ -199,7 +199,7 @@ void Value::Truncate(TypeName target_type) {
      return;
  }
 }
-
+//WARNING: this does not handle rounding flags at all!
 void Value::Convert(TypeName target_type, RoundMode round_mode) {
  switch (type) {
    case FLOAT32_TYPE:
@ -401,7 +401,7 @@ void Value::MulHi(Value* other, bool is_unsigned) {
                      32);
      }
      break;
-    case INT64_TYPE:
+    case INT64_TYPE: {
 #if XE_COMPILER_MSVC
      if (is_unsigned) {
        constant.i64 = __umulh(constant.i64, other->constant.i64);
@ -409,17 +409,19 @@ void Value::MulHi(Value* other, bool is_unsigned) {
        constant.i64 = __mulh(constant.i64, other->constant.i64);
      }
 #else
      unsigned __int128 product;
      if (is_unsigned) {
-        constant.i64 = static_cast<uint64_t>(
+        product = static_cast<unsigned __int128>(constant.i64) *
-            static_cast<unsigned __int128>(constant.i64) *
+                  static_cast<unsigned __int128>(other->constant.i64);
            static_cast<unsigned __int128>(other->constant.i64));
      } else {
-        constant.i64 =
+        product = static_cast<unsigned __int128>(
-            static_cast<uint64_t>(static_cast<__int128>(constant.i64) *
+            static_cast<__int128>(constant.i64) *
            static_cast<__int128>(other->constant.i64));
      }
      constant.i64 = static_cast<int64_t>(product >> 64);
 #endif  // XE_COMPILER_MSVC
      break;
    }
    default:
      assert_unhandled_case(type);
      break;
@ -495,52 +497,6 @@ void Value::Max(Value* other) {
  }
 }
 void Value::MulAdd(Value* dest, Value* value1, Value* value2, Value* value3) {
  // Constant-folds dest = (value1 * value2) + value3 with host floating-point
  // arithmetic. The payload that is read and written is selected by
  // dest->type: float32, float64, or vec128 treated as four float32 lanes.
  // Any other type trips assert_unhandled_case.
  // Each multiply-add is kept as one expression so evaluation matches the
  // straightforward (a * b) + c form.
  const auto& lhs = value1->constant;
  const auto& rhs = value2->constant;
  const auto& addend = value3->constant;
  auto& out = dest->constant;
  switch (dest->type) {
    case FLOAT32_TYPE:
      out.f32 = (lhs.f32 * rhs.f32) + addend.f32;
      break;
    case FLOAT64_TYPE:
      out.f64 = (lhs.f64 * rhs.f64) + addend.f64;
      break;
    case VEC128_TYPE:
      for (int lane = 0; lane < 4; ++lane) {
        out.v128.f32[lane] =
            (lhs.v128.f32[lane] * rhs.v128.f32[lane]) + addend.v128.f32[lane];
      }
      break;
    default:
      assert_unhandled_case(dest->type);
      break;
  }
 }
 void Value::MulSub(Value* dest, Value* value1, Value* value2, Value* value3) {
  // Constant-folds dest = (value1 * value2) - value3 with host floating-point
  // arithmetic. The payload that is read and written is selected by
  // dest->type: float32, float64, or vec128 treated as four float32 lanes.
  // Any other type trips assert_unhandled_case.
  // Each multiply-subtract is kept as one expression so evaluation matches
  // the straightforward (a * b) - c form.
  const auto& lhs = value1->constant;
  const auto& rhs = value2->constant;
  const auto& subtrahend = value3->constant;
  auto& out = dest->constant;
  switch (dest->type) {
    case FLOAT32_TYPE:
      out.f32 = (lhs.f32 * rhs.f32) - subtrahend.f32;
      break;
    case FLOAT64_TYPE:
      out.f64 = (lhs.f64 * rhs.f64) - subtrahend.f64;
      break;
    case VEC128_TYPE:
      for (int lane = 0; lane < 4; ++lane) {
        out.v128.f32[lane] = (lhs.v128.f32[lane] * rhs.v128.f32[lane]) -
                             subtrahend.v128.f32[lane];
      }
      break;
    default:
      assert_unhandled_case(dest->type);
      break;
  }
 }
 void Value::Neg() {
  switch (type) {
    case INT8_TYPE:
@ -1643,11 +1599,7 @@ void Value::DenormalFlush() {
    constant.v128.u32[i] = current_element;
  }
 }
 void Value::ToSingle() {
  // Narrows the float64 constant to float and widens it back, so the stored
  // double ends up holding a value representable as a single. Only valid for
  // FLOAT64_TYPE values (asserted).
  assert_true(type == FLOAT64_TYPE);
  const float narrowed = static_cast<float>(constant.f64);
  constant.f64 = static_cast<double>(narrowed);
 }
 void Value::CountLeadingZeros(const Value* other) {
  switch (other->type) {
    case INT8_TYPE:
--- a/src/xenia/cpu/hir/value.h
+++ b/src/xenia/cpu/hir/value.h
@ -563,8 +563,7 @@ class Value {
  void MulHi(Value* other, bool is_unsigned);
  void Div(Value* other, bool is_unsigned);
  void Max(Value* other);
-  static void MulAdd(Value* dest, Value* value1, Value* value2, Value* value3);
+
  static void MulSub(Value* dest, Value* value1, Value* value2, Value* value3);
  void Neg();
  void Abs();
  void Sqrt();
@ -603,7 +602,6 @@ class Value {
                     bool saturate);
  void ByteSwap();
  void DenormalFlush();
  void ToSingle();
  void CountLeadingZeros(const Value* other);
  bool Compare(Opcode opcode, Value* other);
  hir::Instr* GetDefSkipAssigns();
@ -615,7 +613,10 @@ class Value {
  // returns true if every single use is as an operand to a single instruction
  // (add var2, var1, var1)
  bool AllUsesByOneInsn() const;
-
+  // "Maybe" because this also matches vec128, which is untyped data that can be treated as float or int depending on the context.
  bool MaybeFloaty() const {
    // True for the three types that can carry floating-point data.
    switch (type) {
      case FLOAT32_TYPE:
      case FLOAT64_TYPE:
      case VEC128_TYPE:
        return true;
      default:
        return false;
    }
  }
 private:
  static bool CompareInt8(Opcode opcode, Value* a, Value* b);
  static bool CompareInt16(Opcode opcode, Value* a, Value* b);
--- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc
@ -364,7 +364,16 @@ int InstrEmit_mfvscr(PPCHIRBuilder& f, const InstrData& i) {
 int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) {
  // Loads the source vector register, forwards the state of bit 0x10000 of
  // its 32-bit element 3 to set_njm, then stores the whole vector into the
  // context's vscr_vec slot. Always returns 0.
  // TODO: confirm that this is the right instruction format to decode with.
  // TODO: what mtvscr does with the unused bits is implementation defined;
  // figure out what it actually does.
  Value* const vscr_source = f.LoadVR(i.VX128_1.RB);
  Value* const vscr_word =
      f.Extract(vscr_source, static_cast<uint8_t>(3), INT32_TYPE);
  Value* const njm_mask = f.LoadConstantInt32(0x10000);
  Value* const njm_enabled = f.IsTrue(f.And(vscr_word, njm_mask));
  f.SetNJM(njm_enabled);
  f.StoreContext(offsetof(PPCContext, vscr_vec), vscr_source);
  return 0;
 }
--- a/src/xenia/cpu/ppc/ppc_emit_fpu.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_fpu.cc
@ -382,7 +382,6 @@ int InstrEmit_mtfsfx(PPCHIRBuilder& f, const InstrData& i) {
    return 1;
  } else {
    assert_zero(i.XFL.W);
    // Store under control of mask.
    // Expand the mask from 8 bits -> 32 bits.
    uint32_t mask = 0;
@ -402,7 +401,7 @@ int InstrEmit_mtfsfx(PPCHIRBuilder& f, const InstrData& i) {
    // Update the system rounding mode.
    if (mask & 0x7) {
-      f.SetRoundingMode(v);
+      f.SetRoundingMode(f.And(v, f.LoadConstantInt32(7)));
    }
  }
  if (i.XFL.Rc) {
@ -425,7 +424,7 @@ int InstrEmit_mtfsfix(PPCHIRBuilder& f, const InstrData& i) {
  // Update the system rounding mode.
  if (mask & 0x7) {
-    f.SetRoundingMode(fpscr);
+    f.SetRoundingMode(f.And(fpscr, f.LoadConstantInt32(7)));
  }
  if (i.X.Rc) {
--- a/src/xenia/emulator.cc
+++ b/src/xenia/emulator.cc
@ -64,9 +64,13 @@ DEFINE_string(
    "or the module specified by the game. Leave blank to launch the default "
    "module.",
    "General");
 // Developer-only switch, off by default. CreateVfsDeviceBasedOnPath passes the
 // negation of this cvar as the last HostPathDevice constructor argument when
 // launching a .xex/.elf/.exe - presumably that argument is the read-only
 // flag; confirm against HostPathDevice.
 DEFINE_bool(allow_game_relative_writes, false,
            "Not useful to non-developers. Allows code to write to paths "
            "relative to game://. Used for "
            "generating test data to compare with original hardware. ",
            "General");
 namespace xe {
 using namespace xe::literals;
 Emulator::GameConfigLoadCallback::GameConfigLoadCallback(Emulator& emulator)
@ -282,7 +286,8 @@ const std::unique_ptr<vfs::Device> Emulator::CreateVfsDeviceBasedOnPath(
  auto extension = xe::utf8::lower_ascii(xe::path_to_utf8(path.extension()));
  if (extension == ".xex" || extension == ".elf" || extension == ".exe") {
    auto parent_path = path.parent_path();
-    return std::make_unique<vfs::HostPathDevice>(mount_path, parent_path, true);
+    return std::make_unique<vfs::HostPathDevice>(
        mount_path, parent_path, !cvars::allow_game_relative_writes);
  } else {
    return std::make_unique<vfs::DiscImageDevice>(mount_path, path);
  }
@ -653,8 +658,8 @@ bool Emulator::ExceptionCallback(Exception* ex) {
    // debugger.
    return false;
  } else if (processor()->is_debugger_attached()) {
-    // Let the debugger handle this exception. It may decide to continue past it
+    // Let the debugger handle this exception. It may decide to continue past
-    // (if it was a stepping breakpoint, etc).
+    // it (if it was a stepping breakpoint, etc).
    return processor()->OnUnhandledException(ex);
  }
@ -823,8 +828,8 @@ static std::string format_version(xex2_version version) {
 X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
                                  const std::string_view module_path) {
-  // Making changes to the UI (setting the icon) and executing game config load
+  // Making changes to the UI (setting the icon) and executing game config
-  // callbacks which expect to be called from the UI thread.
+  // load callbacks which expect to be called from the UI thread.
  assert_true(display_window_->app_context().IsInUIThread());
  // Setup NullDevices for raw HDD partition accesses
@ -832,12 +837,12 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
  // By using a NullDevice that just returns success to all IO requests it
  // should allow games to believe cache/raw disk was accessed successfully
-  // NOTE: this should probably be moved to xenia_main.cc, but right now we need
+  // NOTE: this should probably be moved to xenia_main.cc, but right now we
-  // to register the \Device\Harddisk0\ NullDevice _after_ the
+  // need to register the \Device\Harddisk0\ NullDevice _after_ the
  // \Device\Harddisk0\Partition1 HostPathDevice, otherwise requests to
-  // Partition1 will go to this. Registering during CompleteLaunch allows us to
+  // Partition1 will go to this. Registering during CompleteLaunch allows us
-  // make sure any HostPathDevices are ready beforehand.
+  // to make sure any HostPathDevices are ready beforehand. (see comment above
-  // (see comment above cache:\ device registration for more info about why)
+  // cache:\ device registration for more info about why)
  auto null_paths = {std::string("\\Partition0"), std::string("\\Cache0"),
                     std::string("\\Cache1")};
  auto null_device =
@ -900,8 +905,8 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
  if (module->title_id()) {
    auto title_id = fmt::format("{:08X}", module->title_id());
-    // Load the per-game configuration file and make sure updates are handled by
+    // Load the per-game configuration file and make sure updates are handled
-    // the callbacks.
+    // by the callbacks.
    config::LoadGameConfig(title_id);
    assert_true(game_config_load_callback_loop_next_index_ == SIZE_MAX);
    game_config_load_callback_loop_next_index_ = 0;
@ -934,10 +939,10 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
    }
  }
-  // Initializing the shader storage in a blocking way so the user doesn't miss
+  // Initializing the shader storage in a blocking way so the user doesn't
-  // the initial seconds - for instance, sound from an intro video may start
+  // miss the initial seconds - for instance, sound from an intro video may
-  // playing before the video can be seen if doing this in parallel with the
+  // start playing before the video can be seen if doing this in parallel with
-  // main thread.
+  // the main thread.
  on_shader_storage_initialization(true);
  graphics_system_->InitializeShaderStorage(cache_root_, title_id_.value(),
                                            true);