Merge pull request #57 from chrisps/canary_experimental
Add separate VMX/fpu mxcsr
Commit 332f69f36b
@ -692,6 +692,12 @@ void X64Backend::InitializeBackendContext(void* ctx) {
  X64BackendContext* bctx = reinterpret_cast<X64BackendContext*>(
      reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));
  bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
  bctx->mxcsr_fpu =
      DEFAULT_FPU_MXCSR;  // TODO: verify on real hardware (RGH) what the PPC
                          // rounding mode actually is at startup
  bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR;
  bctx->flags = 0;
  // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
  bctx->Ox1000 = 0x1000;
}
}  // namespace x64
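// Editor's sketch (not part of the commit): how the hunk above finds the
// X64BackendContext. It assumes the backend context is laid out immediately in
// front of the guest PPCContext in the same allocation, which is what the
// "- sizeof(X64BackendContext)" arithmetic in InitializeBackendContext relies
// on. AllocateGuestContext below is a hypothetical helper, not a Xenia API.
#include <cstdint>
#include <cstdlib>

struct X64BackendContextSketch {  // stand-in for the real X64BackendContext
  void* ResolveFunction_Ptr;
  unsigned int mxcsr_fpu;
  unsigned int mxcsr_vmx;
  unsigned int flags;
  unsigned int Ox1000;
};

void* AllocateGuestContext(std::size_t guest_ctx_size) {
  // One block: [ X64BackendContextSketch | guest PPCContext... ]
  char* block = static_cast<char*>(
      std::malloc(sizeof(X64BackendContextSketch) + guest_ctx_size));
  return block + sizeof(X64BackendContextSketch);  // callers receive the guest ctx
}

X64BackendContextSketch* BackendContextOf(void* guest_ctx) {
  // Mirrors InitializeBackendContext(): step back over the backend block.
  return reinterpret_cast<X64BackendContextSketch*>(
      reinterpret_cast<std::intptr_t>(guest_ctx) - sizeof(X64BackendContextSketch));
}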
@ -37,9 +37,17 @@ typedef void (*ResolveFunctionThunk)();
// negatively index the membase reg)
struct X64BackendContext {
  void* ResolveFunction_Ptr;  // cached pointer to ResolveFunction
  unsigned int mxcsr_fpu;  // currently, the way we implement rounding mode
                           // affects both VMX and the FPU
  unsigned int mxcsr_vmx;
  unsigned int flags;  // bit 0 == 0 if the loaded mxcsr is the FPU one,
                       // else it is the VMX one
  unsigned int Ox1000;  // constant 0x1000 so each tail-emitted add of it
                        // can be encoded two bytes shorter
};
constexpr unsigned int DEFAULT_VMX_MXCSR =
    0x8000 |                   // flush to zero
    0x0040 | (_MM_MASK_MASK);  // denormals are zero, all exceptions masked,
                               // round to nearest

constexpr unsigned int DEFAULT_FPU_MXCSR = 0x1F80;

class X64Backend : public Backend {
 public:
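// Editor's note (not from the commit): what the two defaults above decode to
// in terms of the standard x86 MXCSR bit layout.
#include <cstdio>
#include <xmmintrin.h>  // _MM_MASK_MASK == 0x1F80 (all exception mask bits)

int main() {
  constexpr unsigned int kFlushToZero = 0x8000;     // bit 15: FZ
  constexpr unsigned int kDenormsAreZero = 0x0040;  // bit 6: DAZ
  unsigned int vmx = kFlushToZero | kDenormsAreZero | _MM_MASK_MASK;  // 0x9FC0
  unsigned int fpu = _MM_MASK_MASK;  // 0x1F80: the architectural reset value,
                                     // round-to-nearest, denormals preserved
  std::printf("DEFAULT_VMX_MXCSR=0x%X DEFAULT_FPU_MXCSR=0x%X\n", vmx, fpu);
}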
@ -320,6 +320,8 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
|
|||
// Body.
|
||||
auto block = builder->first_block();
|
||||
while (block) {
|
||||
ForgetMxcsrMode(); // at start of block, mxcsr mode is undefined
|
||||
|
||||
// Mark block labels.
|
||||
auto label = block->label_head;
|
||||
while (label) {
|
||||
|
@ -490,6 +492,7 @@ uint64_t ResolveFunction(void* raw_context, uint64_t target_address) {
|
|||
|
||||
void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
|
||||
assert_not_null(function);
|
||||
ForgetMxcsrMode();
|
||||
auto fn = static_cast<X64Function*>(function);
|
||||
// Resolve address to the function to call and store in rax.
|
||||
|
||||
|
@ -564,6 +567,7 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
|
|||
|
||||
void X64Emitter::CallIndirect(const hir::Instr* instr,
|
||||
const Xbyak::Reg64& reg) {
|
||||
ForgetMxcsrMode();
|
||||
// Check if return.
|
||||
if (instr->flags & hir::CALL_POSSIBLE_RETURN) {
|
||||
cmp(reg.cvt32(), dword[rsp + StackLayout::GUEST_RET_ADDR]);
|
||||
|
@ -617,6 +621,7 @@ uint64_t UndefinedCallExtern(void* raw_context, uint64_t function_ptr) {
|
|||
return 0;
|
||||
}
|
||||
void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) {
|
||||
ForgetMxcsrMode();
|
||||
bool undefined = true;
|
||||
if (function->behavior() == Function::Behavior::kBuiltin) {
|
||||
auto builtin_function = static_cast<const BuiltinFunction*>(function);
|
||||
|
@ -696,11 +701,13 @@ Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) {
|
|||
}
|
||||
|
||||
// Important: If you change these, you must update the thunks in x64_backend.cc!
|
||||
Xbyak::Reg64 X64Emitter::GetContextReg() { return rsi; }
|
||||
Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdi; }
|
||||
Xbyak::Reg64 X64Emitter::GetContextReg() const { return rsi; }
|
||||
Xbyak::Reg64 X64Emitter::GetMembaseReg() const { return rdi; }
|
||||
|
||||
void X64Emitter::ReloadMembase() {
|
||||
mov(GetMembaseReg(), qword[GetContextReg() + 8]); // membase
|
||||
mov(GetMembaseReg(),
|
||||
qword[GetContextReg() +
|
||||
offsetof(ppc::PPCContext, virtual_membase)]); // membase
|
||||
}
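// Editor's note: the hunk above swaps a hard-coded "+ 8" for
// offsetof(ppc::PPCContext, virtual_membase). An illustrative guard under the
// same assumption the old magic number encoded (PPCContextLayout is a
// hypothetical stand-in for ppc::PPCContext, not the real definition):
#include <cstddef>
#include <cstdint>

struct PPCContextLayout {
  void* thread_state;             // assumed 8-byte first member
  std::uint8_t* virtual_membase;  // the field ReloadMembase() loads
};
static_assert(offsetof(PPCContextLayout, virtual_membase) == 8,
              "emitter code assumed membase lives at context + 8");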
|
||||
|
||||
// Len Assembly Byte Sequence
|
||||
|
@ -917,7 +924,7 @@ static const vec128_t xmm_consts[] = {
|
|||
/* XMMQNaN */ vec128i(0x7FC00000u),
|
||||
/* XMMInt127 */ vec128i(0x7Fu),
|
||||
/* XMM2To32 */ vec128f(0x1.0p32f),
|
||||
/* xmminf */ vec128i(0x7f800000),
|
||||
/* XMMFloatInf */ vec128i(0x7f800000),
|
||||
|
||||
/* XMMIntsToBytes*/
|
||||
v128_setr_bytes(0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
|
@ -938,9 +945,7 @@ static const vec128_t xmm_consts[] = {
|
|||
/*XMMVSRShlByteshuf*/
|
||||
v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
|
||||
// XMMVSRMask
|
||||
vec128b(1)
|
||||
|
||||
};
|
||||
vec128b(1)};
|
||||
|
||||
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
|
||||
for (auto& vec : xmm_consts) {
|
||||
|
@ -1347,7 +1352,7 @@ SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) {
|
|||
|
||||
return SimdDomain::DONTCARE;
|
||||
}
|
||||
Xbyak::Address X64Emitter::GetBackendCtxPtr(int offset_in_x64backendctx) {
|
||||
Xbyak::Address X64Emitter::GetBackendCtxPtr(int offset_in_x64backendctx) const {
|
||||
/*
|
||||
index context ptr negatively to get to backend ctx field
|
||||
*/
|
||||
|
@ -1368,6 +1373,93 @@ Xbyak::Label& X64Emitter::NewCachedLabel() {
|
|||
label_cache_.push_back(tmp);
|
||||
return *tmp;
|
||||
}
|
||||
|
||||
template<bool switching_to_fpu>
|
||||
static void ChangeMxcsrModeDynamicHelper(X64Emitter& e) {
|
||||
auto flags = e.GetBackendFlagsPtr();
|
||||
if (switching_to_fpu) {
|
||||
e.btr(flags, 0); // bit 0 set to 0 = is fpu mode
|
||||
} else {
|
||||
e.bts(flags, 0); // bit 0 set to 1 = is vmx mode
|
||||
}
|
||||
Xbyak::Label& come_back = e.NewCachedLabel();
|
||||
|
||||
Xbyak::Label& reload_bailout =
|
||||
e.AddToTail([&come_back](X64Emitter& e, Xbyak::Label& thislabel) {
|
||||
e.L(thislabel);
|
||||
if (switching_to_fpu) {
|
||||
e.LoadFpuMxcsrDirect();
|
||||
} else {
|
||||
e.LoadVmxMxcsrDirect();
|
||||
}
|
||||
e.jmp(come_back, X64Emitter::T_NEAR);
|
||||
});
|
||||
if (switching_to_fpu) {
|
||||
e.jc(reload_bailout,
|
||||
X64Emitter::T_NEAR); // if carry flag was set, we were VMX mxcsr mode.
|
||||
} else {
|
||||
e.jnc(reload_bailout,
      X64Emitter::T_NEAR);  // if carry flag was clear, we were in FPU mxcsr mode.
|
||||
}
|
||||
e.L(come_back);
|
||||
}
|
||||
|
||||
bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
|
||||
if (new_mode == mxcsr_mode_) {
|
||||
return false;
|
||||
}
|
||||
assert_true(new_mode != MXCSRMode::Unknown);
|
||||
|
||||
if (mxcsr_mode_ == MXCSRMode::Unknown) {
|
||||
// check the mode dynamically
|
||||
mxcsr_mode_ = new_mode;
|
||||
if (!already_set) {
|
||||
if (new_mode == MXCSRMode::Fpu) {
|
||||
ChangeMxcsrModeDynamicHelper<true>(*this);
|
||||
} else if (new_mode == MXCSRMode::Vmx) {
|
||||
ChangeMxcsrModeDynamicHelper<false>(*this);
|
||||
} else {
|
||||
assert_unhandled_case(new_mode);
|
||||
}
|
||||
} else {  // even if already set, we still need to update the flags to reflect our mode
|
||||
if (new_mode == MXCSRMode::Fpu) {
|
||||
btr(GetBackendFlagsPtr(), 0);
|
||||
} else if (new_mode == MXCSRMode::Vmx) {
|
||||
bts(GetBackendFlagsPtr(), 0);
|
||||
} else {
|
||||
assert_unhandled_case(new_mode);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
mxcsr_mode_ = new_mode;
|
||||
if (!already_set) {
|
||||
if (new_mode == MXCSRMode::Fpu) {
|
||||
|
||||
LoadFpuMxcsrDirect();
|
||||
btr(GetBackendFlagsPtr(), 0);
|
||||
return true;
|
||||
} else if (new_mode == MXCSRMode::Vmx) {
|
||||
LoadVmxMxcsrDirect();
|
||||
bts(GetBackendFlagsPtr(), 0);
|
||||
return true;
|
||||
} else {
|
||||
assert_unhandled_case(new_mode);
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
void X64Emitter::LoadFpuMxcsrDirect() {
|
||||
vldmxcsr(GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)));
|
||||
}
|
||||
void X64Emitter::LoadVmxMxcsrDirect() {
|
||||
vldmxcsr(GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_vmx)));
|
||||
}
|
||||
Xbyak::Address X64Emitter::GetBackendFlagsPtr() const {
|
||||
Xbyak::Address pt = GetBackendCtxPtr(offsetof(X64BackendContext, flags));
|
||||
pt.setBit(32);
|
||||
return pt;
|
||||
}
|
||||
} // namespace x64
|
||||
} // namespace backend
|
||||
} // namespace cpu
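// Editor's sketch (plain C++, not emitted code) of what the btr/bts +
// vldmxcsr machinery above does at run time. The real version emits the
// reload as an out-of-line tail block reached via jc/jnc.
#include <cstdint>

struct BackendCtxSketch {
  std::uint32_t mxcsr_fpu;
  std::uint32_t mxcsr_vmx;
  std::uint32_t flags;  // bit 0: 0 = FPU mxcsr currently loaded, 1 = VMX
};

inline void LoadMxcsr(std::uint32_t /*value*/) { /* stands in for vldmxcsr */ }

void SwitchToFpu(BackendCtxSketch* b) {
  bool was_vmx = (b->flags & 1u) != 0;  // btr leaves the old bit in CF
  b->flags &= ~1u;                      // btr [flags], 0
  if (was_vmx) {                        // jc reload_bailout
    LoadMxcsr(b->mxcsr_fpu);            // tail block: vldmxcsr [mxcsr_fpu]
  }
}

void SwitchToVmx(BackendCtxSketch* b) {
  bool was_fpu = (b->flags & 1u) == 0;
  b->flags |= 1u;                       // bts [flags], 0
  if (was_fpu) {                        // jnc reload_bailout
    LoadMxcsr(b->mxcsr_vmx);            // tail block: vldmxcsr [mxcsr_vmx]
  }
}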
|
||||
|
|
|
@ -65,6 +65,12 @@ enum class SimdDomain : uint32_t {
|
|||
// CONFLICTING means its used in multiple domains)
|
||||
};
|
||||
|
||||
enum class MXCSRMode : uint32_t {
|
||||
Unknown,
|
||||
Fpu,
|
||||
Vmx
|
||||
};
|
||||
|
||||
static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
|
||||
if (dom1 == dom2) {
|
||||
return dom1;
|
||||
|
@ -283,8 +289,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
|
||||
Xbyak::Reg64 GetNativeParam(uint32_t param);
|
||||
|
||||
Xbyak::Reg64 GetContextReg();
|
||||
Xbyak::Reg64 GetMembaseReg();
|
||||
Xbyak::Reg64 GetContextReg() const;
|
||||
Xbyak::Reg64 GetMembaseReg() const;
|
||||
bool CanUseMembaseLow32As0() const { return may_use_membase32_as_zero_reg_; }
|
||||
void ReloadMembase();
|
||||
|
||||
|
@ -295,7 +301,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
void MovMem64(const Xbyak::RegExp& addr, uint64_t v);
|
||||
|
||||
Xbyak::Address GetXmmConstPtr(XmmConst id);
|
||||
Xbyak::Address GetBackendCtxPtr(int offset_in_x64backendctx);
|
||||
Xbyak::Address GetBackendCtxPtr(int offset_in_x64backendctx) const;
|
||||
|
||||
void LoadConstantXmm(Xbyak::Xmm dest, float v);
|
||||
void LoadConstantXmm(Xbyak::Xmm dest, double v);
|
||||
|
@ -304,6 +310,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
Xbyak::Address StashConstantXmm(int index, float v);
|
||||
Xbyak::Address StashConstantXmm(int index, double v);
|
||||
Xbyak::Address StashConstantXmm(int index, const vec128_t& v);
|
||||
Xbyak::Address GetBackendFlagsPtr() const;
|
||||
void* FindByteConstantOffset(unsigned bytevalue);
|
||||
void* FindWordConstantOffset(unsigned wordvalue);
|
||||
void* FindDwordConstantOffset(unsigned bytevalue);
|
||||
|
@ -319,6 +326,16 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
size_t stack_size() const { return stack_size_; }
|
||||
SimdDomain DeduceSimdDomain(const hir::Value* for_value);
|
||||
|
||||
void ForgetMxcsrMode() {
|
||||
mxcsr_mode_ = MXCSRMode::Unknown;
|
||||
}
|
||||
/*
|
||||
returns true if had to load mxcsr. DOT_PRODUCT can use this to skip clearing the overflow flag, as it will never be set in the vmx fpscr
|
||||
*/
|
||||
bool ChangeMxcsrMode(MXCSRMode new_mode, bool already_set = false);  // already_set means the caller already executed vldmxcsr; used for SET_ROUNDING_MODE
|
||||
|
||||
void LoadFpuMxcsrDirect();  // unsafe: does not update mxcsr_mode_
void LoadVmxMxcsrDirect();  // unsafe: does not update mxcsr_mode_
|
||||
protected:
|
||||
void* Emplace(const EmitFunctionInfo& func_info,
|
||||
GuestFunction* function = nullptr);
|
||||
|
@ -359,6 +376,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
std::vector<Xbyak::Label*>
|
||||
label_cache_; // for creating labels that need to be referenced much
|
||||
// later by tail emitters
|
||||
MXCSRMode mxcsr_mode_ = MXCSRMode::Unknown;
|
||||
};
|
||||
|
||||
} // namespace x64
|
||||
|
|
|
@ -616,7 +616,31 @@ struct Sequence {
|
|||
}
|
||||
}
|
||||
};
|
||||
template <typename T>
|
||||
static Xmm GetInputRegOrConstant(X64Emitter& e, const T& input,
|
||||
Xmm xmm_to_use_if_const) {
|
||||
if (input.is_constant) {
|
||||
using constant_type = std::remove_reference_t<decltype(input.constant())>;
|
||||
|
||||
if constexpr (std::is_integral_v<constant_type>) {
|
||||
vec128_t input_constant = vec128b(0);
|
||||
if constexpr (sizeof(constant_type) == 4) {
|
||||
input_constant.i32[0] = input.constant();
|
||||
|
||||
} else if constexpr (sizeof(constant_type) == 8) {
|
||||
input_constant.low = input.constant();
|
||||
} else {
|
||||
assert_unhandled_case(sizeof(constant_type));
|
||||
}
|
||||
e.LoadConstantXmm(xmm_to_use_if_const, input_constant);
|
||||
} else {
|
||||
e.LoadConstantXmm(xmm_to_use_if_const, input.constant());
|
||||
}
|
||||
return xmm_to_use_if_const;
|
||||
} else {
|
||||
return input;
|
||||
}
|
||||
}
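// Editor's note: typical use of GetInputRegOrConstant from the sequences later
// in this commit. Constants are materialized into the scratch register passed
// in; register operands are returned as-is, so callers must not assume the
// result aliases the scratch:
//   Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3);
//   e.vcvtdq2ps(i.dest, src1);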
|
||||
} // namespace x64
|
||||
} // namespace backend
|
||||
} // namespace cpu
|
||||
|
|
|
@ -257,6 +257,7 @@ struct CALL_TRUE_I8
|
|||
e.jz(skip);
|
||||
e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
|
||||
e.L(skip);
|
||||
e.ForgetMxcsrMode();
|
||||
}
|
||||
};
|
||||
struct CALL_TRUE_I16
|
||||
|
@ -268,6 +269,7 @@ struct CALL_TRUE_I16
|
|||
e.jz(skip);
|
||||
e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
|
||||
e.L(skip);
|
||||
e.ForgetMxcsrMode();
|
||||
}
|
||||
};
|
||||
struct CALL_TRUE_I32
|
||||
|
@ -279,6 +281,7 @@ struct CALL_TRUE_I32
|
|||
e.jz(skip);
|
||||
e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
|
||||
e.L(skip);
|
||||
e.ForgetMxcsrMode();
|
||||
}
|
||||
};
|
||||
struct CALL_TRUE_I64
|
||||
|
@ -290,6 +293,7 @@ struct CALL_TRUE_I64
|
|||
e.jz(skip);
|
||||
e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
|
||||
e.L(skip);
|
||||
e.ForgetMxcsrMode();
|
||||
}
|
||||
};
|
||||
struct CALL_TRUE_F32
|
||||
|
@ -301,6 +305,7 @@ struct CALL_TRUE_F32
|
|||
e.jz(skip);
|
||||
e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
|
||||
e.L(skip);
|
||||
e.ForgetMxcsrMode();
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -313,6 +318,7 @@ struct CALL_TRUE_F64
|
|||
e.jz(skip);
|
||||
e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
|
||||
e.L(skip);
|
||||
e.ForgetMxcsrMode();
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16,
|
||||
|
@ -326,6 +332,7 @@ struct CALL_INDIRECT
|
|||
: Sequence<CALL_INDIRECT, I<OPCODE_CALL_INDIRECT, VoidOp, I64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.CallIndirect(i.instr, i.src1);
|
||||
e.ForgetMxcsrMode();
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT, CALL_INDIRECT);
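// Editor's note: each call-like sequence above finishes with
// e.ForgetMxcsrMode() because the callee may itself switch between the FPU and
// VMX mxcsr and return in either state. Dropping back to MXCSRMode::Unknown
// forces the next ChangeMxcsrMode() to emit the dynamic flags-bit check rather
// than trusting a stale statically tracked mode, e.g.:
//   e.Call(i.instr, fn);
//   e.ForgetMxcsrMode();                // mode is Unknown again
//   e.ChangeMxcsrMode(MXCSRMode::Vmx);  // emits bts + conditional vldmxcsr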
|
||||
|
|
|
@ -16,7 +16,13 @@
|
|||
|
||||
// For OPCODE_PACK/OPCODE_UNPACK
|
||||
#include "third_party/half/include/half.hpp"
|
||||
#include "xenia/base/cvar.h"
|
||||
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
|
||||
|
||||
DEFINE_bool(use_extended_range_half, true,
|
||||
"Emulate extended range half-precision, may be slower on games "
|
||||
"that use it heavily",
|
||||
"CPU");
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
namespace backend {
|
||||
|
@ -31,6 +37,8 @@ struct VECTOR_CONVERT_I2F
|
|||
: Sequence<VECTOR_CONVERT_I2F,
|
||||
I<OPCODE_VECTOR_CONVERT_I2F, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3);
|
||||
// flags = ARITHMETIC_UNSIGNED
|
||||
if (i.instr->flags & ARITHMETIC_UNSIGNED) {
|
||||
// Round manually to (1.stored mantissa bits * 2^31) or to 2^32 to the
|
||||
|
@ -46,8 +54,8 @@ struct VECTOR_CONVERT_I2F
|
|||
// be 4294967296.0f.
|
||||
// xmm0 = src + 0b01111111 + ((src >> 8) & 1)
|
||||
// (xmm1 also used to launch reg + mem early and to require it late)
|
||||
e.vpaddd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMInt127));
|
||||
e.vpslld(e.xmm0, i.src1, 31 - 8);
|
||||
e.vpaddd(e.xmm1, src1, e.GetXmmConstPtr(XMMInt127));
|
||||
e.vpslld(e.xmm0, src1, 31 - 8);
|
||||
e.vpsrld(e.xmm0, e.xmm0, 31);
|
||||
e.vpaddd(e.xmm0, e.xmm0, e.xmm1);
|
||||
// xmm0 = (0xFF800000 | 23 explicit mantissa bits), or 0 if overflowed
|
||||
|
@ -63,13 +71,13 @@ struct VECTOR_CONVERT_I2F
|
|||
|
||||
// Convert from signed integer to float.
|
||||
// xmm1 = [0x00000000, 0x7FFFFFFF] case result
|
||||
e.vcvtdq2ps(e.xmm1, i.src1);
|
||||
e.vcvtdq2ps(e.xmm1, src1);
|
||||
|
||||
// Merge the two ways depending on whether the number is >= 0x80000000
|
||||
// (has high bit set).
|
||||
e.vblendvps(i.dest, e.xmm1, e.xmm0, i.src1);
|
||||
e.vblendvps(i.dest, e.xmm1, e.xmm0, src1);
|
||||
} else {
|
||||
e.vcvtdq2ps(i.dest, i.src1);
|
||||
e.vcvtdq2ps(i.dest, src1);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -82,9 +90,11 @@ struct VECTOR_CONVERT_F2I
|
|||
: Sequence<VECTOR_CONVERT_F2I,
|
||||
I<OPCODE_VECTOR_CONVERT_F2I, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3);
|
||||
if (i.instr->flags & ARITHMETIC_UNSIGNED) {
|
||||
// clamp to min 0
|
||||
e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero));
|
||||
e.vmaxps(e.xmm0, src1, e.GetXmmConstPtr(XMMZero));
|
||||
|
||||
// xmm1 = mask of values >= (unsigned)INT_MIN
|
||||
e.vcmpgeps(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS));
|
||||
|
@ -108,14 +118,14 @@ struct VECTOR_CONVERT_F2I
|
|||
e.vpor(i.dest, i.dest, e.xmm0);
|
||||
} else {
|
||||
// xmm2 = NaN mask
|
||||
e.vcmpunordps(e.xmm2, i.src1, i.src1);
|
||||
e.vcmpunordps(e.xmm2, src1, src1);
|
||||
|
||||
// convert packed floats to packed dwords
|
||||
e.vcvttps2dq(e.xmm0, i.src1);
|
||||
e.vcvttps2dq(e.xmm0, src1);
|
||||
|
||||
// (high bit) xmm1 = dest is indeterminate and i.src1 >= 0
|
||||
e.vpcmpeqd(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMIntMin));
|
||||
e.vpandn(e.xmm1, i.src1, e.xmm1);
|
||||
e.vpandn(e.xmm1, src1, e.xmm1);
|
||||
|
||||
// saturate positive values
|
||||
e.vblendvps(i.dest, e.xmm0, e.GetXmmConstPtr(XMMIntMax), e.xmm1);
|
||||
|
@ -131,6 +141,7 @@ struct VECTOR_DENORMFLUSH
|
|||
: Sequence<VECTOR_DENORMFLUSH,
|
||||
I<OPCODE_VECTOR_DENORMFLUSH, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
e.vxorps(e.xmm1, e.xmm1, e.xmm1); // 0.25 P0123
|
||||
|
||||
e.vandps(e.xmm0, i.src1,
|
||||
|
@ -352,6 +363,7 @@ struct VECTOR_COMPARE_EQ_V128
|
|||
e.vpcmpeqd(dest, src1, src2);
|
||||
break;
|
||||
case FLOAT32_TYPE:
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
e.vcmpeqps(dest, src1, src2);
|
||||
break;
|
||||
}
|
||||
|
@ -380,6 +392,7 @@ struct VECTOR_COMPARE_SGT_V128
|
|||
e.vpcmpgtd(dest, src1, src2);
|
||||
break;
|
||||
case FLOAT32_TYPE:
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
e.vcmpgtps(dest, src1, src2);
|
||||
break;
|
||||
}
|
||||
|
@ -414,6 +427,7 @@ struct VECTOR_COMPARE_SGE_V128
|
|||
e.vpor(dest, e.xmm0);
|
||||
break;
|
||||
case FLOAT32_TYPE:
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
e.vcmpgeps(dest, src1, src2);
|
||||
break;
|
||||
}
|
||||
|
@ -441,6 +455,7 @@ struct VECTOR_COMPARE_UGT_V128
|
|||
sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
|
||||
break;
|
||||
case FLOAT32_TYPE:
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
|
||||
break;
|
||||
default:
|
||||
|
@ -498,6 +513,7 @@ struct VECTOR_COMPARE_UGE_V128
|
|||
sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
|
||||
break;
|
||||
case FLOAT32_TYPE:
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
|
||||
break;
|
||||
}
|
||||
|
@ -620,6 +636,7 @@ struct VECTOR_ADD
|
|||
case FLOAT32_TYPE:
|
||||
assert_false(is_unsigned);
|
||||
assert_false(saturate);
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
e.vaddps(dest, src1, src2);
|
||||
break;
|
||||
default:
|
||||
|
@ -711,6 +728,7 @@ struct VECTOR_SUB
|
|||
}
|
||||
break;
|
||||
case FLOAT32_TYPE:
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
e.vsubps(dest, src1, src2);
|
||||
break;
|
||||
default:
|
||||
|
@ -2003,6 +2021,7 @@ EMITTER_OPCODE_TABLE(OPCODE_SWIZZLE, SWIZZLE);
|
|||
// ============================================================================
|
||||
struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
switch (i.instr->flags & PACK_TYPE_MODE) {
|
||||
case PACK_TYPE_D3DCOLOR:
|
||||
EmitD3DCOLOR(e, i);
|
||||
|
@ -2062,9 +2081,14 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
|||
alignas(16) uint16_t b[8];
|
||||
_mm_store_ps(a, src1);
|
||||
std::memset(b, 0, sizeof(b));
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[7 - i] = half_float::detail::float2half<std::round_toward_zero>(a[i]);
|
||||
if (!cvars::use_extended_range_half) {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[7 - i] = half_float::detail::float2half<std::round_toward_zero>(a[i]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[7 - i] = float_to_xenos_half(a[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return _mm_load_si128(reinterpret_cast<__m128i*>(b));
|
||||
|
@ -2074,7 +2098,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
|||
// http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx
|
||||
// dest = [(src1.x | src1.y), 0, 0, 0]
|
||||
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C)) {
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
|
||||
Xmm src;
|
||||
if (i.src1.is_constant) {
|
||||
src = i.dest;
|
||||
|
@ -2101,10 +2125,15 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
|||
alignas(16) uint16_t b[8];
|
||||
_mm_store_ps(a, src1);
|
||||
std::memset(b, 0, sizeof(b));
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
b[7 - (i ^ 2)] =
|
||||
half_float::detail::float2half<std::round_toward_zero>(a[i]);
|
||||
if (!cvars::use_extended_range_half) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
b[7 - (i ^ 2)] =
|
||||
half_float::detail::float2half<std::round_toward_zero>(a[i]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
b[7 - (i ^ 2)] = float_to_xenos_half(a[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return _mm_load_si128(reinterpret_cast<__m128i*>(b));
|
||||
|
@ -2113,7 +2142,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
|||
assert_true(i.src2.value->IsConstantZero());
|
||||
// dest = [(src1.z | src1.w), (src1.x | src1.y), 0, 0]
|
||||
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C)) {
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
|
||||
Xmm src;
|
||||
if (i.src1.is_constant) {
|
||||
src = i.dest;
|
||||
|
@ -2420,6 +2449,7 @@ EMITTER_OPCODE_TABLE(OPCODE_PACK, PACK);
|
|||
// ============================================================================
|
||||
struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
switch (i.instr->flags & PACK_TYPE_MODE) {
|
||||
case PACK_TYPE_D3DCOLOR:
|
||||
EmitD3DCOLOR(e, i);
|
||||
|
@ -2478,10 +2508,15 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
|||
alignas(16) float b[4];
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]);
|
||||
if (!cvars::use_extended_range_half) {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[i] = xenos_half_to_float(a[VEC128_W(6 + i)]);
|
||||
}
|
||||
}
|
||||
|
||||
// Constants, or something
|
||||
b[2] = 0.f;
|
||||
b[3] = 1.f;
|
||||
|
@ -2501,7 +2536,9 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
|||
// Also zero out the high end.
|
||||
// TODO(benvanik): special case constant unpacks that just get 0/1/etc.
|
||||
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C)) {
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C) &&
|
||||
!cvars::use_extended_range_half) { // todo: can use cvtph and bit logic
|
||||
// to implement
|
||||
Xmm src;
|
||||
if (i.src1.is_constant) {
|
||||
src = i.dest;
|
||||
|
@ -2534,16 +2571,21 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
|||
alignas(16) uint16_t a[8];
|
||||
alignas(16) float b[4];
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]);
|
||||
if (!cvars::use_extended_range_half) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
b[i] = xenos_half_to_float(a[VEC128_W(4 + i)]);
|
||||
}
|
||||
}
|
||||
|
||||
return _mm_load_ps(b);
|
||||
}
|
||||
static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) {
|
||||
// src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0]
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C)) {
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
|
||||
Xmm src;
|
||||
if (i.src1.is_constant) {
|
||||
src = i.dest;
|
||||
|
@ -2805,6 +2847,32 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
|||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_UNPACK, UNPACK);
|
||||
|
||||
struct SET_NJM_I8 : Sequence<SET_NJM_I8, I<OPCODE_SET_NJM, VoidOp, I8Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
auto addr_vmx = e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_vmx));
|
||||
|
||||
addr_vmx.setBit(32);
|
||||
if (i.src1.is_constant) {
|
||||
if (i.src1.constant() == 0) {
|
||||
// turn off daz/flush2z
|
||||
e.mov(addr_vmx, _MM_MASK_MASK);
|
||||
|
||||
} else {
|
||||
e.mov(addr_vmx, DEFAULT_VMX_MXCSR);
|
||||
}
|
||||
|
||||
} else {
|
||||
e.test(i.src1, i.src1);
|
||||
e.mov(e.edx, DEFAULT_VMX_MXCSR);
|
||||
e.mov(e.eax, _MM_MASK_MASK);
|
||||
|
||||
e.cmove(e.edx, e.eax);
|
||||
e.mov(addr_vmx, e.edx);
|
||||
}
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_SET_NJM, SET_NJM_I8);
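// Editor's sketch: VSCR.NJ ("non-Java" mode) set means VMX flushes denormals,
// so SET_NJM above maps it onto the host MXCSR value kept in mxcsr_vmx.
// A minimal host-side equivalent (standard MXCSR bits, not commit code):
#include <xmmintrin.h>

inline void SetGuestNjm(unsigned int* mxcsr_vmx, bool nj) {
  *mxcsr_vmx = nj ? (0x8000u | 0x0040u | _MM_MASK_MASK)  // FTZ + DAZ, as in DEFAULT_VMX_MXCSR
                  : _MM_MASK_MASK;                       // IEEE denormals, exceptions masked
}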
|
||||
} // namespace x64
|
||||
} // namespace backend
|
||||
} // namespace cpu
|
||||
|
|
File diff suppressed because it is too large
@ -20,6 +20,9 @@
|
|||
DEFINE_bool(inline_mmio_access, true, "Inline constant MMIO loads and stores.",
|
||||
"CPU");
|
||||
|
||||
DEFINE_bool(permit_float_constant_evaluation, false,
            "Allow floating-point constant evaluation; may produce incorrect "
            "results and break game math",
|
||||
"CPU");
|
||||
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
namespace compiler {
|
||||
|
@ -68,8 +71,24 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
result = false;
|
||||
auto block = builder->first_block();
|
||||
while (block) {
|
||||
auto i = block->instr_head;
|
||||
while (i) {
|
||||
for (auto i = block->instr_head; i; i = i->next) {
|
||||
if (((i->opcode->flags & OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING) != 0) &&
|
||||
!cvars::permit_float_constant_evaluation) {
|
||||
continue;
|
||||
}
|
||||
bool might_be_floatop = false;
|
||||
|
||||
i->VisitValueOperands(
|
||||
[&might_be_floatop](Value* current_opnd, uint32_t opnd_index) {
|
||||
might_be_floatop |= current_opnd->MaybeFloaty();
|
||||
});
|
||||
if (i->dest) {
|
||||
might_be_floatop |= i->dest->MaybeFloaty();
|
||||
}
|
||||
|
||||
bool should_skip_because_of_float =
|
||||
might_be_floatop && !cvars::permit_float_constant_evaluation;
|
||||
|
||||
auto v = i->dest;
|
||||
switch (i->opcode->num) {
|
||||
case OPCODE_DEBUG_BREAK_TRUE:
|
||||
|
@ -452,7 +471,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
break;
|
||||
|
||||
case OPCODE_ADD:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
|
||||
!should_skip_because_of_float) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Add(i->src2.value);
|
||||
i->Remove();
|
||||
|
@ -481,7 +501,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
}
|
||||
break;
|
||||
case OPCODE_SUB:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
|
||||
!should_skip_because_of_float) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Sub(i->src2.value);
|
||||
i->Remove();
|
||||
|
@ -489,32 +510,34 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
}
|
||||
break;
|
||||
case OPCODE_MUL:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Mul(i->src2.value);
|
||||
i->Remove();
|
||||
result = true;
|
||||
} else if (i->src1.value->IsConstant() ||
|
||||
i->src2.value->IsConstant()) {
|
||||
// Reorder the sources to make things simpler.
|
||||
// s1 = non-const, s2 = const
|
||||
auto s1 =
|
||||
i->src1.value->IsConstant() ? i->src2.value : i->src1.value;
|
||||
auto s2 =
|
||||
i->src1.value->IsConstant() ? i->src1.value : i->src2.value;
|
||||
|
||||
// Multiplication by one = no-op
|
||||
if (s2->type != VEC128_TYPE && s2->IsConstantOne()) {
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
i->set_src1(s1);
|
||||
if (!should_skip_because_of_float) {
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Mul(i->src2.value);
|
||||
i->Remove();
|
||||
result = true;
|
||||
} else if (s2->type == VEC128_TYPE) {
|
||||
auto& c = s2->constant;
|
||||
if (c.v128.f32[0] == 1.f && c.v128.f32[1] == 1.f &&
|
||||
c.v128.f32[2] == 1.f && c.v128.f32[3] == 1.f) {
|
||||
} else if (i->src1.value->IsConstant() ||
|
||||
i->src2.value->IsConstant()) {
|
||||
// Reorder the sources to make things simpler.
|
||||
// s1 = non-const, s2 = const
|
||||
auto s1 =
|
||||
i->src1.value->IsConstant() ? i->src2.value : i->src1.value;
|
||||
auto s2 =
|
||||
i->src1.value->IsConstant() ? i->src1.value : i->src2.value;
|
||||
|
||||
// Multiplication by one = no-op
|
||||
if (s2->type != VEC128_TYPE && s2->IsConstantOne()) {
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
i->set_src1(s1);
|
||||
result = true;
|
||||
} else if (s2->type == VEC128_TYPE) {
|
||||
auto& c = s2->constant;
|
||||
if (c.v128.f32[0] == 1.f && c.v128.f32[1] == 1.f &&
|
||||
c.v128.f32[2] == 1.f && c.v128.f32[3] == 1.f) {
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
i->set_src1(s1);
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -528,75 +551,32 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
}
|
||||
break;
|
||||
case OPCODE_DIV:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
|
||||
i->Remove();
|
||||
result = true;
|
||||
} else if (i->src2.value->IsConstant()) {
|
||||
// Division by one = no-op.
|
||||
Value* src1 = i->src1.value;
|
||||
if (i->src2.value->type != VEC128_TYPE &&
|
||||
i->src2.value->IsConstantOne()) {
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
i->set_src1(src1);
|
||||
if (!should_skip_because_of_float) {
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
|
||||
i->Remove();
|
||||
result = true;
|
||||
} else if (i->src2.value->type == VEC128_TYPE) {
|
||||
auto& c = i->src2.value->constant;
|
||||
if (c.v128.f32[0] == 1.f && c.v128.f32[1] == 1.f &&
|
||||
c.v128.f32[2] == 1.f && c.v128.f32[3] == 1.f) {
|
||||
} else if (i->src2.value->IsConstant()) {
|
||||
// Division by one = no-op.
|
||||
Value* src1 = i->src1.value;
|
||||
if (i->src2.value->type != VEC128_TYPE &&
|
||||
i->src2.value->IsConstantOne()) {
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
i->set_src1(src1);
|
||||
result = true;
|
||||
} else if (i->src2.value->type == VEC128_TYPE) {
|
||||
auto& c = i->src2.value->constant;
|
||||
if (c.v128.f32[0] == 1.f && c.v128.f32[1] == 1.f &&
|
||||
c.v128.f32[2] == 1.f && c.v128.f32[3] == 1.f) {
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
i->set_src1(src1);
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case OPCODE_MUL_ADD:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
if (i->src3.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
Value::MulAdd(v, i->src1.value, i->src2.value, i->src3.value);
|
||||
i->Remove();
|
||||
result = true;
|
||||
} else {
|
||||
// Multiply part is constant.
|
||||
Value* mul = builder->AllocValue();
|
||||
mul->set_from(i->src1.value);
|
||||
mul->Mul(i->src2.value);
|
||||
|
||||
Value* add = i->src3.value;
|
||||
i->Replace(&OPCODE_ADD_info, 0);
|
||||
i->set_src1(mul);
|
||||
i->set_src2(add);
|
||||
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case OPCODE_MUL_SUB:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
// Multiply part is constant.
|
||||
if (i->src3.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
Value::MulSub(v, i->src1.value, i->src2.value, i->src3.value);
|
||||
i->Remove();
|
||||
result = true;
|
||||
} else {
|
||||
// Multiply part is constant.
|
||||
Value* mul = builder->AllocValue();
|
||||
mul->set_from(i->src1.value);
|
||||
mul->Mul(i->src2.value);
|
||||
|
||||
Value* add = i->src3.value;
|
||||
i->Replace(&OPCODE_SUB_info, 0);
|
||||
i->set_src1(mul);
|
||||
i->set_src2(add);
|
||||
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case OPCODE_MAX:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
|
@ -925,7 +905,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
result = true;
|
||||
}
|
||||
break;
|
||||
case OPCODE_VECTOR_DENORMFLUSH:
|
||||
case OPCODE_VECTOR_DENORMFLUSH: // this one is okay to constant
|
||||
// evaluate, since it is just bit math
|
||||
if (i->src1.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->DenormalFlush();
|
||||
|
@ -933,19 +914,10 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
result = true;
|
||||
}
|
||||
break;
|
||||
case OPCODE_TO_SINGLE:
|
||||
if (i->src1.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->ToSingle();
|
||||
i->Remove();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// Ignored.
|
||||
break;
|
||||
}
|
||||
i = i->next;
|
||||
}
|
||||
|
||||
block = block->next;
|
||||
|
|
|
@ -1287,7 +1287,11 @@ void HIRBuilder::SetRoundingMode(Value* value) {
|
|||
Instr* i = AppendInstr(OPCODE_SET_ROUNDING_MODE_info, 0);
|
||||
i->set_src1(value);
|
||||
}
|
||||
|
||||
void HIRBuilder::SetNJM(Value* value) {
|
||||
ASSERT_INTEGER_TYPE(value);
|
||||
Instr* i = AppendInstr(OPCODE_SET_NJM_info, 0);
|
||||
i->set_src1(value);
|
||||
}
|
||||
Value* HIRBuilder::Max(Value* value1, Value* value2) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
|
||||
|
@ -1632,7 +1636,7 @@ Value* HIRBuilder::Div(Value* value1, Value* value2,
|
|||
Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
ASSERT_TYPES_EQUAL(value1, value3);
|
||||
|
||||
#if 0
|
||||
bool c1 = value1->IsConstant();
|
||||
bool c2 = value2->IsConstant();
|
||||
if (c1 && c2) {
|
||||
|
@ -1640,7 +1644,7 @@ Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
|
|||
dest->Mul(value2);
|
||||
return Add(dest, value3);
|
||||
}
|
||||
|
||||
#endif
|
||||
Instr* i = AppendInstr(OPCODE_MUL_ADD_info, 0, AllocValue(value1->type));
|
||||
i->set_src1(value1);
|
||||
i->set_src2(value2);
|
||||
|
@ -1651,7 +1655,7 @@ Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
|
|||
Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
ASSERT_TYPES_EQUAL(value1, value3);
|
||||
|
||||
#if 0
|
||||
bool c1 = value1->IsConstant();
|
||||
bool c2 = value2->IsConstant();
|
||||
if (c1 && c2) {
|
||||
|
@ -1659,7 +1663,7 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
|
|||
dest->Mul(value2);
|
||||
return Sub(dest, value3);
|
||||
}
|
||||
|
||||
#endif
|
||||
Instr* i = AppendInstr(OPCODE_MUL_SUB_info, 0, AllocValue(value1->type));
|
||||
i->set_src1(value1);
|
||||
i->set_src2(value2);
|
||||
|
|
|
@ -264,7 +264,7 @@ class HIRBuilder {
|
|||
Value* new_value);
|
||||
Value* AtomicAdd(Value* address, Value* value);
|
||||
Value* AtomicSub(Value* address, Value* value);
|
||||
|
||||
void SetNJM(Value* value);
|
||||
protected:
|
||||
void DumpValue(StringBuffer* str, Value* value);
|
||||
void DumpOp(StringBuffer* str, OpcodeSignatureType sig_type, Instr::Op* op);
|
||||
|
|
|
@ -284,6 +284,7 @@ enum Opcode {
|
|||
OPCODE_TO_SINGLE, // i could not find a decent name to assign to this opcode,
|
||||
// as we already have OPCODE_ROUND. round double to float (
|
||||
// ppc "single" fpu instruction result rounding behavior )
|
||||
OPCODE_SET_NJM,
|
||||
__OPCODE_MAX_VALUE, // Keep at end.
|
||||
};
|
||||
|
||||
|
@ -295,6 +296,7 @@ enum OpcodeFlags {
|
|||
OPCODE_FLAG_IGNORE = (1 << 5),
|
||||
OPCODE_FLAG_HIDE = (1 << 6),
|
||||
OPCODE_FLAG_PAIRED_PREV = (1 << 7),
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING = (1 << 8)
|
||||
};
|
||||
|
||||
enum OpcodeSignatureType {
|
||||
|
|
|
@ -151,25 +151,25 @@ DEFINE_OPCODE(
|
|||
OPCODE_CONVERT,
|
||||
"convert",
|
||||
OPCODE_SIG_V_V,
|
||||
0)
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_ROUND,
|
||||
"round",
|
||||
OPCODE_SIG_V_V,
|
||||
0)
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_VECTOR_CONVERT_I2F,
|
||||
"vector_convert_i2f",
|
||||
OPCODE_SIG_V_V,
|
||||
0)
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_VECTOR_CONVERT_F2I,
|
||||
"vector_convert_f2i",
|
||||
OPCODE_SIG_V_V,
|
||||
0)
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_LOAD_VECTOR_SHL,
|
||||
|
@ -456,13 +456,13 @@ DEFINE_OPCODE(
|
|||
OPCODE_MUL_ADD,
|
||||
"mul_add",
|
||||
OPCODE_SIG_V_V_V_V,
|
||||
0)
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_MUL_SUB,
|
||||
"mul_sub",
|
||||
OPCODE_SIG_V_V_V_V,
|
||||
0)
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_NEG,
|
||||
|
@ -480,43 +480,43 @@ DEFINE_OPCODE(
|
|||
OPCODE_SQRT,
|
||||
"sqrt",
|
||||
OPCODE_SIG_V_V,
|
||||
0)
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_RSQRT,
|
||||
"rsqrt",
|
||||
OPCODE_SIG_V_V,
|
||||
0)
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_RECIP,
|
||||
"recip",
|
||||
OPCODE_SIG_V_V,
|
||||
0)
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_POW2,
|
||||
"pow2",
|
||||
OPCODE_SIG_V_V,
|
||||
0)
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_LOG2,
|
||||
"log2",
|
||||
OPCODE_SIG_V_V,
|
||||
0)
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_DOT_PRODUCT_3,
|
||||
"dot_product_3",
|
||||
OPCODE_SIG_V_V_V,
|
||||
0)
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_DOT_PRODUCT_4,
|
||||
"dot_product_4",
|
||||
OPCODE_SIG_V_V_V,
|
||||
0)
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_AND,
|
||||
|
@ -685,5 +685,11 @@ DEFINE_OPCODE(
|
|||
OPCODE_TO_SINGLE,
|
||||
"to_single",
|
||||
OPCODE_SIG_V_V,
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING
|
||||
)
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_SET_NJM,
|
||||
"set_njm",
|
||||
OPCODE_SIG_X_V,
|
||||
0
|
||||
)
|
|
@ -199,7 +199,7 @@ void Value::Truncate(TypeName target_type) {
|
|||
return;
|
||||
}
|
||||
}
|
||||
|
||||
//WARNING: this does not handle rounding flags at all!
|
||||
void Value::Convert(TypeName target_type, RoundMode round_mode) {
|
||||
switch (type) {
|
||||
case FLOAT32_TYPE:
|
||||
|
@ -401,7 +401,7 @@ void Value::MulHi(Value* other, bool is_unsigned) {
|
|||
32);
|
||||
}
|
||||
break;
|
||||
case INT64_TYPE:
|
||||
case INT64_TYPE: {
|
||||
#if XE_COMPILER_MSVC
|
||||
if (is_unsigned) {
|
||||
constant.i64 = __umulh(constant.i64, other->constant.i64);
|
||||
|
@ -409,17 +409,19 @@ void Value::MulHi(Value* other, bool is_unsigned) {
|
|||
constant.i64 = __mulh(constant.i64, other->constant.i64);
|
||||
}
|
||||
#else
|
||||
unsigned __int128 product;
|
||||
if (is_unsigned) {
|
||||
constant.i64 = static_cast<uint64_t>(
|
||||
static_cast<unsigned __int128>(constant.i64) *
|
||||
static_cast<unsigned __int128>(other->constant.i64));
|
||||
product = static_cast<unsigned __int128>(constant.i64) *
|
||||
static_cast<unsigned __int128>(other->constant.i64);
|
||||
} else {
|
||||
constant.i64 =
|
||||
static_cast<uint64_t>(static_cast<__int128>(constant.i64) *
|
||||
static_cast<__int128>(other->constant.i64));
|
||||
product = static_cast<unsigned __int128>(
|
||||
static_cast<__int128>(constant.i64) *
|
||||
static_cast<__int128>(other->constant.i64));
|
||||
}
|
||||
constant.i64 = static_cast<int64_t>(product >> 64);
|
||||
#endif // XE_COMPILER_MSVC
|
||||
break;
|
||||
}
|
||||
default:
|
||||
assert_unhandled_case(type);
|
||||
break;
|
||||
|
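// Editor's note: the non-MSVC MulHi branch above previously cast the __int128
// product straight back to 64 bits, i.e. it produced the *low* half; the fix
// keeps the full product and shifts right by 64. Standalone check of the
// corrected behavior (requires GCC/Clang __int128, matching that #else branch):
#include <cassert>
#include <cstdint>

static std::uint64_t mulhi_u64(std::uint64_t a, std::uint64_t b) {
  return static_cast<std::uint64_t>(
      (static_cast<unsigned __int128>(a) * static_cast<unsigned __int128>(b)) >> 64);
}

int main() {
  // (2^64 - 1)^2 = 2^128 - 2^65 + 1; its high 64 bits are 2^64 - 2.
  assert(mulhi_u64(~0ull, ~0ull) == 0xFFFFFFFFFFFFFFFEull);
  // The old code would have returned the low half, which is 1.
  assert(static_cast<std::uint64_t>(~0ull * ~0ull) == 1ull);
  return 0;
}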
@ -495,52 +497,6 @@ void Value::Max(Value* other) {
|
|||
}
|
||||
}
|
||||
|
||||
void Value::MulAdd(Value* dest, Value* value1, Value* value2, Value* value3) {
|
||||
switch (dest->type) {
|
||||
case VEC128_TYPE:
|
||||
for (int i = 0; i < 4; i++) {
|
||||
dest->constant.v128.f32[i] =
|
||||
(value1->constant.v128.f32[i] * value2->constant.v128.f32[i]) +
|
||||
value3->constant.v128.f32[i];
|
||||
}
|
||||
break;
|
||||
case FLOAT32_TYPE:
|
||||
dest->constant.f32 =
|
||||
(value1->constant.f32 * value2->constant.f32) + value3->constant.f32;
|
||||
break;
|
||||
case FLOAT64_TYPE:
|
||||
dest->constant.f64 =
|
||||
(value1->constant.f64 * value2->constant.f64) + value3->constant.f64;
|
||||
break;
|
||||
default:
|
||||
assert_unhandled_case(dest->type);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void Value::MulSub(Value* dest, Value* value1, Value* value2, Value* value3) {
|
||||
switch (dest->type) {
|
||||
case VEC128_TYPE:
|
||||
for (int i = 0; i < 4; i++) {
|
||||
dest->constant.v128.f32[i] =
|
||||
(value1->constant.v128.f32[i] * value2->constant.v128.f32[i]) -
|
||||
value3->constant.v128.f32[i];
|
||||
}
|
||||
break;
|
||||
case FLOAT32_TYPE:
|
||||
dest->constant.f32 =
|
||||
(value1->constant.f32 * value2->constant.f32) - value3->constant.f32;
|
||||
break;
|
||||
case FLOAT64_TYPE:
|
||||
dest->constant.f64 =
|
||||
(value1->constant.f64 * value2->constant.f64) - value3->constant.f64;
|
||||
break;
|
||||
default:
|
||||
assert_unhandled_case(dest->type);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void Value::Neg() {
|
||||
switch (type) {
|
||||
case INT8_TYPE:
|
||||
|
@ -1643,11 +1599,7 @@ void Value::DenormalFlush() {
|
|||
constant.v128.u32[i] = current_element;
|
||||
}
|
||||
}
|
||||
void Value::ToSingle() {
|
||||
assert_true(type == FLOAT64_TYPE);
|
||||
|
||||
constant.f64 = static_cast<double>(static_cast<float>(constant.f64));
|
||||
}
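// Editor's sketch: Value::ToSingle() above models the PPC single-precision
// result rounding (the behavior OPCODE_TO_SINGLE was added for) by bouncing
// the double through float. A tiny standalone illustration of what that drops:
#include <cstdio>

int main() {
  double d = 0.1;  // not exactly representable
  double single_rounded = static_cast<double>(static_cast<float>(d));
  std::printf("%.17g -> %.17g\n", d, single_rounded);
  // prints: 0.10000000000000001 -> 0.10000000149011612
}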
|
||||
void Value::CountLeadingZeros(const Value* other) {
|
||||
switch (other->type) {
|
||||
case INT8_TYPE:
|
||||
|
|
|
@ -563,8 +563,7 @@ class Value {
|
|||
void MulHi(Value* other, bool is_unsigned);
|
||||
void Div(Value* other, bool is_unsigned);
|
||||
void Max(Value* other);
|
||||
static void MulAdd(Value* dest, Value* value1, Value* value2, Value* value3);
|
||||
static void MulSub(Value* dest, Value* value1, Value* value2, Value* value3);
|
||||
|
||||
void Neg();
|
||||
void Abs();
|
||||
void Sqrt();
|
||||
|
@ -603,7 +602,6 @@ class Value {
|
|||
bool saturate);
|
||||
void ByteSwap();
|
||||
void DenormalFlush();
|
||||
void ToSingle();
|
||||
void CountLeadingZeros(const Value* other);
|
||||
bool Compare(Opcode opcode, Value* other);
|
||||
hir::Instr* GetDefSkipAssigns();
|
||||
|
@ -615,7 +613,10 @@ class Value {
|
|||
// returns true if every single use is as an operand to a single instruction
|
||||
// (add var2, var1, var1)
|
||||
bool AllUsesByOneInsn() const;
|
||||
|
||||
// The "maybe" is here because this includes vec128, which is untyped data that may be treated as float or int depending on context.
|
||||
bool MaybeFloaty() const {
|
||||
return type == FLOAT32_TYPE || type == FLOAT64_TYPE || type == VEC128_TYPE;
|
||||
}
|
||||
private:
|
||||
static bool CompareInt8(Opcode opcode, Value* a, Value* b);
|
||||
static bool CompareInt16(Opcode opcode, Value* a, Value* b);
|
||||
|
|
|
@ -364,7 +364,16 @@ int InstrEmit_mfvscr(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) {
|
||||
// is this the right format?
|
||||
// TODO: what mtvscr does with the unused bits is implementation-defined; figure out what the hardware actually does
|
||||
|
||||
|
||||
Value* v = f.LoadVR(i.VX128_1.RB);
|
||||
|
||||
|
||||
Value* has_njm_value = f.Extract(v, (uint8_t)3, INT32_TYPE);
|
||||
|
||||
f.SetNJM(f.IsTrue(f.And(has_njm_value, f.LoadConstantInt32(65536))));
|
||||
|
||||
f.StoreContext(offsetof(PPCContext, vscr_vec), v);
|
||||
return 0;
|
||||
}
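// Editor's sketch of the NJM extraction the emitter code above performs on the
// value written by mtvscr: 32-bit element 3 of the vector register holds VSCR,
// and bit 0x10000 is the non-Java-mode flag forwarded to SET_NJM.
#include <cstdint>

inline bool VscrNonJavaMode(const std::uint32_t vr[4]) {
  std::uint32_t vscr_word = vr[3];     // f.Extract(v, 3, INT32_TYPE)
  return (vscr_word & 0x10000u) != 0;  // f.And(..., 65536) then f.IsTrue(...)
}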
|
||||
|
|
|
@ -382,7 +382,6 @@ int InstrEmit_mtfsfx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
return 1;
|
||||
} else {
|
||||
assert_zero(i.XFL.W);
|
||||
|
||||
// Store under control of mask.
|
||||
// Expand the mask from 8 bits -> 32 bits.
|
||||
uint32_t mask = 0;
|
||||
|
@ -402,7 +401,7 @@ int InstrEmit_mtfsfx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
// Update the system rounding mode.
|
||||
if (mask & 0x7) {
|
||||
f.SetRoundingMode(v);
|
||||
f.SetRoundingMode(f.And(v, f.LoadConstantInt32(7)));
|
||||
}
|
||||
}
|
||||
if (i.XFL.Rc) {
|
||||
|
@ -425,7 +424,7 @@ int InstrEmit_mtfsfix(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
// Update the system rounding mode.
|
||||
if (mask & 0x7) {
|
||||
f.SetRoundingMode(fpscr);
|
||||
f.SetRoundingMode(f.And(fpscr, f.LoadConstantInt32(7)));
|
||||
}
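// Editor's note: the added f.And(..., 7) keeps only the low three FPSCR bits
// before calling SetRoundingMode: the two RN rounding-control bits plus NI.
// For reference, the standard PowerPC RN encoding mapped to host fenv modes:
#include <cfenv>

inline int HostRoundFromPpcRn(unsigned fpscr_low_bits) {
  switch (fpscr_low_bits & 0x3) {  // RN field
    case 0: return FE_TONEAREST;   // round to nearest (ties to even)
    case 1: return FE_TOWARDZERO;  // round toward zero
    case 2: return FE_UPWARD;      // round toward +infinity
    default: return FE_DOWNWARD;   // round toward -infinity
  }
}
// Bit 2 of the masked value is FPSCR.NI (non-IEEE mode), handled separately.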
|
||||
|
||||
if (i.X.Rc) {
|
||||
|
|
|
@ -64,9 +64,13 @@ DEFINE_string(
|
|||
"or the module specified by the game. Leave blank to launch the default "
|
||||
"module.",
|
||||
"General");
|
||||
DEFINE_bool(allow_game_relative_writes, false,
|
||||
"Not useful to non-developers. Allows code to write to paths "
|
||||
"relative to game://. Used for "
|
||||
"generating test data to compare with original hardware. ",
|
||||
"General");
|
||||
|
||||
namespace xe {
|
||||
|
||||
using namespace xe::literals;
|
||||
|
||||
Emulator::GameConfigLoadCallback::GameConfigLoadCallback(Emulator& emulator)
|
||||
|
@ -282,7 +286,8 @@ const std::unique_ptr<vfs::Device> Emulator::CreateVfsDeviceBasedOnPath(
|
|||
auto extension = xe::utf8::lower_ascii(xe::path_to_utf8(path.extension()));
|
||||
if (extension == ".xex" || extension == ".elf" || extension == ".exe") {
|
||||
auto parent_path = path.parent_path();
|
||||
return std::make_unique<vfs::HostPathDevice>(mount_path, parent_path, true);
|
||||
return std::make_unique<vfs::HostPathDevice>(
|
||||
mount_path, parent_path, !cvars::allow_game_relative_writes);
|
||||
} else {
|
||||
return std::make_unique<vfs::DiscImageDevice>(mount_path, path);
|
||||
}
|
||||
|
@ -653,8 +658,8 @@ bool Emulator::ExceptionCallback(Exception* ex) {
|
|||
// debugger.
|
||||
return false;
|
||||
} else if (processor()->is_debugger_attached()) {
|
||||
// Let the debugger handle this exception. It may decide to continue past it
|
||||
// (if it was a stepping breakpoint, etc).
|
||||
// Let the debugger handle this exception. It may decide to continue past
|
||||
// it (if it was a stepping breakpoint, etc).
|
||||
return processor()->OnUnhandledException(ex);
|
||||
}
|
||||
|
||||
|
@ -823,8 +828,8 @@ static std::string format_version(xex2_version version) {
|
|||
|
||||
X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
|
||||
const std::string_view module_path) {
|
||||
// Making changes to the UI (setting the icon) and executing game config load
|
||||
// callbacks which expect to be called from the UI thread.
|
||||
// Making changes to the UI (setting the icon) and executing game config
|
||||
// load callbacks which expect to be called from the UI thread.
|
||||
assert_true(display_window_->app_context().IsInUIThread());
|
||||
|
||||
// Setup NullDevices for raw HDD partition accesses
|
||||
|
@ -832,12 +837,12 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
|
|||
// By using a NullDevice that just returns success to all IO requests it
|
||||
// should allow games to believe cache/raw disk was accessed successfully
|
||||
|
||||
// NOTE: this should probably be moved to xenia_main.cc, but right now we need
|
||||
// to register the \Device\Harddisk0\ NullDevice _after_ the
|
||||
// NOTE: this should probably be moved to xenia_main.cc, but right now we
|
||||
// need to register the \Device\Harddisk0\ NullDevice _after_ the
|
||||
// \Device\Harddisk0\Partition1 HostPathDevice, otherwise requests to
|
||||
// Partition1 will go to this. Registering during CompleteLaunch allows us to
|
||||
// make sure any HostPathDevices are ready beforehand.
|
||||
// (see comment above cache:\ device registration for more info about why)
|
||||
// Partition1 will go to this. Registering during CompleteLaunch allows us
|
||||
// to make sure any HostPathDevices are ready beforehand. (see comment above
|
||||
// cache:\ device registration for more info about why)
|
||||
auto null_paths = {std::string("\\Partition0"), std::string("\\Cache0"),
|
||||
std::string("\\Cache1")};
|
||||
auto null_device =
|
||||
|
@ -900,8 +905,8 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
|
|||
if (module->title_id()) {
|
||||
auto title_id = fmt::format("{:08X}", module->title_id());
|
||||
|
||||
// Load the per-game configuration file and make sure updates are handled by
|
||||
// the callbacks.
|
||||
// Load the per-game configuration file and make sure updates are handled
|
||||
// by the callbacks.
|
||||
config::LoadGameConfig(title_id);
|
||||
assert_true(game_config_load_callback_loop_next_index_ == SIZE_MAX);
|
||||
game_config_load_callback_loop_next_index_ = 0;
|
||||
|
@ -934,10 +939,10 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
|
|||
}
|
||||
}
|
||||
|
||||
// Initializing the shader storage in a blocking way so the user doesn't miss
|
||||
// the initial seconds - for instance, sound from an intro video may start
|
||||
// playing before the video can be seen if doing this in parallel with the
|
||||
// main thread.
|
||||
// Initializing the shader storage in a blocking way so the user doesn't
|
||||
// miss the initial seconds - for instance, sound from an intro video may
|
||||
// start playing before the video can be seen if doing this in parallel with
|
||||
// the main thread.
|
||||
on_shader_storage_initialization(true);
|
||||
graphics_system_->InitializeShaderStorage(cache_root_, title_id_.value(),
|
||||
true);
|
||||
|