implement bit-perfect vrsqrtefp

This commit is contained in:
disjtqz 2023-09-30 14:59:56 -04:00 committed by Radosław Gliński
parent cfecdcbeab
commit 79465708aa
9 changed files with 540 additions and 28 deletions

View File

@ -73,6 +73,9 @@ class X64HelperEmitter : public X64Emitter {
void* EmitTryAcquireReservationHelper();
void* EmitReservedStoreHelper(bool bit64 = false);
void* EmitScalarVRsqrteHelper();
void* EmitVectorVRsqrteHelper(void* scalar_helper);
private:
void* EmitCurrentForOffsets(const _code_offsets& offsets,
size_t stack_size = 0);
@ -207,6 +210,8 @@ bool X64Backend::Initialize(Processor* processor) {
if (!code_cache_->Initialize()) {
return false;
}
// Allocate emitter constant data.
emitter_data_ = X64Emitter::PlaceConstData();
// Generate thunks used to transition between jitted code and host code.
XbyakAllocator allocator;
@ -233,7 +238,8 @@ bool X64Backend::Initialize(Processor* processor) {
thunk_emitter.EmitTryAcquireReservationHelper();
reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false);
reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true);
vrsqrtefp_scalar_helper = thunk_emitter.EmitScalarVRsqrteHelper();
vrsqrtefp_vector_helper = thunk_emitter.EmitVectorVRsqrteHelper(vrsqrtefp_scalar_helper);
// Set the code cache to use the ResolveFunction thunk for default
// indirections.
assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull);
@ -243,9 +249,6 @@ bool X64Backend::Initialize(Processor* processor) {
// Allocate some special indirections.
code_cache_->CommitExecutableRange(0x9FFF0000, 0x9FFFFFFF);
// Allocate emitter constant data.
emitter_data_ = X64Emitter::PlaceConstData();
// Setup exception callback
ExceptionHandler::Install(&ExceptionCallbackThunk, this);
if (cvars::record_mmio_access_exceptions) {
@ -844,7 +847,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
_code_offsets code_offsets = {};
code_offsets.prolog = getSize();
pop(r8); // return address
switch (stack_element_size) {
case 4:
mov(r11d, ptr[r8]);
@ -865,6 +868,300 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
return EmitCurrentForOffsets(code_offsets);
}
void* X64HelperEmitter::EmitScalarVRsqrteHelper() {
_code_offsets code_offsets = {};
Xbyak::Label L18, L2, L35, L4, L9, L8, L10, L11, L12, L13, L1;
Xbyak::Label LC1, _LCPI3_1;
Xbyak::Label handle_denormal_input;
Xbyak::Label specialcheck_1, convert_to_signed_inf_and_ret, handle_oddball_denormal;
auto emulate_lzcnt_helper_unary_reg = [this](auto& reg, auto& scratch_reg) {
inLocalLabel();
Xbyak::Label end_lzcnt;
bsr(scratch_reg, reg);
mov(reg, 0x20);
jz(end_lzcnt);
xor_(scratch_reg, 0x1F);
mov(reg, scratch_reg);
L(end_lzcnt);
outLocalLabel();
};
vmovd(r8d, xmm0);
vmovaps(xmm1, xmm0);
mov(ecx, r8d);
//extract mantissa
and_(ecx, 0x7fffff);
mov(edx, ecx);
cmp(r8d, 0xff800000);
jz(specialcheck_1, CodeGenerator::T_NEAR);
//is exponent zero?
test(r8d, 0x7f800000);
jne(L18);
test(ecx, ecx);
jne(L2);
L(L18);
//extract biased exponent and unbias
mov(r9d, r8d);
shr(r9d, 23);
movzx(r9d, r9b);
lea(eax, ptr[r9 - 127]);
cmp(r9d, 255);
jne(L4);
jmp(L35);
L(L2);
bt(GetBackendFlagsPtr(), kX64BackendNJMOn);
jnc(handle_denormal_input, CodeGenerator::T_NEAR);
// handle denormal input with NJM on
// denorms get converted to zero w/ input sign, jump to our label
// that handles inputs of 0 for this
jmp(convert_to_signed_inf_and_ret);
L(L35);
vxorps(xmm0, xmm0, xmm0);
mov(eax, 128);
vcomiss(xmm1, xmm0);
jb(L4);
test(ecx, ecx);
jne(L8);
ret();
L(L4);
cmp(eax, 128);
jne(L9);
vxorps(xmm0, xmm0, xmm0);
vcomiss(xmm0, xmm1);
jbe(L9);
vmovss(xmm2, ptr[rip+LC1]);
vandps(xmm1, GetXmmConstPtr(XMMSignMaskF32));
test(edx, edx);
jne(L8);
vorps(xmm0, xmm2, xmm2);
ret();
L(L9);
test(edx, edx);
je(L10);
cmp(eax, 128);
jne(L11);
L(L8);
or_(r8d, 0x400000);
vmovd(xmm0, r8d);
ret();
L(L10);
test(r9d, r9d);
jne(L11);
L(convert_to_signed_inf_and_ret);
not_(r8d);
shr(r8d, 31);
lea(rdx, ptr[rip + _LCPI3_1]);
shl(r8d, 2);
vmovss(xmm0, ptr[r8 + rdx]);
ret();
L(L11);
vxorps(xmm2, xmm2, xmm2);
vmovss(xmm0, ptr[rip+LC1]);
vcomiss(xmm2, xmm1);
ja(L1, CodeGenerator::T_NEAR);
mov(ecx, 127);
sal(eax, 4);
sub(ecx, r9d);
mov(r9d, edx);
and_(eax, 16);
shr(edx, 9);
shr(r9d, 19);
and_(edx, 1023);
sar(ecx, 1);
or_(eax, r9d);
xor_(eax, 16);
mov(r9d, ptr[backend()->LookupXMMConstantAddress32(XMMVRsqrteTableStart) +
rax * 4]);
mov(eax, r9d);
shr(r9d, 16);
imul(edx, r9d);
sal(eax, 10);
and_(eax, 0x3fffc00);
sub(eax, edx);
bt(eax, 25);
jc(L12);
mov(edx, eax);
add(ecx, 6);
and_(edx, 0x1ffffff);
if (IsFeatureEnabled(kX64EmitLZCNT)) {
lzcnt(edx, edx);
} else {
emulate_lzcnt_helper_unary_reg(edx, r9d);
}
lea(r9d, ptr[rdx - 6]);
sub(ecx, edx);
if (IsFeatureEnabled(kX64EmitBMI2)) {
shlx(eax, eax, r9d);
} else {
xchg(ecx, r9d);
shl(eax, cl);
xchg(ecx, r9d);
}
L(L12);
test(al, 5);
je(L13);
test(al, 2);
je(L13);
add(eax, 4);
L(L13);
sal(ecx, 23);
and_(r8d, 0x80000000);
shr(eax, 2);
add(ecx, 0x3f800000);
and_(eax, 0x7fffff);
vxorps(xmm1, xmm1);
or_(ecx, r8d);
or_(ecx, eax);
vmovd(xmm0, ecx);
vaddss(xmm0, xmm1);//apply DAZ behavior to output
L(L1);
ret();
L(handle_denormal_input);
mov(r9d, r8d);
and_(r9d, 0x7FFFFFFF);
cmp(r9d, 0x400000);
jz(handle_oddball_denormal);
if (IsFeatureEnabled(kX64EmitLZCNT)) {
lzcnt(ecx, ecx);
} else {
emulate_lzcnt_helper_unary_reg(ecx, r9d);
}
mov(r9d, 9);
mov(eax, -118);
lea(edx, ptr[rcx - 8]);
sub(r9d, ecx);
sub(eax, ecx);
if (IsFeatureEnabled(kX64EmitBMI2)) {
shlx(edx, r8d, edx);
} else {
xchg(ecx, edx);
// r8d still holds xmm0's low word, so we can restore it from there after spoiling it
shl(r8d, cl);
mov(ecx, edx);  // restore ecx; don't xchg because we're going to spoil edx anyway
mov(edx, r8d);
vmovd(r8d, xmm0);
}
and_(edx, 0x7ffffe);
jmp(L4);
L(specialcheck_1);
//should be extremely rare
vmovss(xmm0, ptr[rip+LC1]);
ret();
L(handle_oddball_denormal);
not_(r8d);
lea(r9, ptr[rip + LC1]);
shr(r8d, 31);
movss(xmm0, ptr[r9 + r8 * 4]);
ret();
L(_LCPI3_1);
dd(0xFF800000);
dd(0x7F800000);
L(LC1);
//the position of 0x7FC00000 here matters; this address will be indexed in handle_oddball_denormal
dd(0x7FC00000);
dd(0x5F34FD00);
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
code_offsets.prolog = getSize();
code_offsets.epilog = getSize();
code_offsets.tail = getSize();
return EmitCurrentForOffsets(code_offsets);
}
void* X64HelperEmitter::EmitVectorVRsqrteHelper(void* scalar_helper) {
_code_offsets code_offsets = {};
Xbyak::Label check_scalar_operation_in_vmx, actual_vector_version;
auto result_ptr =
GetBackendCtxPtr(offsetof(X64BackendContext, helper_scratch_xmms[0]));
auto counter_ptr = GetBackendCtxPtr(offsetof(X64BackendContext, helper_scratch_u64s[2]));
counter_ptr.setBit(64);
//shuffle and xor to check whether all lanes are equal
//sadly has to leave the float pipeline for the vptest, which is moderate yikes
vmovhlps(xmm2, xmm0, xmm0);
vmovsldup(xmm1, xmm0);
vxorps(xmm1, xmm1, xmm0);
vxorps(xmm2, xmm2, xmm0);
vorps(xmm2, xmm1, xmm2);
vptest(xmm2, xmm2);
jnz(check_scalar_operation_in_vmx);
//jmp(scalar_helper, CodeGenerator::T_NEAR);
call(scalar_helper);
vshufps(xmm0, xmm0, xmm0, 0);
ret();
L(check_scalar_operation_in_vmx);
vptest(xmm0, ptr[backend()->LookupXMMConstantAddress(XMMThreeFloatMask)]);
jnz(actual_vector_version);
vshufps(xmm0, xmm0,xmm0, _MM_SHUFFLE(3, 3, 3, 3));
call(scalar_helper);
// this->DebugBreak();
vinsertps(xmm0, xmm0, (3 << 4) | (0 << 6));
vblendps(xmm0, xmm0, ptr[backend()->LookupXMMConstantAddress(XMMFloatInf)],
0b0111);
ret();
L(actual_vector_version);
xor_(ecx, ecx);
vmovaps(result_ptr, xmm0);
mov(counter_ptr, rcx);
Xbyak::Label loop;
L(loop);
lea(rax, result_ptr);
vmovss(xmm0, ptr[rax+rcx*4]);
call(scalar_helper);
mov(rcx, counter_ptr);
lea(rax, result_ptr);
vmovss(ptr[rax+rcx*4], xmm0);
inc(ecx);
cmp(ecx, 4);
mov(counter_ptr, rcx);
jl(loop);
vmovaps(xmm0, result_ptr);
ret();
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
code_offsets.epilog = getSize();
code_offsets.tail = getSize();
code_offsets.prolog = getSize();
return EmitCurrentForOffsets(code_offsets);
}
void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
_code_offsets code_offsets = {};
code_offsets.prolog = getSize();
@ -872,7 +1169,7 @@ void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
Xbyak::Label already_has_a_reservation;
Xbyak::Label acquire_new_reservation;
btr(GetBackendFlagsPtr(), 1);
btr(GetBackendFlagsPtr(), kX64BackendHasReserveBit);
mov(r8, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
jc(already_has_a_reservation);
@ -888,7 +1185,7 @@ void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
// set flag on local backend context for thread to indicate our previous
// attempt to get the reservation succeeded
setnc(r9b); // success = bitmap did not have a set bit at the idx
shl(r9b, 1);
shl(r9b, kX64BackendHasReserveBit);
mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
rdx);
@ -917,7 +1214,7 @@ void* X64HelperEmitter::EmitReservedStoreHelper(bool bit64) {
Xbyak::Label somehow_double_cleared;
// carry must be set + zero flag must be set
btr(GetBackendFlagsPtr(), 1);
btr(GetBackendFlagsPtr(), kX64BackendHasReserveBit);
jnc(done);
@ -1097,7 +1394,7 @@ void X64Backend::InitializeBackendContext(void* ctx) {
: nullptr;
bctx->current_stackpoint_depth = 0;
bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR;
bctx->flags = 0;
bctx->flags = (1U << kX64BackendNJMOn); // NJM on by default
// https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
bctx->Ox1000 = 0x1000;
bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
@ -1128,7 +1425,9 @@ void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) {
uint32_t control = mode & 7;
_mm_setcsr(mxcsr_table[control]);
bctx->mxcsr_fpu = mxcsr_table[control];
((ppc::PPCContext*)ctx)->fpscr.bits.rn = control;
auto ppc_context = ((ppc::PPCContext*)ctx);
ppc_context->fpscr.bits.rn = control;
ppc_context->fpscr.bits.ni = control >> 2;
}
bool X64Backend::PopulatePseudoStacktrace(GuestPseudoStackTrace* st) {
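Aside: the emulate_lzcnt_helper_unary_reg lambda in EmitScalarVRsqrteHelper above stands in for LZCNT on CPUs without it. A minimal C++ sketch of what it computes, assuming a GCC/Clang-style __builtin_clz (illustrative only, not part of the commit):

#include <cstdint>

// Equivalent of the bsr/xor sequence: bsr finds the index of the highest set
// bit (and leaves ZF set for a zero input); xor with 0x1F converts that index
// into a leading-zero count.
static uint32_t emulated_lzcnt32(uint32_t x) {
  if (x == 0) return 0x20;  // jz(end_lzcnt) path: result is 32
  uint32_t msb_index = 31u - static_cast<uint32_t>(__builtin_clz(x));  // what bsr computes
  return msb_index ^ 0x1F;  // == 31 - msb_index == leading-zero count
}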

View File

@ -61,11 +61,22 @@ struct X64BackendStackpoint {
// use
unsigned guest_return_address_;
};
enum : uint32_t {
kX64BackendMXCSRModeBit = 0,
kX64BackendHasReserveBit = 1,
kX64BackendNJMOn = 2,  // non-Java mode bit is currently set; for use in software fp routines
kX64BackendNonIEEEMode = 3,  // non-IEEE mode is currently enabled for the scalar fpu
};
// located prior to the ctx register
// some things it would be nice to have be per-emulator instance instead of per
// context (somehow placing a global X64BackendCtx prior to membase, so we can
// negatively index the membase reg)
struct X64BackendContext {
union {
__m128 helper_scratch_xmms[4];
uint64_t helper_scratch_u64s[8];
uint32_t helper_scratch_u32s[16];
};
ReserveHelper* reserve_helper_;
uint64_t cached_reserve_value_;
// guest_tick_count is used if inline_loadclock is used
@ -147,6 +158,13 @@ class X64Backend : public Backend {
virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;
virtual bool PopulatePseudoStacktrace(GuestPseudoStackTrace* st) override;
void RecordMMIOExceptionForGuestInstruction(void* host_address);
uint32_t LookupXMMConstantAddress32(unsigned index) {
return static_cast<uint32_t>(emitter_data() + sizeof(vec128_t) * index);
}
void* LookupXMMConstantAddress(unsigned index) {
return reinterpret_cast<void*>(emitter_data() + sizeof(vec128_t) * index);
}
#if XE_X64_PROFILER_AVAILABLE == 1
uint64_t* GetProfilerRecordForFunction(uint32_t guest_address);
#endif
@ -173,7 +191,8 @@ class X64Backend : public Backend {
void* try_acquire_reservation_helper_ = nullptr;
void* reserved_store_32_helper = nullptr;
void* reserved_store_64_helper = nullptr;
void* vrsqrtefp_vector_helper = nullptr;
void* vrsqrtefp_scalar_helper = nullptr;
private:
#if XE_X64_PROFILER_AVAILABLE == 1
GuestProfilerData profiler_data_;
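A brief usage sketch for the two new constant-address helpers (hypothetical caller, not from the commit): both resolve emitter_data() + sizeof(vec128_t) * index, and the 32-bit variant is lossless only because PlaceConstData asserts the constant pool sits below 2 GiB.

#include <cassert>
#include <cstdint>

// Hypothetical caller: the emitted vrsqrte helpers address the lookup table
// through these, so table loads need only 32-bit displacements.
void DemoConstantLookup(X64Backend* backend) {
  void* host_ptr = backend->LookupXMMConstantAddress(XMMVRsqrteTableStart);
  uint32_t addr32 = backend->LookupXMMConstantAddress32(XMMVRsqrteTableStart);
  assert(reinterpret_cast<uintptr_t>(host_ptr) == addr32);  // truncation is lossless
}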

View File

@ -982,6 +982,16 @@ static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,
return result;
}
static inline vec128_t v128_setr_words(uint32_t v0, uint32_t v1, uint32_t v2,
uint32_t v3) {
vec128_t result;
result.u32[0] = v0;
result.u32[1] = v1;
result.u32[2] = v2;
result.u32[3] = v3;
return result;
}
static const vec128_t xmm_consts[] = {
/* XMMZero */ vec128f(0.0f),
/* XMMByteSwapMask */
@ -1151,7 +1161,19 @@ static const vec128_t xmm_consts[] = {
vec128b((uint8_t)0x83), /*XMMVSRShlByteshuf*/
v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
// XMMVSRMask
vec128b(1)};
vec128b(1),
//XMMVRsqrteTableStart
v128_setr_words(0x568B4FD, 0x4F3AF97, 0x48DAAA5, 0x435A618),
v128_setr_words(0x3E7A1E4, 0x3A29DFE, 0x3659A5C, 0x32E96F8),
v128_setr_words(0x2FC93CA, 0x2D090CE, 0x2A88DFE, 0x2838B57),
v128_setr_words(0x26188D4, 0x2438673, 0x2268431, 0x20B820B),
v128_setr_words(0x3D27FFA, 0x3807C29, 0x33878AA, 0x2F97572),
v128_setr_words(0x2C27279, 0x2926FB7, 0x2666D26, 0x23F6AC0),
v128_setr_words(0x21D6881, 0x1FD6665, 0x1E16468, 0x1C76287),
v128_setr_words(0x1AF60C1, 0x1995F12, 0x1855D79, 0x1735BF4),
//XMMVRsqrteTableBase
vec128i(0) //filled in later
};
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
for (auto& vec : xmm_consts) {
@ -1223,7 +1245,17 @@ uintptr_t X64Emitter::PlaceConstData() {
// The pointer must not be greater than 31 bits.
assert_zero(reinterpret_cast<uintptr_t>(mem) & ~0x7FFFFFFF);
std::memcpy(mem, xmm_consts, sizeof(xmm_consts));
/*
set each 32-bit element of the constant XMMVRsqrteTableBase to the address of
the start of the constant XMMVRsqrteTableStart
*/
vec128_t* deferred_constants = reinterpret_cast<vec128_t*>(mem);
vec128_t* vrsqrte_table_base = &deferred_constants[XMMVRsqrteTableBase];
uint32_t ptr_to_vrsqrte_table32 = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(&deferred_constants[XMMVRsqrteTableStart]));
*vrsqrte_table_base = vec128i(ptr_to_vrsqrte_table32);
memory::Protect(mem, kConstDataSize, memory::PageAccess::kReadOnly, nullptr);
return reinterpret_cast<uintptr_t>(mem);
@ -1237,8 +1269,9 @@ void X64Emitter::FreeConstData(uintptr_t data) {
Xbyak::Address X64Emitter::GetXmmConstPtr(XmmConst id) {
// Load through fixed constant table setup by PlaceConstData.
// It's important that the pointer is not signed, as it will be sign-extended.
return ptr[reinterpret_cast<void*>(backend_->emitter_data() +
sizeof(vec128_t) * id)];
void* emitter_data_ptr = backend_->LookupXMMConstantAddress(static_cast<unsigned>(id));
xenia_assert(reinterpret_cast<uintptr_t>(emitter_data_ptr) < (1ULL << 31));//must not have signbit set
return ptr[emitter_data_ptr];
}
// Implies possible StashXmm(0, ...)!
void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
@ -1634,9 +1667,9 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
} else { // even if already set, we still need to update flags to reflect
// our mode
if (new_mode == MXCSRMode::Fpu) {
btr(GetBackendFlagsPtr(), 0);
btr(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
} else if (new_mode == MXCSRMode::Vmx) {
bts(GetBackendFlagsPtr(), 0);
bts(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
} else {
assert_unhandled_case(new_mode);
}
@ -1646,11 +1679,11 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
if (!already_set) {
if (new_mode == MXCSRMode::Fpu) {
LoadFpuMxcsrDirect();
btr(GetBackendFlagsPtr(), 0);
btr(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
return true;
} else if (new_mode == MXCSRMode::Vmx) {
LoadVmxMxcsrDirect();
bts(GetBackendFlagsPtr(), 0);
bts(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
return true;
} else {
assert_unhandled_case(new_mode);
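To make the deferred-constant patch concrete, a small verification sketch; it assumes vec128i(x) broadcasts x to all four lanes, as the "filled in later" placeholder suggests (hypothetical helper, not in the commit):

#include <cassert>
#include <cstdint>

// Hypothetical post-PlaceConstData check: every lane of XMMVRsqrteTableBase
// should now hold the 32-bit host address of the first table entry.
void CheckVRsqrteTablePatch(const vec128_t* deferred_constants) {
  uint32_t expected = static_cast<uint32_t>(
      reinterpret_cast<uintptr_t>(&deferred_constants[XMMVRsqrteTableStart]));
  for (int lane = 0; lane < 4; ++lane) {
    assert(deferred_constants[XMMVRsqrteTableBase].u32[lane] == expected);
  }
}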

View File

@ -174,7 +174,9 @@ enum XmmConst {
XMMSTVLShuffle,
XMMSTVRSwapMask, // swapwordmask with bit 7 set
XMMVSRShlByteshuf,
XMMVSRMask
XMMVSRMask,
XMMVRsqrteTableStart,
XMMVRsqrteTableBase = XMMVRsqrteTableStart + (32 / 4), //32 4-byte elements in table, 4 4-byte elements fit in each xmm
};
using amdfx::xopcompare_e;
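As a sketch, the layout relationship encoded by the (32 / 4) above can also be stated as a compile-time check (not part of the commit):

// 32 dword entries at 4 dwords per 16-byte constant slot -> 8 slots, so the
// base-address constant lands 8 entries after the table start.
static_assert(XMMVRsqrteTableBase == XMMVRsqrteTableStart + 8,
              "vrsqrte table occupies eight 16-byte constant slots");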
@ -308,7 +310,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
size_t stack_size() const { return stack_size_; }
SimdDomain DeduceSimdDomain(const hir::Value* for_value);
void ForgetMxcsrMode() { mxcsr_mode_ = MXCSRMode::Unknown; }
/*
returns true if had to load mxcsr. DOT_PRODUCT can use this to skip

View File

@ -3376,17 +3376,28 @@ struct SET_NJM_I8 : Sequence<SET_NJM_I8, I<OPCODE_SET_NJM, VoidOp, I8Op>> {
auto addr_vmx = e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_vmx));
addr_vmx.setBit(32);
auto flags_ptr = e.GetBackendFlagsPtr();
if (i.src1.is_constant) {
if (i.src1.constant() == 0) {
// turn off daz/flush2z
e.mov(addr_vmx, _MM_MASK_MASK);
e.btr(flags_ptr, kX64BackendNJMOn);
} else {
e.mov(addr_vmx, DEFAULT_VMX_MXCSR);
e.bts(flags_ptr, kX64BackendNJMOn);
}
} else {
e.mov(e.eax, flags_ptr);
e.mov(e.edx, 1U << kX64BackendNJMOn);
e.mov(e.ecx, e.edx);
e.not_(e.ecx);
e.and_(e.ecx, e.eax);
e.or_(e.edx, e.eax);
e.test(i.src1, i.src1);
e.cmove(e.edx, e.ecx);
e.mov(flags_ptr, e.edx);
e.mov(e.edx, DEFAULT_VMX_MXCSR);
e.mov(e.eax, _MM_MASK_MASK);
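The non-constant path above flips the NJM backend flag without branching; a hedged C++ sketch of the equivalent logic (names are illustrative):

#include <cstdint>

// Illustrative only: the cmove-based update the non-constant path emits.
uint32_t UpdateNJMFlag(uint32_t flags, uint8_t src1) {
  uint32_t bit = 1u << kX64BackendNJMOn;
  uint32_t flags_set = flags | bit;       // or_(edx, eax)
  uint32_t flags_clear = flags & ~bit;    // not_(ecx); and_(ecx, eax)
  return src1 ? flags_set : flags_clear;  // test(src1, src1) + cmove(edx, ecx)
}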

View File

@ -2123,12 +2123,19 @@ struct RSQRT_V128 : Sequence<RSQRT_V128, I<OPCODE_RSQRT, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3);
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
e.vrsqrt14ps(i.dest, src1);
/*
the vast majority of inputs to vrsqrte come from vmsum3 or vmsum4 as part
of a vector normalization sequence. in fact, it's difficult to find uses of
vrsqrte in titles whose inputs do not come from vmsum.
*/
if (i.src1.value && i.src1.value->AllFloatVectorLanesSameValue()) {
e.vmovss(e.xmm0, src1);
e.call(e.backend()->vrsqrtefp_scalar_helper);
e.vshufps(i.dest, e.xmm0, e.xmm0, 0);
} else {
e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMOne));
e.vsqrtps(e.xmm1, src1);
e.vdivps(i.dest, e.xmm0, e.xmm1);
e.vmovaps(e.xmm0, src1);
e.call(e.backend()->vrsqrtefp_vector_helper);
e.vmovaps(i.dest, e.xmm0);
}
}
};
@ -3183,16 +3190,37 @@ struct SET_ROUNDING_MODE_I32
// removed the AND with 7 and hoisted that AND into the InstrEmit_'s that
// generate OPCODE_SET_ROUNDING_MODE so that it can be constant folded and
// backends don't have to worry about it
auto flags_ptr = e.GetBackendFlagsPtr();
if (i.src1.is_constant) {
e.mov(e.eax, mxcsr_table[i.src1.constant()]);
unsigned constant_value = i.src1.constant();
e.mov(e.eax, mxcsr_table[constant_value]);
if (constant_value & 4) {
e.or_(flags_ptr, 1U << kX64BackendNonIEEEMode);
} else {
e.btr(flags_ptr, kX64BackendNonIEEEMode);
}
e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH], e.eax);
e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.eax);
e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH]);
} else {
e.mov(e.ecx, i.src1);
//could use andnot here, but this is a very infrequently used opcode
e.mov(e.eax, 1U << kX64BackendNonIEEEMode);
e.mov(e.edx, e.eax);
e.not_(e.edx);
e.mov(e.ecx, flags_ptr);
//edx = flags w/ non ieee cleared
e.and_(e.edx, e.ecx);
//eax = flags w/ non ieee set
e.or_(e.eax, e.ecx);
e.bt(i.src1, 2);
e.mov(e.ecx, i.src1);
e.cmovc(e.edx, e.eax);
e.mov(e.rax, uintptr_t(mxcsr_table));
e.mov(flags_ptr, e.edx);
e.mov(e.edx, e.ptr[e.rax + e.rcx * 4]);
// this store was previously missing; it keeps mxcsr_fpu in sync in the non-constant path
e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.edx);
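For orientation, a hedged C++ sketch of the runtime dispatch that RSQRT_V128 plus EmitVectorVRsqrteHelper amount to; scalar_vrsqrte is a stand-in name for the scalar helper, and the real vector helper additionally fast-paths vectors whose low three lanes are +0 by filling them with +infinity:

// Stand-in for the scalar helper's bit-exact result (name is illustrative).
float scalar_vrsqrte(float x);

// Sketch of the per-lane semantics of the emitted vrsqrtefp path.
vec128_t vrsqrtefp_sketch(const vec128_t& v) {
  vec128_t r;
  if (v.u32[0] == v.u32[1] && v.u32[1] == v.u32[2] && v.u32[2] == v.u32[3]) {
    // all lanes identical (e.g. fed by vmsum): one scalar call, then broadcast
    float s = scalar_vrsqrte(v.f32[0]);
    for (int i = 0; i < 4; ++i) r.f32[i] = s;
  } else {
    // general case: the vector helper loops over the four lanes with the scalar helper
    for (int i = 0; i < 4; ++i) r.f32[i] = scalar_vrsqrte(v.f32[i]);
  }
  return r;
}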

View File

@ -1370,6 +1370,38 @@ bool SimplificationPass::SimplifyVectorOps(hir::Instr* i,
}
}
}
/*
splatting a 32-bit value extracted from a vector whose four 32-bit lanes are all
the same should be eliminated; instead use the vector it was extracted from,
which will be identical.
we have seen this happen: some games vmsum and then splat the low float to all
four floats, even though it is already there
*/
if (opc == OPCODE_SPLAT) {
if (i->dest->type == VEC128_TYPE) {
auto splatted_value = i->src1.value;
auto splat_type = splatted_value->type;
if (splat_type == FLOAT32_TYPE || splat_type == INT32_TYPE) {
//it's a splat of a four-byte value, check the definition
auto splat_input_definition = splatted_value->GetDefSkipAssigns();
if (splat_input_definition) {
auto defining_opcode = splat_input_definition->GetOpcodeNum();
if (defining_opcode == OPCODE_EXTRACT) {
auto value_extracted_from = splat_input_definition->src1.value;
if (value_extracted_from->type == VEC128_TYPE) {
xenia_assert(splat_input_definition->dest->type == splat_type);
if (value_extracted_from->AllFloatVectorLanesSameValue()) {
i->Replace(&OPCODE_ASSIGN_info,0);
i->set_src1(value_extracted_from);
return true;
}
}
}
}
}
}
}
return false;
}
bool SimplificationPass::SimplifyVectorOps(hir::HIRBuilder* builder) {
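The identity the new SPLAT rewrite relies on, shown as plain C++ rather than HIR (a sketch using the codebase's vec128_t, not part of the pass):

#include <cstring>

// When every 32-bit lane of v is already equal, splat(extract(v, 0)) rebuilds
// exactly v, so the splat can be replaced by an assign of v itself.
bool SplatOfExtractIsRedundant(const vec128_t& v) {
  vec128_t rebuilt;
  for (int i = 0; i < 4; ++i) rebuilt.u32[i] = v.u32[0];  // splat of lane 0
  return std::memcmp(&rebuilt, &v, sizeof(v)) == 0;       // true when lanes match
}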

View File

@ -1805,6 +1805,86 @@ bool Value::AllUsesByOneInsn() const {
}
return true;
}
bool Value::AllFloatVectorLanesSameValue(const hir::Value* for_value,
uint32_t current_depth) {
// limit recursion, otherwise this function will slow down emission
if (current_depth == 16) {
return false;
}
using namespace hir;
hir::Instr* definition;
Opcode definition_opcode_number;
re_enter:
definition = for_value->def;
if (!definition) {
xenia_assert(for_value->IsConstant());
auto&& constant_value = for_value->constant.v128;
for (unsigned constant_lane_index = 1; constant_lane_index < 4; ++constant_lane_index) {
if (constant_value.u32[0] != constant_value.u32[constant_lane_index]) {
return false;
}
}
return true;
}
definition_opcode_number = definition->GetOpcodeNum();
if (definition_opcode_number == OPCODE_ASSIGN) {
for_value = definition->src1.value;
goto re_enter;
}
if (definition_opcode_number == OPCODE_VECTOR_DENORMFLUSH) {
for_value = definition->src1.value;
goto re_enter;
}
/*
vmsum propagates its result to every lane
*/
if (definition_opcode_number == OPCODE_DOT_PRODUCT_4 ||
definition_opcode_number == OPCODE_DOT_PRODUCT_3) {
return true;
}
//if it is a splat of a 32-bit value type, return true
//technically a splat of int16 or int8 would also produce the same "float" in all lanes,
//but I think it's best to keep this function focused specifically on float data
if (definition_opcode_number == OPCODE_SPLAT) {
if (definition->dest->type == VEC128_TYPE) {
auto splat_src_value_type = definition->src1.value->type;
if (splat_src_value_type == INT32_TYPE ||
splat_src_value_type == FLOAT32_TYPE) {
return true;
}
}
}
switch (definition_opcode_number) {
//all of these opcodes produce the same value for the same input
case OPCODE_RSQRT:
case OPCODE_RECIP:
case OPCODE_POW2:
case OPCODE_LOG2:
for_value = definition->src1.value;
goto re_enter;
//binary opcodes
case OPCODE_ADD:
case OPCODE_SUB:
case OPCODE_MUL:
if (!AllFloatVectorLanesSameValue(definition->src1.value,
current_depth + 1)) {
return false;
}
for_value = definition->src2.value;
goto re_enter;
default:
break;
}
return false;
}
} // namespace hir
} // namespace cpu
} // namespace xe
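Why the ADD/SUB/MUL cases only require both operands to have the property: an element-wise op on two all-equal-lane vectors yields another all-equal-lane vector. A small illustrative sketch (not from the commit):

// Multiply two vectors whose lanes are each all-equal: every output lane gets
// the same a * b, so the all-lanes-equal property is preserved.
vec128_t MulOfAllEqualLanes(float a, float b) {
  vec128_t x, y, r;
  for (int i = 0; i < 4; ++i) {
    x.f32[i] = a;
    y.f32[i] = b;
    r.f32[i] = x.f32[i] * y.f32[i];
  }
  return r;
}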

View File

@ -618,8 +618,16 @@ class Value {
bool MaybeFloaty() const {
return type == FLOAT32_TYPE || type == FLOAT64_TYPE || type == VEC128_TYPE;
}
bool AllFloatVectorLanesSameValue() const {
return Value::AllFloatVectorLanesSameValue(this);
}
private:
/*
returns true if for_value (which must be VEC128_TYPE) has the same value in
every float lane
*/
static bool AllFloatVectorLanesSameValue(const hir::Value* for_value,
uint32_t current_depth = 0);
static bool CompareInt8(Opcode opcode, Value* a, Value* b);
static bool CompareInt16(Opcode opcode, Value* a, Value* b);
static bool CompareInt32(Opcode opcode, Value* a, Value* b);