Merge pull request #58 from chrisps/canary_experimental

[CPU] VKPKX Implementation, miscellaneous fixes
2022-08-08 07:54:26 +02:00 · 2022-08-08 07:54:26 +02:00 · 3ac99e0d7d
parent f45e9e5e9a 324a8eb818
commit 3ac99e0d7d
19 changed files with 512 additions and 477 deletions
--- a/src/xenia/cpu/backend/backend.h
+++ b/src/xenia/cpu/backend/backend.h
@ -67,6 +67,7 @@ class Backend {
  // up until the start of ctx may be used by the backend to store whatever data
  // they want
  virtual void InitializeBackendContext(void* ctx) {}
+  // Hook for applying guest rounding mode `mode` to the guest context `ctx`.
+  // Does nothing by default; the x64 backend overrides it.
+  virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) {}

 protected:
  Processor* processor_ = nullptr;
--- a/src/xenia/cpu/backend/x64/x64_backend.cc
+++ b/src/xenia/cpu/backend/x64/x64_backend.cc
@ -689,8 +689,7 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
 #endif
 }
 void X64Backend::InitializeBackendContext(void* ctx) {
-  X64BackendContext* bctx = reinterpret_cast<X64BackendContext*>(
-      reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));
+  X64BackendContext* bctx = BackendContextForGuestContext(ctx);
  bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
  bctx->mxcsr_fpu =
      DEFAULT_FPU_MXCSR;  // idk if this is right, check on rgh what the
@ -700,6 +699,18 @@ void X64Backend::InitializeBackendContext(void* ctx) {
  // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
  bctx->Ox1000 = 0x1000;
 }
+// Host MXCSR values, indexed by the low three bits of the guest rounding mode
+// (see X64Backend::SetGuestRoundingMode). Entries 4-7 are entries 0-3 with
+// bit 15 (0x8000) additionally set.
+// NOTE(review): in the Intel MXCSR layout bits 13-14 are the rounding control
+// and bit 15 is flush-to-zero, so index bit 2 presumably carries the guest
+// non-IEEE mode bit -- confirm against the PPC FPSCR definition.
+const uint32_t mxcsr_table[8] = {
+    0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80,
+};
+
+// Applies guest rounding mode `mode` for the guest context `ctx`. Only the low
+// three bits of `mode` are used. The matching mxcsr_table entry is written to
+// the MXCSR of the calling host thread immediately and cached in the backend
+// context as the FPU MXCSR; the same three bits are stored into fpscr.bits.rn.
+// NOTE(review): bctx flags and mxcsr_vmx are not touched here -- confirm this
+// is correct when the thread is currently flagged as being in VMX MXCSR mode.
+// NOTE(review): a 3-bit value is assigned to fpscr.bits.rn -- confirm the
+// field is wide enough, otherwise bit 2 is silently dropped.
+void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) {
+  const uint32_t rounding_index = mode & 7;
+  const uint32_t host_mxcsr = mxcsr_table[rounding_index];
+
+  _mm_setcsr(host_mxcsr);
+
+  X64BackendContext* backend_context = BackendContextForGuestContext(ctx);
+  backend_context->mxcsr_fpu = host_mxcsr;
+
+  auto* guest_context = static_cast<ppc::PPCContext*>(ctx);
+  guest_context->fpscr.bits.rn = rounding_index;
+}
 }  // namespace x64
 }  // namespace backend
 }  // namespace cpu
--- a/src/xenia/cpu/backend/x64/x64_backend.h
+++ b/src/xenia/cpu/backend/x64/x64_backend.h
@ -37,9 +37,10 @@ typedef void (*ResolveFunctionThunk)();
 // negatively index the membase reg)
 struct X64BackendContext {
  void* ResolveFunction_Ptr;  // cached pointer to resolvefunction
-  unsigned int mxcsr_fpu; //currently, the way we implement rounding mode affects both vmx and the fpu
+  unsigned int mxcsr_fpu;     // currently, the way we implement rounding mode
+                              // affects both vmx and the fpu
  unsigned int mxcsr_vmx;
-  unsigned int flags; //bit 0 = 0 if mxcsr is fpu, else it is vmx
+  unsigned int flags;   // bit 0 = 0 if mxcsr is fpu, else it is vmx
  unsigned int Ox1000;  // constant 0x1000 so we can shrink each tail emitted
                        // add of it by... 2 bytes lol
 };
@ -48,7 +49,7 @@ constexpr unsigned int DEFAULT_VMX_MXCSR =
    0x0040 | (_MM_MASK_MASK);  // default rounding mode for vmx

 constexpr unsigned int DEFAULT_FPU_MXCSR = 0x1F80;
-
+extern const uint32_t mxcsr_table[8];
 class X64Backend : public Backend {
 public:
  static const uint32_t kForceReturnAddress = 0x9FFF0000u;
@ -85,6 +86,12 @@ class X64Backend : public Backend {
  void UninstallBreakpoint(Breakpoint* breakpoint) override;
  virtual void InitializeBackendContext(void* ctx) override;

+  // Returns the backend context belonging to guest context `ctx`, i.e. the
+  // address exactly sizeof(X64BackendContext) bytes below `ctx`.
+  X64BackendContext* BackendContextForGuestContext(void* ctx) {
+    const intptr_t guest_address = reinterpret_cast<intptr_t>(ctx);
+    const intptr_t backend_address = guest_address - sizeof(X64BackendContext);
+    return reinterpret_cast<X64BackendContext*>(backend_address);
+  }
+  virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;
+
 private:
  static bool ExceptionCallbackThunk(Exception* ex, void* data);
  bool ExceptionCallback(Exception* ex);
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@ -50,6 +50,13 @@ DEFINE_bool(resolve_rel32_guest_calls, true,
            "Experimental optimization, directly call already resolved "
            "functions via x86 rel32 call/jmp",
            "CPU");
+
+// Off by default. When set, X64Emitter::ChangeMxcsrMode returns false right
+// away and emits no MXCSR mode handling at all.
+DEFINE_bool(enable_incorrect_roundingmode_behavior, false,
+            "Disables the FPU/VMX MXCSR sharing workaround, potentially "
+            "causing incorrect rounding behavior and denormal handling in VMX "
+            "code. The workaround may cause reduced CPU performance but is a "
+            "more accurate emulation",
+            "x64");
 namespace xe {
 namespace cpu {
 namespace backend {
@ -1374,13 +1381,13 @@ Xbyak::Label& X64Emitter::NewCachedLabel() {
  return *tmp;
 }

-template<bool switching_to_fpu>
+template <bool switching_to_fpu>
 static void ChangeMxcsrModeDynamicHelper(X64Emitter& e) {
  auto flags = e.GetBackendFlagsPtr();
  if (switching_to_fpu) {
    e.btr(flags, 0);  // bit 0 set to 0 = is fpu mode
  } else {
-    e.bts(flags, 0); // bit 0 set to 1 = is vmx mode
+    e.bts(flags, 0);  // bit 0 set to 1 = is vmx mode
  }
  Xbyak::Label& come_back = e.NewCachedLabel();

@ -1391,20 +1398,24 @@ static void ChangeMxcsrModeDynamicHelper(X64Emitter& e) {
          e.LoadFpuMxcsrDirect();
        } else {
          e.LoadVmxMxcsrDirect();
-		}
+        }
        e.jmp(come_back, X64Emitter::T_NEAR);
      });
  if (switching_to_fpu) {
    e.jc(reload_bailout,
         X64Emitter::T_NEAR);  // if carry flag was set, we were VMX mxcsr mode.
  } else {
-    e.jnc(reload_bailout,
-         X64Emitter::T_NEAR);  // if carry flag was set, we were VMX mxcsr mode.
+    e.jnc(
+        reload_bailout,
+        X64Emitter::T_NEAR);  // if carry flag was set, we were VMX mxcsr mode.
  }
  e.L(come_back);
 }

 bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
+  if (cvars::enable_incorrect_roundingmode_behavior) {
+    return false;  // no MXCSR mode handling!
+  }
  if (new_mode == mxcsr_mode_) {
    return false;
  }
@ -1420,21 +1431,21 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
        ChangeMxcsrModeDynamicHelper<false>(*this);
      } else {
        assert_unhandled_case(new_mode);
-	  }
-    } else { //even if already set, we still need to update flags to reflect our mode
+      }
+    } else {  // even if already set, we still need to update flags to reflect
+              // our mode
      if (new_mode == MXCSRMode::Fpu) {
        btr(GetBackendFlagsPtr(), 0);
      } else if (new_mode == MXCSRMode::Vmx) {
        bts(GetBackendFlagsPtr(), 0);
      } else {
        assert_unhandled_case(new_mode);
-      }	
-	}
+      }
+    }
  } else {
    mxcsr_mode_ = new_mode;
    if (!already_set) {
      if (new_mode == MXCSRMode::Fpu) {
-		  
        LoadFpuMxcsrDirect();
        btr(GetBackendFlagsPtr(), 0);
        return true;
--- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc
@ -23,6 +23,10 @@ DEFINE_bool(
    elide_e0_check, false,
    "Eliminate e0 check on some memory accesses, like to r13(tls) or r1(sp)",
    "CPU");
+// Off by default. GetLoadModStoreContext reports "no match" unless this is
+// set, which keeps the read-modify-write merging disabled.
+DEFINE_bool(enable_rmw_context_merging, false,
+            "Permit merging read-modify-write HIR instr sequences together "
+            "into x86 instructions that use a memory operand.",
+            "x64");

 namespace xe {
 namespace cpu {
@ -88,6 +92,9 @@ struct LoadModStoreContext : public LoadModStore {
 };
 static bool GetLoadModStoreContext(const hir::Instr* loadinsn,
                                   LoadModStoreContext* out) {
+  if (!cvars::enable_rmw_context_merging) {
+    return false;
+  }
  if (!GetLoadModStore(loadinsn, out)) {
    return false;
  }
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
--- a/src/xenia/cpu/backend/x64/x64_sequences.h
+++ b/src/xenia/cpu/backend/x64/x64_sequences.h
@ -10,11 +10,13 @@
 #ifndef XENIA_CPU_BACKEND_X64_X64_SEQUENCES_H_
 #define XENIA_CPU_BACKEND_X64_X64_SEQUENCES_H_

+#include "xenia/base/logging.h"
 #include "xenia/cpu/hir/instr.h"

 #include <unordered_map>
-#define assert_impossible_sequence(name) \
-  assert_always("impossible sequence hit" #name);
+// Asserts and logs that a sequence which should never be selected was hit.
+// Wrapped in do/while(0) so the macro expands to a single statement: without
+// it, an unbraced `if (cond) assert_impossible_sequence(x);` would guard only
+// the assert and always run the log call. Callers supply the trailing `;`.
+#define assert_impossible_sequence(name)              \
+  do {                                                \
+    assert_always("impossible sequence hit: " #name); \
+    XELOGE("impossible sequence hit: {}", #name);     \
+  } while (0)

 namespace xe {
 namespace cpu {
--- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
+++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
@ -20,7 +20,9 @@
 DEFINE_bool(inline_mmio_access, true, "Inline constant MMIO loads and stores.",
            "CPU");

-DEFINE_bool(permit_float_constant_evaluation, false, "Allow float constant evaluation, may produce incorrect results and break games math",
+DEFINE_bool(permit_float_constant_evaluation, false,
+            "Allow float constant evaluation, may produce incorrect results "
+            "and break games math",
            "CPU");

 namespace xe {
@ -85,8 +87,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
      if (i->dest) {
        might_be_floatop |= i->dest->MaybeFloaty();
      }
-    
-	  bool should_skip_because_of_float =
+
+      bool should_skip_because_of_float =
          might_be_floatop && !cvars::permit_float_constant_evaluation;

      auto v = i->dest;
@ -557,6 +559,12 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
              v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
              i->Remove();
              result = true;
+            } else if (!i->src2.value->MaybeFloaty() &&
+                       i->src2.value->IsConstantZero()) {
+              // division by 0 == 0 every time,
+              v->set_zero(i->src2.value->type);
+              i->Remove();
+              result = true;
            } else if (i->src2.value->IsConstant()) {
              // Division by one = no-op.
              Value* src1 = i->src1.value;
@ -672,29 +680,33 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          }
          break;
        case OPCODE_SHL:
-          if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
-            v->set_from(i->src1.value);
-            v->Shl(i->src2.value);
-            i->Remove();
-            result = true;
-          } else if (i->src2.value->IsConstantZero()) {
-            auto src1 = i->src1.value;
-            i->Replace(&OPCODE_ASSIGN_info, 0);
-            i->set_src1(src1);
-            result = true;
+          if (i->dest->type != VEC128_TYPE) {
+            if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+              v->set_from(i->src1.value);
+              v->Shl(i->src2.value);
+              i->Remove();
+              result = true;
+            } else if (i->src2.value->IsConstantZero()) {
+              auto src1 = i->src1.value;
+              i->Replace(&OPCODE_ASSIGN_info, 0);
+              i->set_src1(src1);
+              result = true;
+            }
          }
          break;
        case OPCODE_SHR:
-          if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
-            v->set_from(i->src1.value);
-            v->Shr(i->src2.value);
-            i->Remove();
-            result = true;
-          } else if (i->src2.value->IsConstantZero()) {
-            auto src1 = i->src1.value;
-            i->Replace(&OPCODE_ASSIGN_info, 0);
-            i->set_src1(src1);
-            result = true;
+          if (i->dest->type != VEC128_TYPE) {
+            if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+              v->set_from(i->src1.value);
+              v->Shr(i->src2.value);
+              i->Remove();
+              result = true;
+            } else if (i->src2.value->IsConstantZero()) {
+              auto src1 = i->src1.value;
+              i->Replace(&OPCODE_ASSIGN_info, 0);
+              i->set_src1(src1);
+              result = true;
+            }
          }
          break;
        case OPCODE_SHA:
@ -729,7 +741,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            result = true;
          }
          break;
-
+#if 1
        case OPCODE_PERMUTE: {
          if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
              i->src3.value->IsConstant() &&
@ -756,6 +768,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {

          break;
        }
+#endif
        case OPCODE_INSERT:
          if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
              i->src3.value->IsConstant()) {
--- a/src/xenia/cpu/compiler/passes/simplification_pass.cc
+++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc
@ -83,6 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
    iter_result |= SimplifyBitArith(builder);
    iter_result |= EliminateConversions(builder);
    iter_result |= SimplifyAssignments(builder);
+    iter_result |= SimplifyBasicArith(builder);

    result |= iter_result;
  } while (iter_result);
@ -1228,6 +1229,91 @@ Value* SimplificationPass::CheckValue(Value* value, bool& result) {
  return value;
 }

+// Rewrites (x << c) + x into x * ((1 << c) + 1) for a constant shift count c,
+// e.g. (x << 1) + x == x * 3. Returns true when `i` was rewritten into a
+// multiply, false when the pattern does not apply.
+bool SimplificationPass::SimplifyAddArith(hir::Instr* i,
+                                          hir::HIRBuilder* builder) {
+  auto [shlinsn, addend] =
+      i->BinaryValueArrangeByDefiningOpcode(&OPCODE_SHL_info);
+  if (!shlinsn) {
+    return false;
+  }
+  Instr* shift_insn = shlinsn->def;
+
+  Value* shift = shift_insn->src2.value;
+
+  // if not a constant shift, we cant combine to a multiply
+  if (!shift->IsConstant()) {
+    return false;
+  }
+
+  Value* shouldbeaddend = shift_insn->src1.value;
+
+  if (!shouldbeaddend->IsEqual(addend)) {
+    return false;
+  }
+
+  // Only plain integer operands are handled, and only shift counts below the
+  // operand width: 1ULL << n is undefined for n >= 64, and for counts at or
+  // above the operand width the truncated multiplier need not match what the
+  // shift itself produces. Leave those instructions untouched.
+  unsigned operand_bits = 0;
+  switch (shouldbeaddend->type) {
+    case INT8_TYPE:
+      operand_bits = 8;
+      break;
+    case INT16_TYPE:
+      operand_bits = 16;
+      break;
+    case INT32_TYPE:
+      operand_bits = 32;
+      break;
+    case INT64_TYPE:
+      operand_bits = 64;
+      break;
+    default:
+      return false;
+  }
+  const unsigned shift_count = shift->constant.u8;
+  if (shift_count >= operand_bits) {
+    return false;
+  }
+
+  const uint64_t multiplier = (1ULL << shift_count) + 1;
+
+  // Replace() is given the new opcode before the sources are set again, so
+  // hold on to the operand first.
+  hir::Value* oldvalue = shouldbeaddend;
+
+  i->Replace(&OPCODE_MUL_info, ARITHMETIC_UNSIGNED);
+  i->set_src1(oldvalue);
+
+  // this sequence needs to be broken out into some kind of LoadConstant(type,
+  // raw_value) method of hirbuilder
+  auto constmul = builder->AllocValue(oldvalue->type);
+  // could cause problems on big endian targets...
+  constmul->flags |= VALUE_IS_CONSTANT;
+  constmul->constant.u64 = multiplier;
+
+  i->set_src2(constmul);
+
+  return true;
+}
+
+// Placeholder for subtraction rewrites: nothing is implemented yet, so this
+// always reports that `i` was left unchanged.
+bool SimplificationPass::SimplifySubArith(hir::Instr* i,
+                                          hir::HIRBuilder* builder) {
+  return false;
+}
+// Routes a single instruction to the add/sub simplifiers. Instructions with no
+// destination, or whose destination reports MaybeFloaty(), are left alone.
+// Returns true when the instruction was rewritten.
+bool SimplificationPass::SimplifyBasicArith(hir::Instr* i,
+                                            hir::HIRBuilder* builder) {
+  if (!i->dest || i->dest->MaybeFloaty()) {
+    return false;
+  }
+
+  const hir::Opcode opcode_num = i->GetOpcodeNum();
+  if (opcode_num == OPCODE_ADD) {
+    return SimplifyAddArith(i, builder);
+  }
+  if (opcode_num == OPCODE_SUB) {
+    return SimplifySubArith(i, builder);
+  }
+  return false;
+}
+// Runs the per-instruction arithmetic simplifier over every instruction of
+// every block, in order. Returns true if any instruction was rewritten.
+bool SimplificationPass::SimplifyBasicArith(hir::HIRBuilder* builder) {
+  bool changed = false;
+  for (auto block = builder->first_block(); block; block = block->next) {
+    for (auto instr = block->instr_head; instr; instr = instr->next) {
+      changed |= SimplifyBasicArith(instr, builder);
+    }
+  }
+  return changed;
+}
 }  // namespace passes
 }  // namespace compiler
 }  // namespace cpu
--- a/src/xenia/cpu/compiler/passes/simplification_pass.h
+++ b/src/xenia/cpu/compiler/passes/simplification_pass.h
@ -32,6 +32,13 @@ class SimplificationPass : public ConditionalGroupSubpass {
  bool SimplifyAssignments(hir::HIRBuilder* builder);
  hir::Value* CheckValue(hir::Value* value, bool& result);
  bool SimplifyBitArith(hir::HIRBuilder* builder);
+
+  // handles simple multiplication/addition rules
+  bool SimplifyBasicArith(hir::HIRBuilder* builder);
+  bool SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder);
+
+  bool SimplifyAddArith(hir::Instr* i, hir::HIRBuilder* builder);
+  bool SimplifySubArith(hir::Instr* i, hir::HIRBuilder* builder);
  // handle either or or xor with 0
  bool CheckOrXorZero(hir::Instr* i);
  bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);
--- a/src/xenia/cpu/hir/instr.h
+++ b/src/xenia/cpu/hir/instr.h
@ -79,6 +79,10 @@ class Instr {
  void MoveBefore(Instr* other);
  void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
  void Remove();
+  // Returns the OpcodeInfo pointer stored in this instruction.
+  const OpcodeInfo* GetOpcodeInfo() const { return opcode; }
+  // Returns the `num` field of this instruction's OpcodeInfo. The pointer is
+  // dereferenced without a null check:
+  // if opcode is null, we have bigger problems
+  Opcode GetOpcodeNum() const { return GetOpcodeInfo()->num; }
+
  template <typename TPredicate>
  std::pair<Value*, Value*> BinaryValueArrangeByPredicateExclusive(
      TPredicate&& pred) {
@ -86,12 +90,13 @@ class Instr {
    auto src2_value = src2.value;
    if (!src1_value || !src2_value) return {nullptr, nullptr};

-    if (!opcode) return {nullptr, nullptr};  // impossible!
+    if (!GetOpcodeInfo()) return {nullptr, nullptr};  // impossible!

    // check if binary opcode taking two values. we dont care if the dest is a
    // value

-    if (!IsOpcodeBinaryValue(opcode->signature)) return {nullptr, nullptr};
+    if (!IsOpcodeBinaryValue(GetOpcodeInfo()->signature))
+      return {nullptr, nullptr};

    if (pred(src1_value)) {
      if (pred(src2_value)) {
@ -119,7 +124,7 @@ if both are constant, return nullptr, nullptr
  std::pair<Value*, Value*> BinaryValueArrangeByDefiningOpcode(
      const OpcodeInfo* op_ptr) {
    return BinaryValueArrangeByPredicateExclusive([op_ptr](Value* value) {
-      return value->def && value->def->opcode == op_ptr;
+      return value->def && value->def->GetOpcodeInfo() == op_ptr;
    });
  }

@ -143,7 +148,7 @@ if both are constant, return nullptr, nullptr
 */
  template <typename TCallable>
  void VisitValueOperands(TCallable&& call_for_values) {
-    uint32_t signature = opcode->signature;
+    uint32_t signature = GetOpcodeInfo()->signature;

    OpcodeSignatureType t_dest, t_src1, t_src2, t_src3;

--- a/src/xenia/cpu/hir/value.cc
+++ b/src/xenia/cpu/hir/value.cc
@ -199,7 +199,7 @@ void Value::Truncate(TypeName target_type) {
      return;
  }
 }
-//WARNING: this does not handle rounding flags at all!
+// WARNING: this does not handle rounding flags at all!
 void Value::Convert(TypeName target_type, RoundMode round_mode) {
  switch (type) {
    case FLOAT32_TYPE:
@ -428,35 +428,57 @@ void Value::MulHi(Value* other, bool is_unsigned) {
  }
 }

+// Division helper used for the unsigned constant-folding cases: a zero
+// denominator yields 0 instead of performing the division.
+template <typename T>
+static T PPCUDiv(T numer, T denom) {
+  return denom ? static_cast<T>(numer / denom) : static_cast<T>(0);
+}
+// Division helper used for the signed constant-folding cases. Two inputs are
+// folded to 0 instead of being divided: a zero denominator, and the overflow
+// case where numer has only the sign bit set and denom is all ones.
+template <typename T>
+static T PPCIDiv(T numer, T denom) {
+  const T sign_bit_only = static_cast<T>(1LL << ((sizeof(T) * CHAR_BIT) - 1));
+  const bool divides_by_zero = !denom;
+  const bool overflows = numer == sign_bit_only && ~denom == 0;
+  if (divides_by_zero || overflows) {
+    return 0;
+  }
+  return numer / denom;
+}
+
+// note: x64_sequences tolerates division by 0; for the integer types this now
+// does too, folding division by zero and signed overflow to 0 through
+// PPCUDiv/PPCIDiv
 void Value::Div(Value* other, bool is_unsigned) {
  assert_true(type == other->type);
  switch (type) {
    case INT8_TYPE:
      if (is_unsigned) {
-        constant.i8 /= uint8_t(other->constant.i8);
+        constant.i8 = PPCUDiv<uint8_t>(constant.i8, other->constant.i8);
      } else {
-        constant.i8 /= other->constant.i8;
+        constant.i8 = PPCIDiv<int8_t>(constant.i8, other->constant.i8);
      }
      break;
    case INT16_TYPE:
      if (is_unsigned) {
-        constant.i16 /= uint16_t(other->constant.i16);
+        constant.i16 = PPCUDiv<uint16_t>(constant.i16, other->constant.i16);
      } else {
-        constant.i16 /= other->constant.i16;
+        constant.i16 = PPCIDiv<int16_t>(constant.i16, other->constant.i16);
      }
      break;
    case INT32_TYPE:
      if (is_unsigned) {
-        constant.i32 /= uint32_t(other->constant.i32);
+        constant.i32 = PPCUDiv<uint32_t>(constant.i32, other->constant.i32);
      } else {
-        constant.i32 /= other->constant.i32;
+        constant.i32 = PPCIDiv<int32_t>(constant.i32, other->constant.i32);
      }
      break;
    case INT64_TYPE:
      if (is_unsigned) {
-        constant.i64 /= uint64_t(other->constant.i64);
+        constant.i64 = PPCUDiv<uint64_t>(constant.i64, other->constant.i64);
      } else {
-        constant.i64 /= other->constant.i64;
+        constant.i64 = PPCIDiv<int64_t>(constant.i64, other->constant.i64);
      }
      break;
    case FLOAT32_TYPE:
--- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc
@ -364,12 +364,11 @@ int InstrEmit_mfvscr(PPCHIRBuilder& f, const InstrData& i) {

 int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) {
  // is this the right format?
-	//todo: what mtvscr does with the unused bits is implementation defined, figure out what it does
-
+  // todo: what mtvscr does with the unused bits is implementation defined,
+  // figure out what it does

  Value* v = f.LoadVR(i.VX128_1.RB);

-
  Value* has_njm_value = f.Extract(v, (uint8_t)3, INT32_TYPE);

  f.SetNJM(f.IsTrue(f.And(has_njm_value, f.LoadConstantInt32(65536))));
@ -1824,9 +1823,38 @@ int InstrEmit_vsum4ubs(PPCHIRBuilder& f, const InstrData& i) {
  return 1;
 }

+// Truncates every 32-bit lane of `input` from argb8888 to 1 bit alpha, 5 bit
+// red, 5 bit green, 5 bit blue, leaving the 16-bit result in the low half of
+// the lane:
+//   result bits 15..10 <- input bits 24..19 (shift 9, mask 0xFC00)
+//   result bits  9..5  <- input bits 15..11 (shift 6, mask 0x3E0)
+//   result bits  4..0  <- input bits  7..3  (shift 3, mask 0x1F)
+static Value* vkpkx_in_low(PPCHIRBuilder& f, Value* input) {
+  // Shifts each 32-bit lane right by `bits`, then keeps only `mask`.
+  auto extract_field = [&f](Value* lanes, unsigned bits, unsigned mask) {
+    Value* shifted =
+        f.VectorShr(lanes, f.LoadConstantVec128(vec128i(bits)), INT32_TYPE);
+    return f.And(shifted, f.LoadConstantVec128(vec128i(mask)));
+  };
+
+  Value* alpha_red = extract_field(input, 9, 0xFC00);
+  Value* green = extract_field(input, 6, 0x3E0);
+  Value* blue = extract_field(input, 3, 0x1F);
+
+  Value* upper_fields = f.Or(alpha_red, green);
+  return f.Or(blue, upper_fields);
+}
+
 int InstrEmit_vpkpx(PPCHIRBuilder& f, const InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  // I compared the results of this against over a million randomly generated
+  // sets of inputs and all compared equal
+
+  Value* src1 = f.LoadVR(i.VX.VA);
+
+  Value* src2 = f.LoadVR(i.VX.VB);
+
+  Value* pck1 = vkpkx_in_low(f, src1);
+  Value* pck2 = vkpkx_in_low(f, src2);
+
+  Value* result = f.Pack(
+      pck1, pck2,
+      PACK_TYPE_16_IN_32 | PACK_TYPE_IN_UNSIGNED | PACK_TYPE_OUT_UNSIGNED);
+
+  f.StoreVR(i.VX.VD, result);
+
+  return 0;
 }

 int InstrEmit_vpkshss_(PPCHIRBuilder& f, uint32_t vd, uint32_t va,
--- a/src/xenia/cpu/ppc/ppc_emit_alu.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_alu.cc
@ -336,10 +336,14 @@ int InstrEmit_mulhwx(PPCHIRBuilder& f, const InstrData& i) {
    XEINSTRNOTIMPLEMENTED();
    return 1;
  }
+  Value* ratrunc =
+      f.SignExtend(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), INT64_TYPE);
+
+  Value* rbtrunc =
+      f.SignExtend(f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), INT64_TYPE);
+
+  Value* v = f.Sha(f.Mul(ratrunc, rbtrunc), 32);

-  Value* v = f.SignExtend(f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
-                                  f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE)),
-                          INT64_TYPE);
  f.StoreGPR(i.XO.RT, v);
  if (i.XO.Rc) {
    f.UpdateCR(0, v);
@ -355,10 +359,13 @@ int InstrEmit_mulhwux(PPCHIRBuilder& f, const InstrData& i) {
    return 1;
  }

-  Value* v = f.ZeroExtend(
-      f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
-              f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), ARITHMETIC_UNSIGNED),
-      INT64_TYPE);
+  Value* ratrunc =
+      f.ZeroExtend(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), INT64_TYPE);
+
+  Value* rbtrunc =
+      f.ZeroExtend(f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), INT64_TYPE);
+
+  Value* v = f.Shr(f.Mul(ratrunc, rbtrunc, ARITHMETIC_UNSIGNED), 32);
  f.StoreGPR(i.XO.RT, v);
  if (i.XO.Rc) {
    f.UpdateCR(0, v);
--- a/src/xenia/cpu/ppc/ppc_emit_fpu.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_fpu.cc
@ -89,8 +89,10 @@ int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) {
 int InstrEmit_fresx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- 1.0 / (frB)

-  Value* v = f.Recip(f.LoadFPR(i.A.FRB));
-  v = f.ToSingle(v);
+  // this actually does seem to require single precision, oddly
+  // more research is needed
+  Value* v = f.Recip(f.Convert(f.LoadFPR(i.A.FRB), FLOAT32_TYPE));
+  v = f.Convert(v, FLOAT64_TYPE);  // f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
--- a/src/xenia/cpu/ppc/ppc_emit_memory.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_memory.cc
@ -11,9 +11,17 @@

 #include <stddef.h>
 #include "xenia/base/assert.h"
+#include "xenia/base/cvar.h"
 #include "xenia/cpu/ppc/ppc_context.h"
 #include "xenia/cpu/ppc/ppc_hir_builder.h"

+// Off by default. When set, the dcbf/dcbst/dcbt/dcbtst emitters in this file
+// skip their CacheControl call and emit nothing for those instructions.
+DEFINE_bool(
+    disable_prefetch_and_cachecontrol, false,
+    "Disables translating ppc prefetch/cache flush instructions to host "
+    "prefetch/cacheflush instructions. This may improve performance as these "
+    "instructions were written with the Xbox 360's cache in mind, and modern "
+    "processors do their own automatic prefetching.",
+    "CPU");
 namespace xe {
 namespace cpu {
 namespace ppc {
@ -1080,28 +1088,36 @@ int InstrEmit_stfsx(PPCHIRBuilder& f, const InstrData& i) {
 // https://randomascii.wordpress.com/2018/01/07/finding-a-cpu-design-bug-in-the-xbox-360/

 int InstrEmit_dcbf(PPCHIRBuilder& f, const InstrData& i) {
-  Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
-  f.CacheControl(ea, 128,
-                 CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE_AND_FLUSH);
+  if (!cvars::disable_prefetch_and_cachecontrol) {
+    Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
+    f.CacheControl(ea, 128,
+                   CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE_AND_FLUSH);
+  }
  return 0;
 }

 int InstrEmit_dcbst(PPCHIRBuilder& f, const InstrData& i) {
-  Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
-  f.CacheControl(ea, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE);
+  if (!cvars::disable_prefetch_and_cachecontrol) {
+    Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
+    f.CacheControl(ea, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE);
+  }
  return 0;
 }

 int InstrEmit_dcbt(PPCHIRBuilder& f, const InstrData& i) {
-  Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
-  f.CacheControl(ea, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH);
+  if (!cvars::disable_prefetch_and_cachecontrol) {
+    Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
+    f.CacheControl(ea, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH);
+  }
  return 0;
 }

 int InstrEmit_dcbtst(PPCHIRBuilder& f, const InstrData& i) {
-  Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
-  f.CacheControl(ea, 128,
-                 CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE);
+  if (!cvars::disable_prefetch_and_cachecontrol) {
+    Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
+    f.CacheControl(ea, 128,
+                   CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE);
+  }
  return 0;
 }

--- a/src/xenia/cpu/ppc/ppc_frontend.h
+++ b/src/xenia/cpu/ppc/ppc_frontend.h
@ -55,7 +55,9 @@ class PPCFrontend {
  PPCBuiltins builtins_ = {0};
  TypePool<PPCTranslator, PPCFrontend*> translator_pool_;
 };
-
+// Checks the state of the global lock and sets scratch to the current MSR
+// value.
+void CheckGlobalLock(PPCContext* ppc_context, void* arg0, void* arg1);
 }  // namespace ppc
 }  // namespace cpu
 }  // namespace xe
--- a/src/xenia/kernel/util/shim_utils.h
+++ b/src/xenia/kernel/util/shim_utils.h
@ -192,6 +192,21 @@ class ParamBase : public Param {
  T value_;
 };

+// Shim parameter that carries the PPCContext pointer itself. It can be default
+// constructed (null context), built from a raw pointer, or built from the
+// shim Init state, in which case it takes init.ppc_context.
+class ContextParam : public Param {
+ public:
+  ContextParam() : Param() {}
+  ContextParam(PPCContext* value) : Param(), ctx_(value) {}
+  ContextParam(Init& init) : Param(init), ctx_(init.ppc_context) {}
+
+  // Accessors: all three hand back the wrapped pointer unchanged.
+  PPCContext* value() const { return ctx_; }
+  operator PPCContext*() const { return value(); }
+  PPCContext* operator->() const { return value(); }
+
+ protected:
+  PPCContext* ctx_ = nullptr;
+};
+
 class PointerParam : public ParamBase<uint32_t> {
 public:
  PointerParam(Init& init) : ParamBase(init) {
@ -370,6 +385,7 @@ using int_result_t = shim::ResultBase<int32_t>;
 using dword_result_t = shim::ResultBase<uint32_t>;
 using pointer_result_t = shim::ResultBase<uint32_t>;
 using X_HRESULT_result_t = shim::ResultBase<X_HRESULT>;
+using ppc_context_t = shim::ContextParam;

 // Exported from kernel_state.cc.
 KernelState* kernel_state();
@ -422,6 +438,9 @@ inline void AppendParam(StringBuffer* string_buffer, lpdouble_t param) {
    string_buffer->AppendFormat("({:G})", param.value());
  }
 }
+// Logging overload for context parameters: `param` itself is not inspected, a
+// fixed "ContextArg" marker is appended instead.
+inline void AppendParam(StringBuffer* string_buffer, ppc_context_t param) {
+  string_buffer->Append("ContextArg");
+}
 inline void AppendParam(StringBuffer* string_buffer, lpstring_t param) {
  string_buffer->AppendFormat("{:08X}", param.guest_address());
  if (param) {
--- a/src/xenia/kernel/xboxkrnl/xboxkrnl_misc.cc
+++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_misc.cc
@ -8,12 +8,13 @@
 */

 #include "xenia/base/logging.h"
+#include "xenia/cpu/ppc/ppc_frontend.h"
+#include "xenia/cpu/processor.h"
 #include "xenia/kernel/kernel_state.h"
 #include "xenia/kernel/util/shim_utils.h"
 #include "xenia/kernel/xboxkrnl/xboxkrnl_private.h"
 #include "xenia/kernel/xthread.h"
 #include "xenia/xbox.h"
-
 namespace xe {
 namespace kernel {
 namespace xboxkrnl {
@ -22,6 +23,94 @@ void KeEnableFpuExceptions_entry(dword_t enabled) {
  // TODO(benvanik): can we do anything about exceptions?
 }
 DECLARE_XBOXKRNL_EXPORT1(KeEnableFpuExceptions, kNone, kStub);
+#if 0
+struct __declspec(align(8)) fpucontext_ptr_t {
+  char unknown_data[158];
+  __int16 field_9E;
+  char field_A0[2272];
+  unsigned __int64 saved_FPSCR;
+  double saved_fpu_regs[32];
+};
+#pragma pack(push, 1)
+struct __declspec(align(1)) r13_struct_t {
+  char field_0[6];
+  __int16 field_6;
+  char field_8[2];
+  char field_A;
+  char field_B[5];
+  int field_10;
+  char field_14[315];
+  char field_14F;
+  unsigned int field_150;
+  char field_154[427];
+  char field_2FF;
+  char field_300;
+};
+#pragma pack(pop)
+
+
+static uint64_t Do_mfmsr(ppc_context_t& ctx) {
+  auto frontend = ctx->thread_state->processor()->frontend();
+  cpu::ppc::CheckGlobalLock(
+      ctx, reinterpret_cast<void*>(&xe::global_critical_region::mutex()),
+      reinterpret_cast<void*>(&frontend->builtins()->global_lock_count));
+  return ctx->scratch;
+}
+
+void KeSaveFloatingPointState_entry(ppc_context_t& ctx) {
+  xe::Memory* memory = ctx->thread_state->memory();
+  unsigned int r13 = static_cast<unsigned int>(ctx->r[13]);
+
+
+
+  
+  r13_struct_t* st = memory->TranslateVirtual<r13_struct_t*>(r13);
+  /*
+                 lwz       r10, 0x150(r13)
+                lbz       r11, 0xA(r13)
+                tweqi     r10, 0
+                twnei     r11, 0
+  */
+
+  unsigned int r10 = st->field_150;
+  unsigned char r11 = st->field_A;
+
+  if (r10 == 0 || r11 != 0) {
+	  //trap!
+  }
+
+  //should do mfmsr here
+  
+  unsigned int r3 = xe::load_and_swap<unsigned int>(&st->field_10);
+  
+  //too much work to do the mfmsr/mtmsr stuff right now
+  int to_store = -2049;
+  xe::store_and_swap(&st->field_10, (unsigned int)to_store);
+  xe::store_and_swap(&st->field_6, (short)to_store);
+ 
+
+
+  if (r3 != ~0u) {
+    fpucontext_ptr_t* fpucontext =
+        memory->TranslateVirtual<fpucontext_ptr_t*>(r3);
+    xe::store_and_swap<uint64_t>(&fpucontext->saved_FPSCR, ctx->fpscr.value);
+	
+    for (unsigned int i = 0; i < 32; ++i) {
+      xe::store_and_swap(&fpucontext->saved_fpu_regs[i], ctx->f[i]);
+	}
+    xe::store_and_swap<unsigned short>(&fpucontext->field_9E, 0xD7FF);
+  }
+  ctx->processor->backend()->SetGuestRoundingMode(ctx.value(), 0);
+  ctx->fpscr.value = 0;
+  st->field_A = 1;
+
+  xe::store_and_swap(&st->field_10, r13 + 0x300);
+  ctx->r[3] = r3;
+
+}
+
+DECLARE_XBOXKRNL_EXPORT1(KeSaveFloatingPointState, kNone, kImplemented);
+#endif

 }  // namespace xboxkrnl
 }  // namespace kernel