From 2e9f656546caba6796b2f6ef1030f70b732a153f Mon Sep 17 00:00:00 2001
From: Connor McLaughlin <stenzek@gmail.com>
Date: Wed, 19 Aug 2020 23:26:57 +1000
Subject: [PATCH] CPU: Implement PGXP CPU Mode

This is *very* slow. You don't want to enable it if you don't need it.
It is also incompatible with the recompiler: enabling this option will
disable the recompiler.
---
 README.md                                     |    1 +
 src/common/types.h                            |    7 +
 src/core/cpu_code_cache.cpp                   |   20 +-
 src/core/cpu_code_cache.h                     |    1 +
 src/core/cpu_core.cpp                         |  497 +++++---
 src/core/cpu_recompiler_code_generator.cpp    |    6 +-
 src/core/cpu_recompiler_thunks.h              |    1 +
 src/core/cpu_types.h                          |    1 -
 src/core/host_interface.cpp                   |   20 +
 src/core/host_interface.h                     |    3 +
 src/core/pgxp.cpp                             | 1039 +++++++++++++++++
 src/core/pgxp.h                               |   52 +
 src/core/settings.cpp                         |    2 +
 src/core/settings.h                           |    6 +
 src/core/types.h                              |    7 +
 .../libretro_host_interface.cpp               |   10 +-
 src/duckstation-qt/gpusettingswidget.cpp      |    5 +
 src/duckstation-qt/gpusettingswidget.ui       |    7 +
 src/duckstation-sdl/sdl_host_interface.cpp    |    3 +
 19 files changed, 1490 insertions(+), 198 deletions(-)
diff --git a/README.md b/README.md
index 6ff2f0495..1d76ce271 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ A "BIOS" ROM image is required to to start the emulator and to play games. You c
 
 ## Latest News
 
+- 2020/08/19: CPU PGXP mode added. It is very slow and incompatible with the recompiler; only use it for games which need it.
 - 2020/08/15: Playlist support/single memcard for multi-disc games in Qt frontend added.
 - 2020/08/07: Automatic updater for standalone Windows builds.
 - 2020/08/01: Initial PGXP (geometry/perspective correction) support.
diff --git a/src/common/types.h b/src/common/types.h
index 02657135d..6b94fd87f 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -15,6 +15,13 @@
 #endif
 #endif
 
+// Expands to ALWAYS_INLINE in non-debug builds and to nothing when _DEBUG is defined.
+#ifdef _DEBUG
+#define ALWAYS_INLINE_RELEASE
+#else
+#define ALWAYS_INLINE_RELEASE ALWAYS_INLINE
+#endif
+
 // unreferenced parameter macro
 #ifndef UNREFERENCED_VARIABLE
 #if defined(_MSC_VER)
diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp
index 1bd8bda9c..eb21533f4 100644
--- a/src/core/cpu_code_cache.cpp
+++ b/src/core/cpu_code_cache.cpp
@@ -110,7 +110,8 @@ void Shutdown()
 #endif
 }
 
-void Execute()
+template<PGXPMode pgxp_mode>
+static void ExecuteImpl()
 {
   CodeBlockKey next_block_key;
 
@@ -157,7 +158,7 @@ void Execute()
       }
       else
       {
-        InterpretCachedBlock(*block);
+        InterpretCachedBlock<pgxp_mode>(*block);
       }
 
       if (g_state.pending_ticks >= g_state.downcount)
@@ -212,6 +213,21 @@ void Execute()
   g_state.regs.npc = g_state.regs.pc;
 }
 
+/// Dispatches to the ExecuteImpl instantiation matching the PGXP settings (off, memory, or CPU).
+void Execute()
+{
+  if (!g_settings.gpu_pgxp_enable)
+  {
+    ExecuteImpl<PGXPMode::Disabled>();
+    return;
+  }
+
+  if (g_settings.gpu_pgxp_cpu)
+    ExecuteImpl<PGXPMode::CPU>();
+  else
+    ExecuteImpl<PGXPMode::Memory>();
+}
+
 #ifdef WITH_RECOMPILER
 
 void ExecuteRecompiler()
diff --git a/src/core/cpu_code_cache.h b/src/core/cpu_code_cache.h
index 28401bea6..eec01ac3b 100644
--- a/src/core/cpu_code_cache.h
+++ b/src/core/cpu_code_cache.h
@@ -96,6 +96,7 @@ void SetUseRecompiler(bool enable);
 /// Invalidates all blocks which are in the range of the specified code page.
 void InvalidateBlocksWithPageIndex(u32 page_index);
 
+template<PGXPMode pgxp_mode>
 void InterpretCachedBlock(const CodeBlock& block);
 void InterpretUncachedBlock();
 
diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp
index 6ad58d962..6e2b8cb34 100644
--- a/src/core/cpu_core.cpp
+++ b/src/core/cpu_core.cpp
@@ -15,19 +15,9 @@ Log_SetChannel(CPU::Core);
 
 namespace CPU {
 
-/// Sets the PC and flushes the pipeline.
 static void SetPC(u32 new_pc);
-
-// Updates load delays - call after each instruction
 static void UpdateLoadDelay();
-
-// Fetches the instruction at m_regs.npc
-static void ExecuteInstruction();
-static void ExecuteCop0Instruction();
-static void ExecuteCop2Instruction();
 static void Branch(u32 target);
-
-// clears pipeline of load/branch delays
 static void FlushPipeline();
 
 State g_state;
@@ -139,14 +129,14 @@ bool DoState(StateWrapper& sw)
   return !sw.HasError();
 }
 
-void SetPC(u32 new_pc)
+ALWAYS_INLINE_RELEASE void SetPC(u32 new_pc)
 {
   DebugAssert(Common::IsAlignedPow2(new_pc, 4));
   g_state.regs.npc = new_pc;
   FlushPipeline();
 }
 
-void Branch(u32 target)
+ALWAYS_INLINE_RELEASE void Branch(u32 target)
 {
   if (!Common::IsAlignedPow2(target, 4))
   {
@@ -240,7 +230,7 @@ void ClearExternalInterrupt(u8 bit)
   g_state.cop0_regs.cause.Ip &= static_cast<u8>(~(1u << bit));
 }
 
-void UpdateLoadDelay()
+ALWAYS_INLINE_RELEASE static void UpdateLoadDelay()
 {
   // the old value is needed in case the delay slot instruction overwrites the same register
   if (g_state.load_delay_reg != Reg::count)
@@ -251,7 +241,7 @@ void UpdateLoadDelay()
   g_state.next_load_delay_reg = Reg::count;
 }
 
-void FlushPipeline()
+ALWAYS_INLINE_RELEASE static void FlushPipeline()
 {
   // loads are flushed
   g_state.next_load_delay_reg = Reg::count;
@@ -275,12 +265,12 @@ void FlushPipeline()
   g_state.current_instruction_was_branch_taken = false;
 }
 
-ALWAYS_INLINE u32 ReadReg(Reg rs)
+ALWAYS_INLINE static u32 ReadReg(Reg rs)
 {
   return g_state.regs.r[static_cast<u8>(rs)];
 }
 
-ALWAYS_INLINE void WriteReg(Reg rd, u32 value)
+ALWAYS_INLINE static void WriteReg(Reg rd, u32 value)
 {
   g_state.regs.r[static_cast<u8>(rd)] = value;
   g_state.load_delay_reg = (rd == g_state.load_delay_reg) ? Reg::count : g_state.load_delay_reg;
@@ -289,7 +279,7 @@ ALWAYS_INLINE void WriteReg(Reg rd, u32 value)
   g_state.regs.zero = 0;
 }
 
-static void WriteRegDelayed(Reg rd, u32 value)
+ALWAYS_INLINE_RELEASE static void WriteRegDelayed(Reg rd, u32 value)
 {
   Assert(g_state.next_load_delay_reg == Reg::count);
   if (rd == Reg::zero)
@@ -304,7 +294,7 @@ static void WriteRegDelayed(Reg rd, u32 value)
   g_state.next_load_delay_value = value;
 }
 
-static std::optional<u32> ReadCop0Reg(Cop0Reg reg)
+ALWAYS_INLINE_RELEASE static std::optional<u32> ReadCop0Reg(Cop0Reg reg)
 {
   switch (reg)
   {
@@ -347,7 +337,7 @@ static std::optional<u32> ReadCop0Reg(Cop0Reg reg)
   }
 }
 
-static void WriteCop0Reg(Cop0Reg reg, u32 value)
+ALWAYS_INLINE_RELEASE static void WriteCop0Reg(Cop0Reg reg, u32 value)
 {
   switch (reg)
   {
@@ -431,12 +421,12 @@ static void LogInstruction(u32 bits, u32 pc, Registers* regs)
   WriteToExecutionLog("%08x: %08x %s\n", pc, bits, instr.GetCharArray());
 }
 
-static constexpr bool AddOverflow(u32 old_value, u32 add_value, u32 new_value)
+ALWAYS_INLINE static constexpr bool AddOverflow(u32 old_value, u32 add_value, u32 new_value)
 {
   return (((new_value ^ old_value) & (new_value ^ add_value)) & UINT32_C(0x80000000)) != 0;
 }
 
-static constexpr bool SubOverflow(u32 old_value, u32 sub_value, u32 new_value)
+ALWAYS_INLINE static constexpr bool SubOverflow(u32 old_value, u32 sub_value, u32 new_value)
 {
   return (((new_value ^ old_value) & (old_value ^ sub_value)) & UINT32_C(0x80000000)) != 0;
 }
@@ -467,53 +457,8 @@ void DisassembleAndPrint(u32 addr, u32 instructions_before /* = 0 */, u32 instru
   }
 }
 
-void Execute()
-{
-  g_state.frame_done = false;
-  while (!g_state.frame_done)
-  {
-    TimingEvents::UpdateCPUDowncount();
-
-    while (g_state.pending_ticks <= g_state.downcount)
-    {
-      if (HasPendingInterrupt())
-        DispatchInterrupt();
-
-      g_state.pending_ticks++;
-
-      // now executing the instruction we previously fetched
-      g_state.current_instruction.bits = g_state.next_instruction.bits;
-      g_state.current_instruction_pc = g_state.regs.pc;
-      g_state.current_instruction_in_branch_delay_slot = g_state.next_instruction_is_branch_delay_slot;
-      g_state.current_instruction_was_branch_taken = g_state.branch_was_taken;
-      g_state.next_instruction_is_branch_delay_slot = false;
-      g_state.branch_was_taken = false;
-      g_state.exception_raised = false;
-
-      // fetch the next instruction
-      if (!FetchInstruction())
-        continue;
-
-#if 0 // GTE flag test debugging
-      if (g_state.m_current_instruction_pc == 0x8002cdf4)
-      {
-        if (g_state.m_regs.v1 != g_state.m_regs.v0)
-          printf("Got %08X Expected? %08X\n", g_state.m_regs.v1, g_state.m_regs.v0);
-      }
-#endif
-
-      // execute the instruction we previously fetched
-      ExecuteInstruction();
-
-      // next load delay
-      UpdateLoadDelay();
-    }
-
-    TimingEvents::RunEvents();
-  }
-}
-
-void ExecuteInstruction()
+template<PGXPMode pgxp_mode>
+ALWAYS_INLINE_RELEASE static void ExecuteInstruction()
 {
   const Instruction inst = g_state.current_instruction;
 
@@ -525,14 +470,6 @@ void ExecuteInstruction()
   }
 #endif
 
-#if 0
-  if (g_state.m_current_instruction_pc == 0x8002bf50)
-  {
-    TRACE_EXECUTION = true;
-    __debugbreak();
-  }
-#endif
-
 #ifdef _DEBUG
   if (TRACE_EXECUTION)
     PrintInstruction(inst.bits, g_state.current_instruction_pc, &g_state.regs);
@@ -549,6 +486,9 @@ void ExecuteInstruction()
         case InstructionFunct::sll:
         {
           const u32 new_value = ReadReg(inst.r.rt) << inst.r.shamt;
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_SLL(inst.bits, new_value, ReadReg(inst.r.rt));
+
           WriteReg(inst.r.rd, new_value);
         }
         break;
@@ -556,6 +496,9 @@ void ExecuteInstruction()
         case InstructionFunct::srl:
         {
           const u32 new_value = ReadReg(inst.r.rt) >> inst.r.shamt;
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_SRL(inst.bits, new_value, ReadReg(inst.r.rt));
+
           WriteReg(inst.r.rd, new_value);
         }
         break;
@@ -563,6 +506,9 @@ void ExecuteInstruction()
         case InstructionFunct::sra:
         {
           const u32 new_value = static_cast<u32>(static_cast<s32>(ReadReg(inst.r.rt)) >> inst.r.shamt);
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_SRA(inst.bits, new_value, ReadReg(inst.r.rt));
+
           WriteReg(inst.r.rd, new_value);
         }
         break;
@@ -571,6 +517,9 @@ void ExecuteInstruction()
         {
           const u32 shift_amount = ReadReg(inst.r.rs) & UINT32_C(0x1F);
           const u32 new_value = ReadReg(inst.r.rt) << shift_amount;
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_SLLV(inst.bits, new_value, ReadReg(inst.r.rt), shift_amount);
+
           WriteReg(inst.r.rd, new_value);
         }
         break;
@@ -579,6 +528,9 @@ void ExecuteInstruction()
         {
           const u32 shift_amount = ReadReg(inst.r.rs) & UINT32_C(0x1F);
           const u32 new_value = ReadReg(inst.r.rt) >> shift_amount;
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_SRLV(inst.bits, new_value, ReadReg(inst.r.rt), shift_amount);
+
           WriteReg(inst.r.rd, new_value);
         }
         break;
@@ -587,6 +539,9 @@ void ExecuteInstruction()
         {
           const u32 shift_amount = ReadReg(inst.r.rs) & UINT32_C(0x1F);
           const u32 new_value = static_cast<u32>(static_cast<s32>(ReadReg(inst.r.rt)) >> shift_amount);
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_SRAV(inst.bits, new_value, ReadReg(inst.r.rt), shift_amount);
+
           WriteReg(inst.r.rd, new_value);
         }
         break;
@@ -594,6 +549,9 @@ void ExecuteInstruction()
         case InstructionFunct::and_:
         {
           const u32 new_value = ReadReg(inst.r.rs) & ReadReg(inst.r.rt);
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_AND(inst.bits, new_value, ReadReg(inst.r.rs), ReadReg(inst.r.rt));
+
           WriteReg(inst.r.rd, new_value);
         }
         break;
@@ -601,6 +559,9 @@ void ExecuteInstruction()
         case InstructionFunct::or_:
         {
           const u32 new_value = ReadReg(inst.r.rs) | ReadReg(inst.r.rt);
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_OR(inst.bits, new_value, ReadReg(inst.r.rs), ReadReg(inst.r.rt));
+
           WriteReg(inst.r.rd, new_value);
         }
         break;
@@ -608,6 +569,9 @@ void ExecuteInstruction()
         case InstructionFunct::xor_:
         {
           const u32 new_value = ReadReg(inst.r.rs) ^ ReadReg(inst.r.rt);
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_XOR(inst.bits, new_value, ReadReg(inst.r.rs), ReadReg(inst.r.rt));
+
           WriteReg(inst.r.rd, new_value);
         }
         break;
@@ -615,6 +579,9 @@ void ExecuteInstruction()
         case InstructionFunct::nor:
         {
           const u32 new_value = ~(ReadReg(inst.r.rs) | ReadReg(inst.r.rt));
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_NOR(inst.bits, new_value, ReadReg(inst.r.rs), ReadReg(inst.r.rt));
+
           WriteReg(inst.r.rd, new_value);
         }
         break;
@@ -630,6 +597,9 @@ void ExecuteInstruction()
             return;
           }
 
+          if constexpr (pgxp_mode == PGXPMode::CPU)
+            PGXP::CPU_ADD(inst.bits, new_value, ReadReg(inst.r.rs), ReadReg(inst.r.rt));
+
           WriteReg(inst.r.rd, new_value);
         }
         break;
@@ -637,6 +607,9 @@ void ExecuteInstruction()
         case InstructionFunct::addu:
         {
           const u32 new_value = ReadReg(inst.r.rs) + ReadReg(inst.r.rt);
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_ADDU(inst.bits, new_value, ReadReg(inst.r.rs), ReadReg(inst.r.rt));
+
           WriteReg(inst.r.rd, new_value);
         }
         break;
@@ -652,6 +625,9 @@ void ExecuteInstruction()
             return;
           }
 
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_SUB(inst.bits, new_value, ReadReg(inst.r.rs), ReadReg(inst.r.rt));
+
           WriteReg(inst.r.rd, new_value);
         }
         break;
@@ -659,6 +635,9 @@ void ExecuteInstruction()
         case InstructionFunct::subu:
         {
           const u32 new_value = ReadReg(inst.r.rs) - ReadReg(inst.r.rt);
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_SUBU(inst.bits, new_value, ReadReg(inst.r.rs), ReadReg(inst.r.rt));
+
           WriteReg(inst.r.rd, new_value);
         }
         break;
@@ -666,6 +645,9 @@ void ExecuteInstruction()
         case InstructionFunct::slt:
         {
           const u32 result = BoolToUInt32(static_cast<s32>(ReadReg(inst.r.rs)) < static_cast<s32>(ReadReg(inst.r.rt)));
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_SLT(inst.bits, result, ReadReg(inst.r.rs), ReadReg(inst.r.rt));
+
           WriteReg(inst.r.rd, result);
         }
         break;
@@ -673,12 +655,18 @@ void ExecuteInstruction()
         case InstructionFunct::sltu:
         {
           const u32 result = BoolToUInt32(ReadReg(inst.r.rs) < ReadReg(inst.r.rt));
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_SLTU(inst.bits, result, ReadReg(inst.r.rs), ReadReg(inst.r.rt));
+
           WriteReg(inst.r.rd, result);
         }
         break;
 
         case InstructionFunct::mfhi:
         {
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_MFHI(inst.bits, ReadReg(inst.r.rd), g_state.regs.hi);
+
           WriteReg(inst.r.rd, g_state.regs.hi);
         }
         break;
@@ -686,12 +674,18 @@ void ExecuteInstruction()
         case InstructionFunct::mthi:
         {
           const u32 value = ReadReg(inst.r.rs);
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_MTHI(inst.bits, g_state.regs.hi, value);
+
           g_state.regs.hi = value;
         }
         break;
 
         case InstructionFunct::mflo:
         {
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_MFLO(inst.bits, ReadReg(inst.r.rd), g_state.regs.lo);
+
           WriteReg(inst.r.rd, g_state.regs.lo);
         }
         break;
@@ -699,6 +693,9 @@ void ExecuteInstruction()
         case InstructionFunct::mtlo:
         {
           const u32 value = ReadReg(inst.r.rs);
+          if constexpr (pgxp_mode == PGXPMode::CPU)
+            PGXP::CPU_MTLO(inst.bits, g_state.regs.lo, value);
+
           g_state.regs.lo = value;
         }
         break;
@@ -709,8 +706,12 @@ void ExecuteInstruction()
           const u32 rhs = ReadReg(inst.r.rt);
           const u64 result =
             static_cast<u64>(static_cast<s64>(SignExtend64(lhs)) * static_cast<s64>(SignExtend64(rhs)));
+
           g_state.regs.hi = Truncate32(result >> 32);
           g_state.regs.lo = Truncate32(result);
+
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_MULT(inst.bits, g_state.regs.hi, g_state.regs.lo, lhs, rhs);
         }
         break;
 
@@ -719,6 +720,10 @@ void ExecuteInstruction()
           const u32 lhs = ReadReg(inst.r.rs);
           const u32 rhs = ReadReg(inst.r.rt);
           const u64 result = ZeroExtend64(lhs) * ZeroExtend64(rhs);
           g_state.regs.hi = Truncate32(result >> 32);
           g_state.regs.lo = Truncate32(result);
+
+          // Notify PGXP after HI/LO are written so it sees the new product, matching mult/div/divu.
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_MULTU(inst.bits, g_state.regs.hi, g_state.regs.lo, lhs, rhs);
         }
@@ -746,6 +751,9 @@ void ExecuteInstruction()
             g_state.regs.lo = static_cast<u32>(num / denom);
             g_state.regs.hi = static_cast<u32>(num % denom);
           }
+
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_DIV(inst.bits, g_state.regs.hi, g_state.regs.lo, num, denom);
         }
         break;
 
@@ -765,6 +773,9 @@ void ExecuteInstruction()
             g_state.regs.lo = num / denom;
             g_state.regs.hi = num % denom;
           }
+
+          if constexpr (pgxp_mode >= PGXPMode::CPU)
+            PGXP::CPU_DIVU(inst.bits, g_state.regs.hi, g_state.regs.lo, num, denom);
         }
         break;
 
@@ -808,25 +819,44 @@ void ExecuteInstruction()
 
     case InstructionOp::lui:
     {
-      WriteReg(inst.i.rt, inst.i.imm_zext32() << 16);
+      const u32 value = inst.i.imm_zext32() << 16;
+      WriteReg(inst.i.rt, value);
+
+      if constexpr (pgxp_mode >= PGXPMode::CPU)
+        PGXP::CPU_LUI(inst.bits, value);
     }
     break;
 
     case InstructionOp::andi:
     {
-      WriteReg(inst.i.rt, ReadReg(inst.i.rs) & inst.i.imm_zext32());
+      const u32 new_value = ReadReg(inst.i.rs) & inst.i.imm_zext32();
+
+      if constexpr (pgxp_mode >= PGXPMode::CPU)
+        PGXP::CPU_ANDI(inst.bits, new_value, ReadReg(inst.i.rs));
+
+      WriteReg(inst.i.rt, new_value);
     }
     break;
 
     case InstructionOp::ori:
     {
-      WriteReg(inst.i.rt, ReadReg(inst.i.rs) | inst.i.imm_zext32());
+      const u32 new_value = ReadReg(inst.i.rs) | inst.i.imm_zext32();
+
+      if constexpr (pgxp_mode >= PGXPMode::CPU)
+        PGXP::CPU_ORI(inst.bits, new_value, ReadReg(inst.i.rs));
+
+      WriteReg(inst.i.rt, new_value);
     }
     break;
 
     case InstructionOp::xori:
     {
-      WriteReg(inst.i.rt, ReadReg(inst.i.rs) ^ inst.i.imm_zext32());
+      const u32 new_value = ReadReg(inst.i.rs) ^ inst.i.imm_zext32();
+
+      if constexpr (pgxp_mode >= PGXPMode::CPU)
+        PGXP::CPU_XORI(inst.bits, new_value, ReadReg(inst.i.rs));
+
+      WriteReg(inst.i.rt, new_value);
     }
     break;
 
@@ -841,19 +871,31 @@ void ExecuteInstruction()
         return;
       }
 
+      // Reached only when the check above did not return early; report the add (not an and) to PGXP.
+      if constexpr (pgxp_mode >= PGXPMode::CPU)
+        PGXP::CPU_ADDI(inst.bits, new_value, ReadReg(inst.i.rs));
       WriteReg(inst.i.rt, new_value);
     }
     break;
 
     case InstructionOp::addiu:
     {
-      WriteReg(inst.i.rt, ReadReg(inst.i.rs) + inst.i.imm_sext32());
+      const u32 new_value = ReadReg(inst.i.rs) + inst.i.imm_sext32();
+
+      if constexpr (pgxp_mode >= PGXPMode::CPU)
+        PGXP::CPU_ADDIU(inst.bits, new_value, ReadReg(inst.i.rs));
+
+      WriteReg(inst.i.rt, new_value);
     }
     break;
 
     case InstructionOp::slti:
     {
       const u32 result = BoolToUInt32(static_cast<s32>(ReadReg(inst.i.rs)) < static_cast<s32>(inst.i.imm_sext32()));
+
+      if constexpr (pgxp_mode >= PGXPMode::CPU)
+        PGXP::CPU_SLTI(inst.bits, result, ReadReg(inst.i.rs));
+
       WriteReg(inst.i.rt, result);
     }
     break;
@@ -861,6 +903,10 @@ void ExecuteInstruction()
     case InstructionOp::sltiu:
     {
       const u32 result = BoolToUInt32(ReadReg(inst.i.rs) < inst.i.imm_sext32());
+
+      if constexpr (pgxp_mode >= PGXPMode::CPU)
+        PGXP::CPU_SLTIU(inst.bits, result, ReadReg(inst.i.rs));
+
       WriteReg(inst.i.rt, result);
     }
     break;
@@ -876,7 +922,7 @@ void ExecuteInstruction()
 
       WriteRegDelayed(inst.i.rt, sxvalue);
 
-      if (g_settings.gpu_pgxp_enable)
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
         PGXP::CPU_LBx(inst.bits, sxvalue, addr);
     }
     break;
@@ -891,7 +937,7 @@ void ExecuteInstruction()
       const u32 sxvalue = SignExtend32(value);
       WriteRegDelayed(inst.i.rt, sxvalue);
 
-      if (g_settings.gpu_pgxp_enable)
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
         PGXP::CPU_LHx(inst.bits, sxvalue, addr);
     }
     break;
@@ -905,7 +951,7 @@ void ExecuteInstruction()
 
       WriteRegDelayed(inst.i.rt, value);
 
-      if (g_settings.gpu_pgxp_enable)
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
         PGXP::CPU_LW(inst.bits, value, addr);
     }
     break;
@@ -920,7 +966,7 @@ void ExecuteInstruction()
       const u32 zxvalue = ZeroExtend32(value);
       WriteRegDelayed(inst.i.rt, zxvalue);
 
-      if (g_settings.gpu_pgxp_enable)
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
         PGXP::CPU_LBx(inst.bits, zxvalue, addr);
     }
     break;
@@ -935,7 +981,7 @@ void ExecuteInstruction()
       const u32 zxvalue = ZeroExtend32(value);
       WriteRegDelayed(inst.i.rt, zxvalue);
 
-      if (g_settings.gpu_pgxp_enable)
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
         PGXP::CPU_LHx(inst.bits, zxvalue, addr);
     }
     break;
@@ -966,7 +1012,7 @@ void ExecuteInstruction()
 
       WriteRegDelayed(inst.i.rt, new_value);
 
-      if (g_settings.gpu_pgxp_enable)
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
         PGXP::CPU_LW(inst.bits, new_value, addr);
     }
     break;
@@ -977,7 +1023,7 @@ void ExecuteInstruction()
       const u8 value = Truncate8(ReadReg(inst.i.rt));
       WriteMemoryByte(addr, value);
 
-      if (g_settings.gpu_pgxp_enable)
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
         PGXP::CPU_SB(inst.bits, value, addr);
     }
     break;
@@ -988,7 +1034,7 @@ void ExecuteInstruction()
       const u16 value = Truncate16(ReadReg(inst.i.rt));
       WriteMemoryHalfWord(addr, value);
 
-      if (g_settings.gpu_pgxp_enable)
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
         PGXP::CPU_SH(inst.bits, value, addr);
     }
     break;
@@ -999,7 +1045,7 @@ void ExecuteInstruction()
       const u32 value = ReadReg(inst.i.rt);
       WriteMemoryWord(addr, value);
 
-      if (g_settings.gpu_pgxp_enable)
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
         PGXP::CPU_SW(inst.bits, value, addr);
     }
     break;
@@ -1029,7 +1075,7 @@ void ExecuteInstruction()
 
       WriteMemoryWord(aligned_addr, new_value);
 
-      if (g_settings.gpu_pgxp_enable)
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
         PGXP::CPU_SW(inst.bits, new_value, addr);
     }
     break;
@@ -1114,7 +1160,58 @@ void ExecuteInstruction()
         return;
       }
 
-      ExecuteCop0Instruction();
+      if (inst.cop.IsCommonInstruction())
+      {
+        switch (inst.cop.CommonOp())
+        {
+          case CopCommonInstruction::mfcn:
+          {
+            const std::optional<u32> value = ReadCop0Reg(static_cast<Cop0Reg>(inst.r.rd.GetValue()));
+
+            if constexpr (pgxp_mode == PGXPMode::CPU)
+              PGXP::CPU_MFC0(inst.bits, value.value_or(0), ReadReg(inst.i.rs));
+
+            if (value)
+              WriteRegDelayed(inst.r.rt, value.value());
+            else
+              RaiseException(Exception::RI);
+          }
+          break;
+
+          case CopCommonInstruction::mtcn:
+          {
+            WriteCop0Reg(static_cast<Cop0Reg>(inst.r.rd.GetValue()), ReadReg(inst.r.rt));
+
+            if constexpr (pgxp_mode == PGXPMode::CPU)
+            {
+              PGXP::CPU_MTC0(inst.bits, ReadCop0Reg(static_cast<Cop0Reg>(inst.r.rd.GetValue())).value_or(0),
+                             ReadReg(inst.i.rs));
+            }
+          }
+          break;
+
+          default:
+            Panic("Missing implementation");
+            break;
+        }
+      }
+      else
+      {
+        switch (inst.cop.Cop0Op())
+        {
+          case Cop0Instruction::rfe:
+          {
+            // restore mode
+            g_state.cop0_regs.sr.mode_bits =
+              (g_state.cop0_regs.sr.mode_bits & UINT32_C(0b110000)) | (g_state.cop0_regs.sr.mode_bits >> 2);
+          }
+          break;
+
+          default:
+            Panic("Missing implementation");
+            break;
+        }
+      }
     }
     break;
 
@@ -1127,7 +1224,61 @@ void ExecuteInstruction()
         return;
       }
 
-      ExecuteCop2Instruction();
+      if (inst.cop.IsCommonInstruction())
+      {
+        // TODO: Combine with cop0.
+        switch (inst.cop.CommonOp())
+        {
+          case CopCommonInstruction::cfcn:
+          {
+            const u32 value = GTE::ReadRegister(static_cast<u32>(inst.r.rd.GetValue()) + 32);
+            WriteRegDelayed(inst.r.rt, value);
+
+            if constexpr (pgxp_mode >= PGXPMode::Memory)
+              PGXP::CPU_CFC2(inst.bits, value, value);
+          }
+          break;
+
+          case CopCommonInstruction::ctcn:
+          {
+            const u32 value = ReadReg(inst.r.rt);
+            GTE::WriteRegister(static_cast<u32>(inst.r.rd.GetValue()) + 32, value);
+
+            if constexpr (pgxp_mode >= PGXPMode::Memory)
+              PGXP::CPU_CTC2(inst.bits, value, value);
+          }
+          break;
+
+          case CopCommonInstruction::mfcn:
+          {
+            const u32 value = GTE::ReadRegister(static_cast<u32>(inst.r.rd.GetValue()));
+            WriteRegDelayed(inst.r.rt, value);
+
+            if constexpr (pgxp_mode >= PGXPMode::Memory)
+              PGXP::CPU_MFC2(inst.bits, value, value);
+          }
+          break;
+
+          case CopCommonInstruction::mtcn:
+          {
+            const u32 value = ReadReg(inst.r.rt);
+            GTE::WriteRegister(static_cast<u32>(inst.r.rd.GetValue()), value);
+
+            if constexpr (pgxp_mode >= PGXPMode::Memory)
+              PGXP::CPU_MTC2(inst.bits, value, value);
+          }
+          break;
+
+          case CopCommonInstruction::bcnc:
+          default:
+            Panic("Missing implementation");
+            break;
+        }
+      }
+      else
+      {
+        GTE::ExecuteInstruction(inst.bits);
+      }
     }
     break;
 
@@ -1147,7 +1298,7 @@ void ExecuteInstruction()
 
       GTE::WriteRegister(ZeroExtend32(static_cast<u8>(inst.i.rt.GetValue())), value);
 
-      if (g_settings.gpu_pgxp_enable)
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
         PGXP::CPU_LWC2(inst.bits, value, addr);
     }
     break;
@@ -1165,12 +1316,12 @@ void ExecuteInstruction()
       const u32 value = GTE::ReadRegister(ZeroExtend32(static_cast<u8>(inst.i.rt.GetValue())));
       WriteMemoryWord(addr, value);
 
-      if (g_settings.gpu_pgxp_enable)
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
         PGXP::CPU_SWC2(inst.bits, value, addr);
     }
     break;
 
-    // swc0/lwc0/cop1/cop3 are essentially no-ops
+      // swc0/lwc0/cop1/cop3 are essentially no-ops
     case InstructionOp::cop1:
     case InstructionOp::cop3:
     case InstructionOp::lwc0:
@@ -1183,7 +1334,7 @@ void ExecuteInstruction()
     }
     break;
 
-    // everything else is reserved/invalid
+      // everything else is reserved/invalid
     default:
     {
       RaiseException(Exception::RI);
@@ -1192,117 +1343,71 @@ void ExecuteInstruction()
   }
 }
 
-void ExecuteCop0Instruction()
+template<PGXPMode pgxp_mode>
+static void ExecuteImpl()
 {
-  const Instruction inst = g_state.current_instruction;
-
-  if (inst.cop.IsCommonInstruction())
+  g_state.frame_done = false;
+  while (!g_state.frame_done)
   {
-    switch (inst.cop.CommonOp())
+    TimingEvents::UpdateCPUDowncount();
+
+    while (g_state.pending_ticks <= g_state.downcount)
     {
-      case CopCommonInstruction::mfcn:
-      {
-        const std::optional<u32> value = ReadCop0Reg(static_cast<Cop0Reg>(inst.r.rd.GetValue()));
-        if (value)
-          WriteRegDelayed(inst.r.rt, value.value());
-        else
-          RaiseException(Exception::RI);
-      }
-      break;
+      if (HasPendingInterrupt())
+        DispatchInterrupt();
 
-      case CopCommonInstruction::mtcn:
-      {
-        WriteCop0Reg(static_cast<Cop0Reg>(inst.r.rd.GetValue()), ReadReg(inst.r.rt));
-      }
-      break;
+      g_state.pending_ticks++;
 
-      default:
-        Panic("Missing implementation");
-        break;
+      // now executing the instruction we previously fetched
+      g_state.current_instruction.bits = g_state.next_instruction.bits;
+      g_state.current_instruction_pc = g_state.regs.pc;
+      g_state.current_instruction_in_branch_delay_slot = g_state.next_instruction_is_branch_delay_slot;
+      g_state.current_instruction_was_branch_taken = g_state.branch_was_taken;
+      g_state.next_instruction_is_branch_delay_slot = false;
+      g_state.branch_was_taken = false;
+      g_state.exception_raised = false;
+
+      // fetch the next instruction
+      if (!FetchInstruction())
+        continue;
+
+#if 0 // GTE flag test debugging
+      if (g_state.m_current_instruction_pc == 0x8002cdf4)
+      {
+        if (g_state.m_regs.v1 != g_state.m_regs.v0)
+          printf("Got %08X Expected? %08X\n", g_state.m_regs.v1, g_state.m_regs.v0);
+      }
+#endif
+
+      // execute the instruction we previously fetched
+      ExecuteInstruction<pgxp_mode>();
+
+      // next load delay
+      UpdateLoadDelay();
     }
-  }
-  else
-  {
-    switch (inst.cop.Cop0Op())
-    {
-      case Cop0Instruction::rfe:
-      {
-        // restore mode
-        g_state.cop0_regs.sr.mode_bits =
-          (g_state.cop0_regs.sr.mode_bits & UINT32_C(0b110000)) | (g_state.cop0_regs.sr.mode_bits >> 2);
-      }
-      break;
 
-      default:
-        Panic("Missing implementation");
-        break;
-    }
+    TimingEvents::RunEvents();
   }
 }
 
-void ExecuteCop2Instruction()
+void Execute()
 {
-  const Instruction inst = g_state.current_instruction;
-
-  if (inst.cop.IsCommonInstruction())
+  if (g_settings.gpu_pgxp_enable)
   {
-    // TODO: Combine with cop0.
-    switch (inst.cop.CommonOp())
-    {
-      case CopCommonInstruction::cfcn:
-      {
-        const u32 value = GTE::ReadRegister(static_cast<u32>(inst.r.rd.GetValue()) + 32);
-        WriteRegDelayed(inst.r.rt, value);
-
-        if (g_settings.gpu_pgxp_enable)
-          PGXP::CPU_CFC2(inst.bits, value, value);
-      }
-      break;
-
-      case CopCommonInstruction::ctcn:
-      {
-        const u32 value = ReadReg(inst.r.rt);
-        GTE::WriteRegister(static_cast<u32>(inst.r.rd.GetValue()) + 32, value);
-
-        if (g_settings.gpu_pgxp_enable)
-          PGXP::CPU_CTC2(inst.bits, value, value);
-      }
-      break;
-
-      case CopCommonInstruction::mfcn:
-      {
-        const u32 value = GTE::ReadRegister(static_cast<u32>(inst.r.rd.GetValue()));
-        WriteRegDelayed(inst.r.rt, value);
-
-        if (g_settings.gpu_pgxp_enable)
-          PGXP::CPU_MFC2(inst.bits, value, value);
-      }
-      break;
-
-      case CopCommonInstruction::mtcn:
-      {
-        const u32 value = ReadReg(inst.r.rt);
-        GTE::WriteRegister(static_cast<u32>(inst.r.rd.GetValue()), value);
-
-        if (g_settings.gpu_pgxp_enable)
-          PGXP::CPU_MTC2(inst.bits, value, value);
-      }
-      break;
-
-      case CopCommonInstruction::bcnc:
-      default:
-        Panic("Missing implementation");
-        break;
-    }
+    if (g_settings.gpu_pgxp_cpu)
+      ExecuteImpl<PGXPMode::CPU>();
+    else
+      ExecuteImpl<PGXPMode::Memory>();
   }
   else
   {
-    GTE::ExecuteInstruction(inst.bits);
+    ExecuteImpl<PGXPMode::Disabled>();
   }
 }
 
 namespace CodeCache {
 
+template<PGXPMode pgxp_mode>
 void InterpretCachedBlock(const CodeBlock& block)
 {
   // set up the state so we've already fetched the instruction
@@ -1327,7 +1432,7 @@ void InterpretCachedBlock(const CodeBlock& block)
     g_state.regs.npc += 4;
 
     // execute the instruction we previously fetched
-    ExecuteInstruction();
+    ExecuteInstruction<pgxp_mode>();
 
     // next load delay
     UpdateLoadDelay();
@@ -1340,6 +1445,10 @@ void InterpretCachedBlock(const CodeBlock& block)
   g_state.next_instruction_is_branch_delay_slot = false;
 }
 
+template void InterpretCachedBlock<PGXPMode::Disabled>(const CodeBlock& block);
+template void InterpretCachedBlock<PGXPMode::Memory>(const CodeBlock& block);
+template void InterpretCachedBlock<PGXPMode::CPU>(const CodeBlock& block);
+
 void InterpretUncachedBlock()
 {
   Panic("Fixme with regards to re-fetching PC");
@@ -1365,7 +1474,7 @@ void InterpretUncachedBlock()
       break;
 
     // execute the instruction we previously fetched
-    ExecuteInstruction();
+    ExecuteInstruction<PGXPMode::Disabled>();
 
     // next load delay
     UpdateLoadDelay();
@@ -1387,7 +1496,13 @@ namespace Recompiler::Thunks {
 
 bool InterpretInstruction()
 {
-  ExecuteInstruction();
+  ExecuteInstruction<PGXPMode::Disabled>();
+  return g_state.exception_raised;
+}
+
+bool InterpretInstructionPGXP()
+{
+  ExecuteInstruction<PGXPMode::Memory>();
   return g_state.exception_raised;
 }
 
diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp
index 2e8802c17..1c7ae0db4 100644
--- a/src/core/cpu_recompiler_code_generator.cpp
+++ b/src/core/cpu_recompiler_code_generator.cpp
@@ -1014,12 +1014,14 @@ bool CodeGenerator::Compile_Fallback(const CodeBlockInstruction& cbi)
   {
     // TODO: Use carry flag or something here too
     Value return_value = m_register_cache.AllocateScratch(RegSize_8);
-    EmitFunctionCall(&return_value, &Thunks::InterpretInstruction);
+    EmitFunctionCall(&return_value,
+                     g_settings.gpu_pgxp_enable ? &Thunks::InterpretInstructionPGXP : &Thunks::InterpretInstruction);
     EmitExceptionExitOnBool(return_value);
   }
   else
   {
-    EmitFunctionCall(nullptr, &Thunks::InterpretInstruction);
+    EmitFunctionCall(nullptr,
+                     g_settings.gpu_pgxp_enable ? &Thunks::InterpretInstructionPGXP : &Thunks::InterpretInstruction);
   }
 
   m_current_instruction_in_branch_delay_slot_dirty = cbi.is_branch_instruction;
diff --git a/src/core/cpu_recompiler_thunks.h b/src/core/cpu_recompiler_thunks.h
index 9b9316dde..602f522af 100644
--- a/src/core/cpu_recompiler_thunks.h
+++ b/src/core/cpu_recompiler_thunks.h
@@ -13,6 +13,7 @@ namespace Recompiler::Thunks {
 // TODO: Abuse carry flag or something else for exception
 //////////////////////////////////////////////////////////////////////////
 bool InterpretInstruction();
+bool InterpretInstructionPGXP();
 
 // Memory access functions for the JIT - MSB is set on exception.
 u64 ReadMemoryByte(u32 address);
diff --git a/src/core/cpu_types.h b/src/core/cpu_types.h
index 48021af6d..785c746fb 100644
--- a/src/core/cpu_types.h
+++ b/src/core/cpu_types.h
@@ -135,7 +135,6 @@ enum class InstructionFunct : u8
   or_ = 37,
   xor_ = 38,
   nor = 39,
-  sh = 41,
   slt = 42,
   sltu = 43
 };
diff --git a/src/core/host_interface.cpp b/src/core/host_interface.cpp
index e72cb1fa1..26a8634bc 100644
--- a/src/core/host_interface.cpp
+++ b/src/core/host_interface.cpp
@@ -375,6 +375,7 @@ void HostInterface::SetDefaultSettings(SettingsInterface& si)
   si.SetBoolValue("GPU", "PGXPCulling", true);
   si.SetBoolValue("GPU", "PGXPTextureCorrection", true);
   si.SetBoolValue("GPU", "PGXPVertexCache", false);
+  si.SetBoolValue("GPU", "PGXPCPU", false);
 
   si.SetStringValue("Display", "CropMode", Settings::GetDisplayCropModeName(Settings::DEFAULT_DISPLAY_CROP_MODE));
   si.SetStringValue("Display", "AspectRatio",
@@ -438,6 +439,25 @@ void HostInterface::SetDefaultSettings(SettingsInterface& si)
 void HostInterface::LoadSettings(SettingsInterface& si)
 {
   g_settings.Load(si);
+
+  FixIncompatibleSettings();
+}
+
+void HostInterface::FixIncompatibleSettings() // rewrites g_settings in place where two options cannot be combined
+{
+  if (g_settings.gpu_pgxp_enable) // both fix-ups below only matter while PGXP is switched on
+  {
+    if (g_settings.gpu_renderer == GPURenderer::Software)
+    {
+      Log_WarningPrintf("PGXP enabled with software renderer, disabling");
+      g_settings.gpu_pgxp_enable = false; // PGXP is dropped entirely; the CPU-mode check below is skipped
+    }
+    else if (g_settings.gpu_pgxp_cpu && g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler)
+    {
+      Log_WarningPrintf("Recompiler selected with PGXP CPU mode, falling back to cached interpreter");
+      g_settings.cpu_execution_mode = CPUExecutionMode::CachedInterpreter; // PGXP CPU mode stays on, execution mode is downgraded
+    }
+  }
 }
 
 void HostInterface::SaveSettings(SettingsInterface& si)
diff --git a/src/core/host_interface.h b/src/core/host_interface.h
index 2f92313a2..3999cf355 100644
--- a/src/core/host_interface.h
+++ b/src/core/host_interface.h
@@ -134,6 +134,9 @@ protected:
   /// Saves current settings variables to ini.
   virtual void SaveSettings(SettingsInterface& si);
 
+  /// Checks and fixes up any incompatible settings.
+  virtual void FixIncompatibleSettings();
+
   /// Checks for settings changes, std::move() the old settings away for comparing beforehand.
   virtual void CheckForSettingsChanges(const Settings& old_settings);
 
diff --git a/src/core/pgxp.cpp b/src/core/pgxp.cpp
index 7ca7c4f94..0c7de1894 100644
--- a/src/core/pgxp.cpp
+++ b/src/core/pgxp.cpp
@@ -77,6 +77,11 @@ typedef enum
   INVALID_8BIT_STORE = 6
 } PGXP_error_states;
 
+typedef enum
+{
+  VALID_HALF = (1 << 0)
+} PGXP_half_flags;
+
 #define NONE 0
 #define ALL 0xFFFFFFFF
 #define VALID 1
@@ -92,9 +97,15 @@ typedef enum
 static const PGXP_value PGXP_value_invalid_address = {0.f, 0.f, 0.f, {0}, 0, 0, INVALID_ADDRESS, 0, 0};
 static const PGXP_value PGXP_value_zero = {0.f, 0.f, 0.f, {0}, 0, VALID_ALL, 0, 0, 0};
 
+static void MakeValid(PGXP_value* pV, u32 psxV);
 static void Validate(PGXP_value* pV, u32 psxV);
 static void MaskValidate(PGXP_value* pV, u32 psxV, u32 mask, u32 validMask);
 
+static double f16Sign(double in);
+static double f16Unsign(double in);
+static double fu16Trunc(double in);
+static double f16Overflow(double in);
+
 typedef union
 {
   struct
@@ -138,6 +149,20 @@ static PGXP_value* CPU_reg = CPU_reg_mem;
 static PGXP_value* CP0_reg = CP0_reg_mem;
 
 // pgxp_value.c
+void MakeValid(PGXP_value* pV, u32 psxV) // seeds *pV from the raw word psxV unless it already carries both VALID_01 bits
+{
+  psx_value psx;
+  psx.d = psxV;
+  if (VALID_01 != (pV->flags & VALID_01)) // at least one of the VALID_01 bits is missing
+  {
+    pV->x = psx.sw.l; // presumably the signed low halfword of psxV - confirm against psx_value
+    pV->y = psx.sw.h; // presumably the signed high halfword of psxV - confirm against psx_value
+    pV->z = 0.f;
+    pV->flags |= VALID_01;
+    pV->value = psx.d; // remember the integer value this entry now mirrors
+  }
+}
+
 void Validate(PGXP_value* pV, u32 psxV)
 {
   // assume pV is not NULL
@@ -150,6 +175,23 @@ void MaskValidate(PGXP_value* pV, u32 psxV, u32 mask, u32 validMask)
   pV->flags &= ((pV->value & mask) == (psxV & mask)) ? ALL : (ALL ^ (validMask));
 }
 
+double f16Sign(double in) // scales `in` by 2^16, wraps it to 32 bits, and returns the signed result divided by 2^16
+{
+  u32 s = (u32)(s64)(in * (double)((u32)1 << 16)); // go through s64: double -> u32 is undefined for negative or >32-bit values, which callers do pass
+  return ((double)*((s32*)&s)) / (double)((s32)1 << 16);
+}
+double f16Unsign(double in) // non-negative inputs pass through; negative inputs are raised by USHRT_MAX + 1 (65536)
+{
+  return (in >= 0) ? in : ((double)in + (double)USHRT_MAX + 1);
+}
+double f16Overflow(double in) // returns the part of `in` above its low 16 integer bits (integer part shifted right by 16)
+{
+  double out = 0;
+  s64 v = ((s64)in) >> 16; // conversion truncates the fraction before the shift
+  out = (double)v;
+  return out;
+}
+
 // pgxp_mem.c
 static void PGXP_InitMem();
 static PGXP_value Mem[3 * 2048 * 1024 / 4]; // mirror 2MB in 32-bit words * 3
@@ -797,4 +839,1001 @@ void CPU_SW(u32 instr, u32 rtVal, u32 addr)
   WriteMem(&CPU_reg[rt(instr)], addr);
 }
 
+void CPU_ADDI(u32 instr, u32 rtVal, u32 rsVal)
+{
+  // Rt = Rs + Imm (signed). rsVal is checked against the tracked Rs entry; rtVal is the integer result stored as .value.
+  psx_value tempImm;
+  PGXP_value ret;
+
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  ret = CPU_reg[rs(instr)];
+  tempImm.d = imm(instr);
+  tempImm.sd = (tempImm.sd << 16) >> 16; // sign extend
+
+  ret.x = (float)f16Unsign(ret.x); // add the low parts in the unsigned (>= 0) representation
+  ret.x += (float)tempImm.w.l;
+
+  // carry on over/underflow: `of` is +1, -1 or 0 and is folded into y below
+  float of = (ret.x > USHRT_MAX) ? 1.f : (ret.x < 0) ? -1.f : 0.f;
+  ret.x = (float)f16Sign(ret.x); // wrap x back into the signed representation
+  // ret.x -= of * (USHRT_MAX + 1);
+  ret.y += tempImm.sw.h + of;
+
+  // truncate on overflow/underflow: pull y back into [SHRT_MIN, SHRT_MAX] by one 65536 step
+  ret.y += (ret.y > SHRT_MAX) ? -(USHRT_MAX + 1) : (ret.y < SHRT_MIN) ? USHRT_MAX + 1 : 0.f;
+
+  CPU_reg[rt(instr)] = ret;
+  CPU_reg[rt(instr)].value = rtVal;
+}
+
+void CPU_ADDIU(u32 instr, u32 rtVal, u32 rsVal)
+{
+  // Rt = Rs + Imm; tracked exactly like ADDI by forwarding to it (flagged "unsafe?" by the original author)
+  CPU_ADDI(instr, rtVal, rsVal);
+}
+
+void CPU_ANDI(u32 instr, u32 rtVal, u32 rsVal)
+{
+  // Rt = Rs & Imm. rtVal is the integer result; it supplies the low-precision x when the mask is partial.
+  psx_value vRt;
+  PGXP_value ret;
+
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  ret = CPU_reg[rs(instr)];
+
+  vRt.d = rtVal;
+
+  ret.y = 0.f; // remove upper 16-bits
+
+  switch (imm(instr))
+  {
+    case 0:
+      // if 0 then x == 0
+      ret.x = 0.f;
+      break;
+    case 0xFFFF:
+      // if saturated then x == x (the tracked low part of Rs is kept untouched)
+      break;
+    default:
+      // otherwise x is low precision value taken from the integer result
+      ret.x = vRt.sw.l;
+      ret.flags |= VALID_0;
+  }
+
+  ret.flags |= VALID_1; // y was just set to 0.f above, so it is marked valid
+
+  CPU_reg[rt(instr)] = ret;
+  CPU_reg[rt(instr)].value = rtVal;
+}
+
+void CPU_ORI(u32 instr, u32 rtVal, u32 rsVal)
+{
+  // Rt = Rs | Imm. Only x is touched; y is inherited from the tracked Rs entry.
+  psx_value vRt;
+  PGXP_value ret;
+
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  ret = CPU_reg[rs(instr)];
+
+  vRt.d = rtVal;
+
+  switch (imm(instr))
+  {
+    case 0:
+      // if 0 then x == x (the tracked value is carried over unchanged)
+      break;
+    default:
+      // otherwise x is low precision value taken from the integer result
+      ret.x = vRt.sw.l;
+      ret.flags |= VALID_0;
+  }
+
+  ret.value = rtVal;
+  CPU_reg[rt(instr)] = ret;
+}
+
+void CPU_XORI(u32 instr, u32 rtVal, u32 rsVal)
+{
+  // Rt = Rs ^ Imm. Same tracking as CPU_ORI: only x is touched, y is inherited from Rs.
+  psx_value vRt;
+  PGXP_value ret;
+
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  ret = CPU_reg[rs(instr)];
+
+  vRt.d = rtVal;
+
+  switch (imm(instr))
+  {
+    case 0:
+      // if 0 then x == x (the tracked value is carried over unchanged)
+      break;
+    default:
+      // otherwise x is low precision value taken from the integer result
+      ret.x = vRt.sw.l;
+      ret.flags |= VALID_0;
+  }
+
+  ret.value = rtVal;
+  CPU_reg[rt(instr)] = ret;
+}
+
+void CPU_SLTI(u32 instr, u32 rtVal, u32 rsVal)
+{
+  // Rt = Rs < Imm (signed). NOTE(review): only the x component of Rs is compared; y is ignored - confirm intended.
+  psx_value tempImm;
+  PGXP_value ret;
+
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  ret = CPU_reg[rs(instr)];
+
+  tempImm.w.h = imm(instr); // 16-bit scratch slot: written unsigned here, read back signed via sw.h below
+  ret.y = 0.f;
+  ret.x = (CPU_reg[rs(instr)].x < tempImm.sw.h) ? 1.f : 0.f;
+  ret.flags |= VALID_1; // y is a known 0.f
+  ret.value = rtVal;
+
+  CPU_reg[rt(instr)] = ret;
+}
+
+void CPU_SLTIU(u32 instr, u32 rtVal, u32 rsVal)
+{
+  // Rt = Rs < Imm (Unsigned). NOTE(review): only the x component of Rs is compared; y is ignored - confirm intended.
+  psx_value tempImm;
+  PGXP_value ret;
+
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  ret = CPU_reg[rs(instr)];
+
+  tempImm.w.h = imm(instr); // 16-bit scratch slot holding the immediate, read back unsigned below
+  ret.y = 0.f;
+  ret.x = (f16Unsign(CPU_reg[rs(instr)].x) < tempImm.w.h) ? 1.f : 0.f;
+  ret.flags |= VALID_1; // y is a known 0.f
+  ret.value = rtVal;
+
+  CPU_reg[rt(instr)] = ret;
+}
+
+////////////////////////////////////
+// Load Upper
+////////////////////////////////////
+void CPU_LUI(u32 instr, u32 rtVal)
+{
+  // Rt = Imm << 16: start from PGXP_value_zero, then put the sign-interpreted immediate into y
+  CPU_reg[rt(instr)] = PGXP_value_zero;
+  CPU_reg[rt(instr)].y = (float)(s16)imm(instr);
+  CPU_reg[rt(instr)].hFlags = VALID_HALF;
+  CPU_reg[rt(instr)].value = rtVal;
+  CPU_reg[rt(instr)].flags = VALID_01; // overwrites (not ORs) whatever flags PGXP_value_zero carried
+}
+
+////////////////////////////////////
+// Register Arithmetic
+////////////////////////////////////
+
+void CPU_ADD(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal)
+{
+  // Rd = Rs + Rt (signed). rsVal/rtVal validate the tracked operands; rdVal is the integer result stored as .value.
+  PGXP_value ret;
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  Validate(&CPU_reg[rt(instr)], rtVal);
+
+  // iCB: Only require one valid input - when exactly one operand lacks VALID_01 it is seeded from its integer value
+  if (((CPU_reg[rt(instr)].flags & VALID_01) != VALID_01) != ((CPU_reg[rs(instr)].flags & VALID_01) != VALID_01))
+  {
+    MakeValid(&CPU_reg[rs(instr)], rsVal);
+    MakeValid(&CPU_reg[rt(instr)], rtVal);
+  }
+
+  ret = CPU_reg[rs(instr)];
+
+  ret.x = (float)f16Unsign(ret.x); // add the low parts in the unsigned (>= 0) representation
+  ret.x += (float)f16Unsign(CPU_reg[rt(instr)].x);
+
+  // carry on over/underflow: `of` is +1, -1 or 0 and is folded into y below
+  float of = (ret.x > USHRT_MAX) ? 1.f : (ret.x < 0) ? -1.f : 0.f;
+  ret.x = (float)f16Sign(ret.x);
+  // ret.x -= of * (USHRT_MAX + 1);
+  ret.y += CPU_reg[rt(instr)].y + of;
+
+  // truncate on overflow/underflow: pull y back into [SHRT_MIN, SHRT_MAX] by one 65536 step
+  ret.y += (ret.y > SHRT_MAX) ? -(USHRT_MAX + 1) : (ret.y < SHRT_MIN) ? USHRT_MAX + 1 : 0.f;
+
+  // TODO: decide which "z/w" component to use
+
+  ret.halfFlags[0] &= CPU_reg[rt(instr)].halfFlags[0]; // flags in halfFlags[0] survive only if set on both operands
+  ret.gFlags |= CPU_reg[rt(instr)].gFlags;
+  ret.lFlags |= CPU_reg[rt(instr)].lFlags;
+  ret.hFlags |= CPU_reg[rt(instr)].hFlags;
+
+  ret.value = rdVal;
+
+  CPU_reg[rd(instr)] = ret;
+}
+
+void CPU_ADDU(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal)
+{
+  // Rd = Rs + Rt; tracked exactly like ADD by forwarding to it (flagged "unsafe?" by the original author)
+  CPU_ADD(instr, rdVal, rsVal, rtVal);
+}
+
+void CPU_SUB(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal)
+{
+  // Rd = Rs - Rt (signed). rsVal/rtVal validate the tracked operands; rdVal is the integer result stored as .value.
+  PGXP_value ret;
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  Validate(&CPU_reg[rt(instr)], rtVal);
+
+  // iCB: Only require one valid input - when exactly one operand lacks VALID_01 it is seeded from its integer value
+  if (((CPU_reg[rt(instr)].flags & VALID_01) != VALID_01) != ((CPU_reg[rs(instr)].flags & VALID_01) != VALID_01))
+  {
+    MakeValid(&CPU_reg[rs(instr)], rsVal);
+    MakeValid(&CPU_reg[rt(instr)], rtVal);
+  }
+
+  ret = CPU_reg[rs(instr)];
+
+  ret.x = (float)f16Unsign(ret.x); // subtract the low parts in the unsigned (>= 0) representation
+  ret.x -= (float)f16Unsign(CPU_reg[rt(instr)].x);
+
+  // carry on over/underflow: a negative difference yields of == -1, i.e. a borrow out of y
+  float of = (ret.x > USHRT_MAX) ? 1.f : (ret.x < 0) ? -1.f : 0.f;
+  ret.x = (float)f16Sign(ret.x);
+  // ret.x -= of * (USHRT_MAX + 1);
+  ret.y -= CPU_reg[rt(instr)].y - of; // equivalent to y = y - Rt.y + of
+
+  // truncate on overflow/underflow: pull y back into [SHRT_MIN, SHRT_MAX] by one 65536 step
+  ret.y += (ret.y > SHRT_MAX) ? -(USHRT_MAX + 1) : (ret.y < SHRT_MIN) ? USHRT_MAX + 1 : 0.f;
+
+  ret.halfFlags[0] &= CPU_reg[rt(instr)].halfFlags[0]; // flags in halfFlags[0] survive only if set on both operands
+  ret.gFlags |= CPU_reg[rt(instr)].gFlags;
+  ret.lFlags |= CPU_reg[rt(instr)].lFlags;
+  ret.hFlags |= CPU_reg[rt(instr)].hFlags;
+
+  ret.value = rdVal;
+
+  CPU_reg[rd(instr)] = ret;
+}
+
+void CPU_SUBU(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal)
+{
+  // Rd = Rs - Rt; tracked exactly like SUB by forwarding to it (flagged "unsafe?" by the original author)
+  CPU_SUB(instr, rdVal, rsVal, rtVal);
+}
+
+void CPU_AND(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal)
+{
+  // Rd = Rs & Rt. Each half of the integer result rdVal is matched against the operands to pick a tracked value.
+  psx_value vald, vals, valt;
+  PGXP_value ret = {}; // value-initialised: z, compFlags[2] and other members are not assigned on every path below
+
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  Validate(&CPU_reg[rt(instr)], rtVal);
+
+  // iCB: Only require one valid input - when exactly one operand lacks VALID_01 it is seeded from its integer value
+  if (((CPU_reg[rt(instr)].flags & VALID_01) != VALID_01) != ((CPU_reg[rs(instr)].flags & VALID_01) != VALID_01))
+  {
+    MakeValid(&CPU_reg[rs(instr)], rsVal);
+    MakeValid(&CPU_reg[rt(instr)], rtVal);
+  }
+
+  vald.d = rdVal;
+  vals.d = rsVal;
+  valt.d = rtVal;
+
+  //	CPU_reg[rd(instr)].valid = CPU_reg[rs(instr)].valid && CPU_reg[rt(instr)].valid;
+  ret.flags = VALID_01;
+
+  if (vald.w.l == 0) // low half: result is zero
+  {
+    ret.x = 0.f;
+    ret.lFlags = VALID_HALF;
+  }
+  else if (vald.w.l == vals.w.l) // low half: result equals Rs, reuse its tracked x
+  {
+    ret.x = CPU_reg[rs(instr)].x;
+    ret.lFlags = CPU_reg[rs(instr)].lFlags;
+    ret.compFlags[0] = CPU_reg[rs(instr)].compFlags[0];
+  }
+  else if (vald.w.l == valt.w.l) // low half: result equals Rt, reuse its tracked x
+  {
+    ret.x = CPU_reg[rt(instr)].x;
+    ret.lFlags = CPU_reg[rt(instr)].lFlags;
+    ret.compFlags[0] = CPU_reg[rt(instr)].compFlags[0];
+  }
+  else // low half: genuinely mixed bits, fall back to the integer value
+  {
+    ret.x = (float)vald.sw.l;
+    ret.compFlags[0] = VALID;
+    ret.lFlags = 0;
+  }
+
+  if (vald.w.h == 0) // high half: result is zero
+  {
+    ret.y = 0.f;
+    ret.hFlags = VALID_HALF;
+  }
+  else if (vald.w.h == vals.w.h) // high half: result equals Rs, reuse its tracked y
+  {
+    ret.y = CPU_reg[rs(instr)].y;
+    ret.hFlags = CPU_reg[rs(instr)].hFlags;
+    ret.compFlags[1] &= CPU_reg[rs(instr)].compFlags[1];
+  }
+  else if (vald.w.h == valt.w.h) // high half: result equals Rt, reuse its tracked y
+  {
+    ret.y = CPU_reg[rt(instr)].y;
+    ret.hFlags = CPU_reg[rt(instr)].hFlags;
+    ret.compFlags[1] &= CPU_reg[rt(instr)].compFlags[1];
+  }
+  else // high half: genuinely mixed bits, fall back to the integer value
+  {
+    ret.y = (float)vald.sw.h;
+    ret.compFlags[1] = VALID;
+    ret.hFlags = 0;
+  }
+
+  // iCB Hack: Force validity if even one half is valid
+  // if ((ret.hFlags & VALID_HALF) || (ret.lFlags & VALID_HALF))
+  //	ret.valid = 1;
+  // /iCB Hack
+
+  // Get a valid W: prefer Rs, then Rt; if neither has VALID_2 set, z keeps its zero initial value
+  if ((CPU_reg[rs(instr)].flags & VALID_2) == VALID_2)
+  {
+    ret.z = CPU_reg[rs(instr)].z;
+    ret.compFlags[2] = CPU_reg[rs(instr)].compFlags[2];
+  }
+  else if ((CPU_reg[rt(instr)].flags & VALID_2) == VALID_2)
+  {
+    ret.z = CPU_reg[rt(instr)].z;
+    ret.compFlags[2] = CPU_reg[rt(instr)].compFlags[2];
+  }
+
+  ret.value = rdVal;
+  CPU_reg[rd(instr)] = ret;
+}
+
+void CPU_OR(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal)
+{
+  // Rd = Rs | Rt; forwards to CPU_AND, which derives the tracked value from the integer result rdVal
+  CPU_AND(instr, rdVal, rsVal, rtVal);
+}
+
+void CPU_XOR(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal)
+{
+  // Rd = Rs ^ Rt; forwards to CPU_AND, which derives the tracked value from the integer result rdVal
+  CPU_AND(instr, rdVal, rsVal, rtVal);
+}
+
+void CPU_NOR(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal)
+{
+  // Rd = Rs NOR Rt; forwards to CPU_AND, which derives the tracked value from the integer result rdVal
+  CPU_AND(instr, rdVal, rsVal, rtVal);
+}
+
+void CPU_SLT(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal)
+{
+  // Rd = Rs < Rt (signed): compare the y components first, and the x components only when the y components are equal
+  PGXP_value ret;
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  Validate(&CPU_reg[rt(instr)], rtVal);
+
+  // iCB: Only require one valid input - when exactly one operand lacks VALID_01 it is seeded from its integer value
+  if (((CPU_reg[rt(instr)].flags & VALID_01) != VALID_01) != ((CPU_reg[rs(instr)].flags & VALID_01) != VALID_01))
+  {
+    MakeValid(&CPU_reg[rs(instr)], rsVal);
+    MakeValid(&CPU_reg[rt(instr)], rtVal);
+  }
+
+  ret = CPU_reg[rs(instr)];
+  ret.y = 0.f;
+  ret.compFlags[1] = VALID;
+
+  ret.x = (CPU_reg[rs(instr)].y < CPU_reg[rt(instr)].y) ? 1.f :
+          (CPU_reg[rs(instr)].y > CPU_reg[rt(instr)].y) ? 0.f : // a larger y must not fall through to the x compare
+          (f16Unsign(CPU_reg[rs(instr)].x) < f16Unsign(CPU_reg[rt(instr)].x)) ? 1.f : 0.f;
+
+  ret.value = rdVal;
+  CPU_reg[rd(instr)] = ret;
+}
+
+void CPU_SLTU(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal)
+{
+  // Rd = Rs < Rt (unsigned): compare the unsigned y components first, and the x components only when those are equal
+  PGXP_value ret;
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  Validate(&CPU_reg[rt(instr)], rtVal);
+
+  // iCB: Only require one valid input - when exactly one operand lacks VALID_01 it is seeded from its integer value
+  if (((CPU_reg[rt(instr)].flags & VALID_01) != VALID_01) != ((CPU_reg[rs(instr)].flags & VALID_01) != VALID_01))
+  {
+    MakeValid(&CPU_reg[rs(instr)], rsVal);
+    MakeValid(&CPU_reg[rt(instr)], rtVal);
+  }
+
+  ret = CPU_reg[rs(instr)];
+  ret.y = 0.f;
+  ret.compFlags[1] = VALID;
+
+  ret.x = (f16Unsign(CPU_reg[rs(instr)].y) < f16Unsign(CPU_reg[rt(instr)].y)) ? 1.f :
+          (f16Unsign(CPU_reg[rs(instr)].y) > f16Unsign(CPU_reg[rt(instr)].y)) ? 0.f : // larger y must not fall through
+          (f16Unsign(CPU_reg[rs(instr)].x) < f16Unsign(CPU_reg[rt(instr)].x)) ? 1.f : 0.f;
+
+  ret.value = rdVal;
+  CPU_reg[rd(instr)] = ret;
+}
+
+////////////////////////////////////
+// Register mult/div
+////////////////////////////////////
+
+void CPU_MULT(u32 instr, u32 hiVal, u32 loVal, u32 rsVal, u32 rtVal)
+{
+  // Hi/Lo = Rs * Rt (signed). hiVal/loVal are the integer results stored as .value on CPU_Hi/CPU_Lo.
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  Validate(&CPU_reg[rt(instr)], rtVal);
+
+  // iCB: Only require one valid input - when exactly one operand lacks VALID_01 it is seeded from its integer value
+  if (((CPU_reg[rt(instr)].flags & VALID_01) != VALID_01) != ((CPU_reg[rs(instr)].flags & VALID_01) != VALID_01))
+  {
+    MakeValid(&CPU_reg[rs(instr)], rsVal);
+    MakeValid(&CPU_reg[rt(instr)], rtVal);
+  }
+
+  CPU_Lo = CPU_Hi = CPU_reg[rs(instr)]; // both outputs start as a copy of the tracked Rs entry
+
+  CPU_Lo.halfFlags[0] = CPU_Hi.halfFlags[0] = (CPU_reg[rs(instr)].halfFlags[0] & CPU_reg[rt(instr)].halfFlags[0]);
+
+  double xx, xy, yx, yy;
+  double lx = 0, ly = 0, hx = 0, hy = 0;
+
+  // Multiply out components: x is taken unsigned, y is taken as-is (signed) for the signed multiply
+  xx = f16Unsign(CPU_reg[rs(instr)].x) * f16Unsign(CPU_reg[rt(instr)].x);
+  xy = f16Unsign(CPU_reg[rs(instr)].x) * (CPU_reg[rt(instr)].y);
+  yx = (CPU_reg[rs(instr)].y) * f16Unsign(CPU_reg[rt(instr)].x);
+  yy = (CPU_reg[rs(instr)].y) * (CPU_reg[rt(instr)].y);
+
+  // Split values into outputs: each stage adds the part of the previous one that f16Overflow reports
+  lx = xx;
+
+  ly = f16Overflow(xx);
+  ly += xy + yx;
+
+  hx = f16Overflow(ly);
+  hx += yy;
+
+  hy = f16Overflow(hx);
+
+  CPU_Lo.x = (float)f16Sign(lx); // f16Sign wraps each partial sum back into the signed 16-bit range
+  CPU_Lo.y = (float)f16Sign(ly);
+  CPU_Hi.x = (float)f16Sign(hx);
+  CPU_Hi.y = (float)f16Sign(hy);
+
+  CPU_Lo.value = loVal;
+  CPU_Hi.value = hiVal;
+}
+
+void CPU_MULTU(u32 instr, u32 hiVal, u32 loVal, u32 rsVal, u32 rtVal)
+{
+  // Hi/Lo = Rs * Rt (unsigned). hiVal/loVal are the integer results stored as .value on CPU_Hi/CPU_Lo.
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  Validate(&CPU_reg[rt(instr)], rtVal);
+
+  // iCB: Only require one valid input - when exactly one operand lacks VALID_01 it is seeded from its integer value
+  if (((CPU_reg[rt(instr)].flags & VALID_01) != VALID_01) != ((CPU_reg[rs(instr)].flags & VALID_01) != VALID_01))
+  {
+    MakeValid(&CPU_reg[rs(instr)], rsVal);
+    MakeValid(&CPU_reg[rt(instr)], rtVal);
+  }
+
+  CPU_Lo = CPU_Hi = CPU_reg[rs(instr)]; // both outputs start as a copy of the tracked Rs entry
+
+  CPU_Lo.halfFlags[0] = CPU_Hi.halfFlags[0] = (CPU_reg[rs(instr)].halfFlags[0] & CPU_reg[rt(instr)].halfFlags[0]);
+
+  double xx, xy, yx, yy;
+  double lx = 0, ly = 0, hx = 0, hy = 0;
+
+  // Multiply out components: unlike CPU_MULT, y is also passed through f16Unsign
+  xx = f16Unsign(CPU_reg[rs(instr)].x) * f16Unsign(CPU_reg[rt(instr)].x);
+  xy = f16Unsign(CPU_reg[rs(instr)].x) * f16Unsign(CPU_reg[rt(instr)].y);
+  yx = f16Unsign(CPU_reg[rs(instr)].y) * f16Unsign(CPU_reg[rt(instr)].x);
+  yy = f16Unsign(CPU_reg[rs(instr)].y) * f16Unsign(CPU_reg[rt(instr)].y);
+
+  // Split values into outputs: each stage adds the part of the previous one that f16Overflow reports
+  lx = xx;
+
+  ly = f16Overflow(xx);
+  ly += xy + yx;
+
+  hx = f16Overflow(ly);
+  hx += yy;
+
+  hy = f16Overflow(hx);
+
+  CPU_Lo.x = (float)f16Sign(lx); // f16Sign wraps each partial sum back into the signed 16-bit range
+  CPU_Lo.y = (float)f16Sign(ly);
+  CPU_Hi.x = (float)f16Sign(hx);
+  CPU_Hi.y = (float)f16Sign(hy);
+
+  CPU_Lo.value = loVal;
+  CPU_Hi.value = hiVal;
+}
+
+void CPU_DIV(u32 instr, u32 hiVal, u32 loVal, u32 rsVal, u32 rtVal)
+{
+  // Lo = Rs / Rt (signed)
+  // Hi = Rs % Rt (signed)
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  Validate(&CPU_reg[rt(instr)], rtVal);
+
+  //// iCB: Only require one valid input - when exactly one operand lacks VALID_01 it is seeded from its integer value
+  if (((CPU_reg[rt(instr)].flags & VALID_01) != VALID_01) != ((CPU_reg[rs(instr)].flags & VALID_01) != VALID_01))
+  {
+    MakeValid(&CPU_reg[rs(instr)], rsVal);
+    MakeValid(&CPU_reg[rt(instr)], rtVal);
+  }
+
+  CPU_Lo = CPU_Hi = CPU_reg[rs(instr)]; // both outputs start as a copy of the tracked Rs entry
+
+  CPU_Lo.halfFlags[0] = CPU_Hi.halfFlags[0] = (CPU_reg[rs(instr)].halfFlags[0] & CPU_reg[rt(instr)].halfFlags[0]);
+
+  double vs = f16Unsign(CPU_reg[rs(instr)].x) + (CPU_reg[rs(instr)].y) * (double)(1 << 16); // recombine as x + y * 65536
+  double vt = f16Unsign(CPU_reg[rt(instr)].x) + (CPU_reg[rt(instr)].y) * (double)(1 << 16);
+
+  double lo = vs / vt; // NOTE(review): vt == 0 is not guarded; lo/hi are then inf/NaN before the integer casts below
+  CPU_Lo.y = (float)f16Sign(f16Overflow(lo));
+  CPU_Lo.x = (float)f16Sign(lo);
+
+  double hi = fmod(vs, vt);
+  CPU_Hi.y = (float)f16Sign(f16Overflow(hi));
+  CPU_Hi.x = (float)f16Sign(hi);
+
+  CPU_Lo.value = loVal;
+  CPU_Hi.value = hiVal;
+}
+
+void CPU_DIVU(u32 instr, u32 hiVal, u32 loVal, u32 rsVal, u32 rtVal)
+{
+  // Lo = Rs / Rt (unsigned)
+  // Hi = Rs % Rt (unsigned)
+  Validate(&CPU_reg[rs(instr)], rsVal);
+  Validate(&CPU_reg[rt(instr)], rtVal);
+
+  //// iCB: Only require one valid input - when exactly one operand lacks VALID_01 it is seeded from its integer value
+  if (((CPU_reg[rt(instr)].flags & VALID_01) != VALID_01) != ((CPU_reg[rs(instr)].flags & VALID_01) != VALID_01))
+  {
+    MakeValid(&CPU_reg[rs(instr)], rsVal);
+    MakeValid(&CPU_reg[rt(instr)], rtVal);
+  }
+
+  CPU_Lo = CPU_Hi = CPU_reg[rs(instr)]; // both outputs start as a copy of the tracked Rs entry
+
+  CPU_Lo.halfFlags[0] = CPU_Hi.halfFlags[0] = (CPU_reg[rs(instr)].halfFlags[0] & CPU_reg[rt(instr)].halfFlags[0]);
+
+  double vs = f16Unsign(CPU_reg[rs(instr)].x) + f16Unsign(CPU_reg[rs(instr)].y) * (double)(1 << 16); // x + y * 65536, y unsigned
+  double vt = f16Unsign(CPU_reg[rt(instr)].x) + f16Unsign(CPU_reg[rt(instr)].y) * (double)(1 << 16);
+
+  double lo = vs / vt; // NOTE(review): vt == 0 is not guarded; lo/hi are then inf/NaN before the integer casts below
+  CPU_Lo.y = (float)f16Sign(f16Overflow(lo));
+  CPU_Lo.x = (float)f16Sign(lo);
+
+  double hi = fmod(vs, vt);
+  CPU_Hi.y = (float)f16Sign(f16Overflow(hi));
+  CPU_Hi.x = (float)f16Sign(hi);
+
+  CPU_Lo.value = loVal;
+  CPU_Hi.value = hiVal;
+}
+
+////////////////////////////////////
+// Shift operations (sa)
+////////////////////////////////////
+void CPU_SLL(u32 instr, u32 rdVal, u32 rtVal)
+{
+  // Rd = Rt << Sa. rtVal validates the tracked Rt entry; rdVal is the integer result stored as .value.
+  PGXP_value ret;
+  u32 sh = sa(instr); // presumably the 5-bit shift-amount field, which would make the sh >= 32 branch unreachable - confirm
+  Validate(&CPU_reg[rt(instr)], rtVal);
+
+  ret = CPU_reg[rt(instr)];
+
+  // TODO: Shift flags
+  double x = f16Unsign(CPU_reg[rt(instr)].x);
+  double y = f16Unsign(CPU_reg[rt(instr)].y);
+  if (sh >= 32)
+  {
+    x = 0.f;
+    y = 0.f;
+  }
+  else if (sh == 16) // x moves up into y unchanged
+  {
+    y = f16Sign(x);
+    x = 0.f;
+  }
+  else if (sh >= 16) // 17..31 here: x moves into y and is scaled by the remaining shift
+  {
+    y = x * (1 << (sh - 16));
+    y = f16Sign(y);
+    x = 0.f;
+  }
+  else // below 16: scale both parts and carry what leaves x into y
+  {
+    x = x * (1 << sh);
+    y = y * (1 << sh);
+    y += f16Overflow(x);
+    x = f16Sign(x);
+    y = f16Sign(y);
+  }
+
+  ret.x = (float)x;
+  ret.y = (float)y;
+
+  ret.value = rdVal;
+  CPU_reg[rd(instr)] = ret;
+}
+
+void CPU_SRL(u32 instr, u32 rdVal, u32 rtVal)
+{
+  // Rd = Rt >> Sa (the test value dY is shifted through the .d member, unlike CPU_SRA which uses .sd)
+  PGXP_value ret;
+  u32 sh = sa(instr);
+  Validate(&CPU_reg[rt(instr)], rtVal);
+
+  ret = CPU_reg[rt(instr)];
+
+  double x = CPU_reg[rt(instr)].x, y = f16Unsign(CPU_reg[rt(instr)].y);
+
+  psx_value iX; // integer test value for the low part
+  iX.d = rtVal;
+  psx_value iY; // integer test value for the high part
+  iY.d = rtVal;
+
+  iX.sd = (iX.sd << 16) >> 16; // remove Y
+  iY.sw.l = iX.sw.h;           // overwrite x with sign(x)
+
+  // Shift test values: the integer shifts decide which floating-point path applies below
+  psx_value dX;
+  dX.sd = iX.sd >> sh;
+  psx_value dY;
+  dY.d = iY.d >> sh;
+
+  if (dX.sw.l != iX.sw.h)
+    x = x / (1 << sh);
+  else
+    x = dX.sw.l; // only sign bits left
+
+  if (dY.sw.l != iX.sw.h) // the shifted high-part test value left something other than sign(x) in the low half
+  {
+    if (sh == 16)
+    {
+      x = y;
+    }
+    else if (sh < 16)
+    {
+      x += y * (1 << (16 - sh));
+      if (CPU_reg[rt(instr)].x < 0)
+        x += 1 << (16 - sh);
+    }
+    else
+    {
+      x += y / (1 << (sh - 16));
+    }
+  }
+
+  if ((dY.sw.h == 0) || (dY.sw.h == -1)) // high half of the shifted test value is all zeros or all ones: use it directly
+    y = dY.sw.h;
+  else
+    y = y / (1 << sh);
+
+  x = f16Sign(x);
+  y = f16Sign(y);
+
+  ret.x = (float)x;
+  ret.y = (float)y;
+
+  ret.value = rdVal;
+  CPU_reg[rd(instr)] = ret;
+}
+
+void CPU_SRA(u32 instr, u32 rdVal, u32 rtVal)
+{
+  // Rd = Rt >> Sa (the test value dY is shifted through the signed .sd member, unlike CPU_SRL which uses .d)
+  PGXP_value ret;
+  u32 sh = sa(instr);
+  Validate(&CPU_reg[rt(instr)], rtVal);
+  ret = CPU_reg[rt(instr)];
+
+  double x = CPU_reg[rt(instr)].x, y = CPU_reg[rt(instr)].y;
+
+  psx_value iX; // integer test value for the low part
+  iX.d = rtVal;
+  psx_value iY; // integer test value for the high part
+  iY.d = rtVal;
+
+  iX.sd = (iX.sd << 16) >> 16; // remove Y
+  iY.sw.l = iX.sw.h;           // overwrite x with sign(x)
+
+  // Shift test values: the integer shifts decide which floating-point path applies below
+  psx_value dX;
+  dX.sd = iX.sd >> sh;
+  psx_value dY;
+  dY.sd = iY.sd >> sh;
+
+  if (dX.sw.l != iX.sw.h)
+    x = x / (1 << sh);
+  else
+    x = dX.sw.l; // only sign bits left
+
+  if (dY.sw.l != iX.sw.h) // the shifted high-part test value left something other than sign(x) in the low half
+  {
+    if (sh == 16)
+    {
+      x = y;
+    }
+    else if (sh < 16)
+    {
+      x += y * (1 << (16 - sh));
+      if (CPU_reg[rt(instr)].x < 0)
+        x += 1 << (16 - sh);
+    }
+    else
+    {
+      x += y / (1 << (sh - 16));
+    }
+  }
+
+  if ((dY.sw.h == 0) || (dY.sw.h == -1)) // high half of the shifted test value is all zeros or all ones: use it directly
+    y = dY.sw.h;
+  else
+    y = y / (1 << sh);
+
+  x = f16Sign(x);
+  y = f16Sign(y);
+
+  ret.x = (float)x;
+  ret.y = (float)y;
+
+  ret.value = rdVal;
+  CPU_reg[rd(instr)] = ret;
+}
+
+////////////////////////////////////
+// Shift operations variable
+////////////////////////////////////
+void CPU_SLLV(u32 instr, u32 rdVal, u32 rtVal, u32 rsVal)
+{
+  // Rd = Rt << Rs (shift amount is the low 5 bits of rsVal)
+  PGXP_value ret;
+  u32 sh = rsVal & 0x1F; // always 0..31, so the sh >= 32 branch below can never be taken
+  Validate(&CPU_reg[rt(instr)], rtVal);
+  Validate(&CPU_reg[rs(instr)], rsVal);
+
+  ret = CPU_reg[rt(instr)];
+
+  double x = f16Unsign(CPU_reg[rt(instr)].x);
+  double y = f16Unsign(CPU_reg[rt(instr)].y);
+  if (sh >= 32)
+  {
+    x = 0.f;
+    y = 0.f;
+  }
+  else if (sh == 16) // x moves up into y unchanged
+  {
+    y = f16Sign(x);
+    x = 0.f;
+  }
+  else if (sh >= 16) // 17..31 here: x moves into y and is scaled by the remaining shift
+  {
+    y = x * (1 << (sh - 16));
+    y = f16Sign(y);
+    x = 0.f;
+  }
+  else // below 16: scale both parts and carry what leaves x into y
+  {
+    x = x * (1 << sh);
+    y = y * (1 << sh);
+    y += f16Overflow(x);
+    x = f16Sign(x);
+    y = f16Sign(y);
+  }
+
+  ret.x = (float)x;
+  ret.y = (float)y;
+
+  ret.value = rdVal;
+  CPU_reg[rd(instr)] = ret;
+}
+
+void CPU_SRLV(u32 instr, u32 rdVal, u32 rtVal, u32 rsVal)
+{
+  // Rd = Rt >> Rs (shift amount is the low 5 bits of rsVal; dY is shifted through the .d member)
+  PGXP_value ret;
+  u32 sh = rsVal & 0x1F;
+  Validate(&CPU_reg[rt(instr)], rtVal);
+  Validate(&CPU_reg[rs(instr)], rsVal);
+
+  ret = CPU_reg[rt(instr)];
+
+  double x = CPU_reg[rt(instr)].x, y = f16Unsign(CPU_reg[rt(instr)].y);
+
+  psx_value iX; // integer test value for the low part
+  iX.d = rtVal;
+  psx_value iY; // integer test value for the high part
+  iY.d = rtVal;
+
+  iX.sd = (iX.sd << 16) >> 16; // remove Y
+  iY.sw.l = iX.sw.h;           // overwrite x with sign(x)
+
+  // Shift test values: the integer shifts decide which floating-point path applies below
+  psx_value dX;
+  dX.sd = iX.sd >> sh;
+  psx_value dY;
+  dY.d = iY.d >> sh;
+
+  if (dX.sw.l != iX.sw.h)
+    x = x / (1 << sh);
+  else
+    x = dX.sw.l; // only sign bits left
+
+  if (dY.sw.l != iX.sw.h) // the shifted high-part test value left something other than sign(x) in the low half
+  {
+    if (sh == 16)
+    {
+      x = y;
+    }
+    else if (sh < 16)
+    {
+      x += y * (1 << (16 - sh));
+      if (CPU_reg[rt(instr)].x < 0)
+        x += 1 << (16 - sh);
+    }
+    else
+    {
+      x += y / (1 << (sh - 16));
+    }
+  }
+
+  if ((dY.sw.h == 0) || (dY.sw.h == -1)) // high half of the shifted test value is all zeros or all ones: use it directly
+    y = dY.sw.h;
+  else
+    y = y / (1 << sh);
+
+  x = f16Sign(x);
+  y = f16Sign(y);
+
+  ret.x = (float)x;
+  ret.y = (float)y;
+
+  ret.value = rdVal;
+  CPU_reg[rd(instr)] = ret;
+}
+
+void CPU_SRAV(u32 instr, u32 rdVal, u32 rtVal, u32 rsVal)
+{
+  // Rd = Rt >> Rs (shift amount is the low 5 bits of rsVal; dY is shifted through the signed .sd member)
+  PGXP_value ret;
+  u32 sh = rsVal & 0x1F;
+  Validate(&CPU_reg[rt(instr)], rtVal);
+  Validate(&CPU_reg[rs(instr)], rsVal);
+
+  ret = CPU_reg[rt(instr)];
+
+  double x = CPU_reg[rt(instr)].x, y = CPU_reg[rt(instr)].y;
+
+  psx_value iX; // integer test value for the low part
+  iX.d = rtVal;
+  psx_value iY; // integer test value for the high part
+  iY.d = rtVal;
+
+  iX.sd = (iX.sd << 16) >> 16; // remove Y
+  iY.sw.l = iX.sw.h;           // overwrite x with sign(x)
+
+  // Shift test values: the integer shifts decide which floating-point path applies below
+  psx_value dX;
+  dX.sd = iX.sd >> sh;
+  psx_value dY;
+  dY.sd = iY.sd >> sh;
+
+  if (dX.sw.l != iX.sw.h)
+    x = x / (1 << sh);
+  else
+    x = dX.sw.l; // only sign bits left
+
+  if (dY.sw.l != iX.sw.h) // the shifted high-part test value left something other than sign(x) in the low half
+  {
+    if (sh == 16)
+    {
+      x = y;
+    }
+    else if (sh < 16)
+    {
+      x += y * (1 << (16 - sh));
+      if (CPU_reg[rt(instr)].x < 0)
+        x += 1 << (16 - sh);
+    }
+    else
+    {
+      x += y / (1 << (sh - 16));
+    }
+  }
+
+  if ((dY.sw.h == 0) || (dY.sw.h == -1)) // high half of the shifted test value is all zeros or all ones: use it directly
+    y = dY.sw.h;
+  else
+    y = y / (1 << sh);
+
+  x = f16Sign(x);
+  y = f16Sign(y);
+
+  ret.x = (float)x;
+  ret.y = (float)y;
+
+  ret.value = rdVal;
+  CPU_reg[rd(instr)] = ret;
+}
+
+void CPU_MFHI(u32 instr, u32 rdVal, u32 hiVal)
+{
+  // Rd = Hi: CPU_Hi is validated against hiVal, then copied whole; rdVal is not used here
+  Validate(&CPU_Hi, hiVal);
+
+  CPU_reg[rd(instr)] = CPU_Hi;
+}
+
+void CPU_MTHI(u32 instr, u32 hiVal, u32 rdVal)
+{
+  // Hi = Rs: MIPS MTHI encodes its source GPR in the rs field. rdVal (name kept for callers) is that register's value.
+  Validate(&CPU_reg[rs(instr)], rdVal);
+
+  CPU_Hi = CPU_reg[rs(instr)];
+}
+
+void CPU_MFLO(u32 instr, u32 rdVal, u32 loVal)
+{
+  // Rd = Lo: CPU_Lo is validated against loVal, then copied whole; rdVal is not used here
+  Validate(&CPU_Lo, loVal);
+
+  CPU_reg[rd(instr)] = CPU_Lo;
+}
+
+void CPU_MTLO(u32 instr, u32 loVal, u32 rdVal)
+{
+  // Lo = Rs: MIPS MTLO encodes its source GPR in the rs field. rdVal (name kept for callers) is that register's value.
+  Validate(&CPU_reg[rs(instr)], rdVal);
+
+  CPU_Lo = CPU_reg[rs(instr)];
+}
+
+void CPU_MFC0(u32 instr, u32 rtVal, u32 rdVal)
+{
+  // CPU[Rt] = CP0[Rd]: validate the CP0 entry against rdVal, copy it, then stamp the integer result rtVal
+  Validate(&CP0_reg[rd(instr)], rdVal);
+  CPU_reg[rt(instr)] = CP0_reg[rd(instr)];
+  CPU_reg[rt(instr)].value = rtVal;
+}
+
+void CPU_MTC0(u32 instr, u32 rdVal, u32 rtVal)
+{
+  // CP0[Rd] = CPU[Rt]: validate the CPU entry against rtVal, copy it, then stamp the integer result rdVal
+  Validate(&CPU_reg[rt(instr)], rtVal);
+  CP0_reg[rd(instr)] = CPU_reg[rt(instr)];
+  CP0_reg[rd(instr)].value = rdVal;
+}
+
+void CPU_CFC0(u32 instr, u32 rtVal, u32 rdVal)
+{
+  // CPU[Rt] = CP0[Rd]: same body as CPU_MFC0 (reads the same CP0_reg array)
+  Validate(&CP0_reg[rd(instr)], rdVal);
+  CPU_reg[rt(instr)] = CP0_reg[rd(instr)];
+  CPU_reg[rt(instr)].value = rtVal;
+}
+
+void CPU_CTC0(u32 instr, u32 rdVal, u32 rtVal)
+{
+  // CP0[Rd] = CPU[Rt]: same body as CPU_MTC0 (writes the same CP0_reg array)
+  Validate(&CPU_reg[rt(instr)], rtVal);
+  CP0_reg[rd(instr)] = CPU_reg[rt(instr)];
+  CP0_reg[rd(instr)].value = rdVal;
+}
+
 } // namespace PGXP
\ No newline at end of file
diff --git a/src/core/pgxp.h b/src/core/pgxp.h
index 02b996615..94fb0fb49 100644
--- a/src/core/pgxp.h
+++ b/src/core/pgxp.h
@@ -51,4 +51,56 @@ void CPU_SB(u32 instr, u8 rtVal, u32 addr);
 void CPU_SH(u32 instr, u16 rtVal, u32 addr);
 void CPU_SW(u32 instr, u32 rtVal, u32 addr);
 
+// Arithmetic with immediate value; arguments are the instruction word, then the rt and rs values
+void CPU_ADDI(u32 instr, u32 rtVal, u32 rsVal);
+void CPU_ADDIU(u32 instr, u32 rtVal, u32 rsVal);
+void CPU_ANDI(u32 instr, u32 rtVal, u32 rsVal);
+void CPU_ORI(u32 instr, u32 rtVal, u32 rsVal);
+void CPU_XORI(u32 instr, u32 rtVal, u32 rsVal);
+void CPU_SLTI(u32 instr, u32 rtVal, u32 rsVal);
+void CPU_SLTIU(u32 instr, u32 rtVal, u32 rsVal);
+
+// Load Upper; takes only the instruction word and the rt value
+void CPU_LUI(u32 instr, u32 rtVal);
+
+// Register Arithmetic; arguments are the instruction word, then the rd, rs and rt values
+void CPU_ADD(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal);
+void CPU_ADDU(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal);
+void CPU_SUB(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal);
+void CPU_SUBU(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal);
+void CPU_AND(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal);
+void CPU_OR(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal);
+void CPU_XOR(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal);
+void CPU_NOR(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal);
+void CPU_SLT(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal);
+void CPU_SLTU(u32 instr, u32 rdVal, u32 rsVal, u32 rtVal);
+
+// Register mult/div; the hi and lo values come before the rs and rt values
+void CPU_MULT(u32 instr, u32 hiVal, u32 loVal, u32 rsVal, u32 rtVal);
+void CPU_MULTU(u32 instr, u32 hiVal, u32 loVal, u32 rsVal, u32 rtVal);
+void CPU_DIV(u32 instr, u32 hiVal, u32 loVal, u32 rsVal, u32 rtVal);
+void CPU_DIVU(u32 instr, u32 hiVal, u32 loVal, u32 rsVal, u32 rtVal);
+
+// Shift operations (sa); arguments are the instruction word, then the rd and rt values
+void CPU_SLL(u32 instr, u32 rdVal, u32 rtVal);
+void CPU_SRL(u32 instr, u32 rdVal, u32 rtVal);
+void CPU_SRA(u32 instr, u32 rdVal, u32 rtVal);
+
+// Shift operations variable; note that rtVal precedes rsVal here, unlike the register arithmetic handlers
+void CPU_SLLV(u32 instr, u32 rdVal, u32 rtVal, u32 rsVal);
+void CPU_SRLV(u32 instr, u32 rdVal, u32 rtVal, u32 rsVal);
+void CPU_SRAV(u32 instr, u32 rdVal, u32 rtVal, u32 rsVal);
+
+// Move registers; argument order differs: MF* take rdVal first, MT* take the hi/lo value first
+void CPU_MFHI(u32 instr, u32 rdVal, u32 hiVal);
+void CPU_MTHI(u32 instr, u32 hiVal, u32 rdVal);
+void CPU_MFLO(u32 instr, u32 rdVal, u32 loVal);
+void CPU_MTLO(u32 instr, u32 loVal, u32 rdVal);
+
+// CP0 Data transfer tracking; MFC0/CFC0 take rtVal first, MTC0/CTC0 take rdVal first
+void CPU_MFC0(u32 instr, u32 rtVal, u32 rdVal);
+void CPU_MTC0(u32 instr, u32 rdVal, u32 rtVal);
+void CPU_CFC0(u32 instr, u32 rtVal, u32 rdVal);
+void CPU_CTC0(u32 instr, u32 rdVal, u32 rtVal);
+
 } // namespace PGXP
\ No newline at end of file
diff --git a/src/core/settings.cpp b/src/core/settings.cpp
index 9458112fb..e99dc3c59 100644
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -106,6 +106,7 @@ void Settings::Load(SettingsInterface& si)
   gpu_pgxp_culling = si.GetBoolValue("GPU", "PGXPCulling", true);
   gpu_pgxp_texture_correction = si.GetBoolValue("GPU", "PGXPTextureCorrection", true);
   gpu_pgxp_vertex_cache = si.GetBoolValue("GPU", "PGXPVertexCache", false);
+  gpu_pgxp_cpu = si.GetBoolValue("GPU", "PGXPCPU", false);
 
   display_crop_mode =
     ParseDisplayCropMode(
@@ -215,6 +216,7 @@ void Settings::Save(SettingsInterface& si) const
   si.SetBoolValue("GPU", "PGXPCulling", gpu_pgxp_culling);
   si.SetBoolValue("GPU", "PGXPTextureCorrection", gpu_pgxp_texture_correction);
   si.SetBoolValue("GPU", "PGXPVertexCache", gpu_pgxp_vertex_cache);
+  si.SetBoolValue("GPU", "PGXPCPU", gpu_pgxp_cpu);
 
   si.SetStringValue("Display", "CropMode", GetDisplayCropModeName(display_crop_mode));
   si.SetStringValue("Display", "AspectRatio", GetDisplayAspectRatioName(display_aspect_ratio));
diff --git a/src/core/settings.h b/src/core/settings.h
index ba8ead79a..1aa2f93fa 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -93,6 +93,7 @@ struct Settings
   bool gpu_pgxp_culling = true;
   bool gpu_pgxp_texture_correction = true;
   bool gpu_pgxp_vertex_cache = false;
+  bool gpu_pgxp_cpu = false;
   DisplayCropMode display_crop_mode = DisplayCropMode::None;
   DisplayAspectRatio display_aspect_ratio = DisplayAspectRatio::R4_3;
   bool display_linear_filtering = true;
@@ -157,6 +158,11 @@ struct Settings
   ALWAYS_INLINE bool IsUsingRecompiler() const { return (cpu_execution_mode == CPUExecutionMode::Recompiler); }
   ALWAYS_INLINE bool IsUsingSoftwareRenderer() const { return (gpu_renderer == GPURenderer::Software); }
 
+  // Disabled unless gpu_pgxp_enable is set; otherwise CPU if gpu_pgxp_cpu is set, else Memory.
+  ALWAYS_INLINE PGXPMode GetPGXPMode() const
+  {
+    return gpu_pgxp_enable ? (gpu_pgxp_cpu ? PGXPMode::CPU : PGXPMode::Memory) : PGXPMode::Disabled;
+  }
   bool HasAnyPerGameMemoryCards() const;
 
   enum : u32
diff --git a/src/core/types.h b/src/core/types.h
index 3c234cd90..46482d951 100644
--- a/src/core/types.h
+++ b/src/core/types.h
@@ -48,6 +48,13 @@ enum class CPUExecutionMode : u8
   Count
 };
 
+enum class PGXPMode : u8
+{
+  Disabled, // returned by Settings::GetPGXPMode() when gpu_pgxp_enable is false
+  Memory,   // PGXP enabled, gpu_pgxp_cpu off
+  CPU       // PGXP enabled, gpu_pgxp_cpu on
+};
+
 enum class GPURenderer : u8
 {
 #ifdef WIN32
diff --git a/src/duckstation-libretro/libretro_host_interface.cpp b/src/duckstation-libretro/libretro_host_interface.cpp
index 986709fb1..e193770e8 100644
--- a/src/duckstation-libretro/libretro_host_interface.cpp
+++ b/src/duckstation-libretro/libretro_host_interface.cpp
@@ -369,7 +369,7 @@ void LibretroHostInterface::OnSystemDestroyed()
   m_using_hardware_renderer = false;
 }
 
-static std::array<retro_core_option_definition, 30> s_option_definitions = {{
+static std::array<retro_core_option_definition, 31> s_option_definitions = {{
   {"duckstation_Console.Region",
    "Console Region",
    "Determines which region/hardware to emulate. Auto-Detect will use the region of the disc inserted.",
@@ -500,6 +500,12 @@ static std::array<retro_core_option_definition, 30> s_option_definitions = {{
    "Uses screen coordinates as a fallback when tracking vertices through memory fails. May improve PGXP compatibility.",
    {{"true", "Enabled"}, {"false", "Disabled"}},
    "false"},
+  {"duckstation_GPU.PGXPCPU",
+   "PGXP CPU Mode",
+   "Tries to track vertex manipulation through the CPU. Some games require this option for PGXP to be effective. "
+   "Very slow, and incompatible with the recompiler.",
+   {{"true", "Enabled"}, {"false", "Disabled"}},
+   "false"},
   {"duckstation_Display.CropMode",
    "Crop Mode",
    "Changes how much of the image is cropped. Some games display garbage in the overscan area which is typically "
@@ -607,7 +613,7 @@ bool LibretroHostInterface::HasCoreVariablesChanged()
 void LibretroHostInterface::LoadSettings()
 {
   LibretroSettingsInterface si;
-  g_settings.Load(si);
+  HostInterface::LoadSettings(si);
 
   // Assume BIOS files are located in system directory.
   const char* system_directory = nullptr;
diff --git a/src/duckstation-qt/gpusettingswidget.cpp b/src/duckstation-qt/gpusettingswidget.cpp
index 15707b271..eae0af7fa 100644
--- a/src/duckstation-qt/gpusettingswidget.cpp
+++ b/src/duckstation-qt/gpusettingswidget.cpp
@@ -44,6 +44,7 @@ GPUSettingsWidget::GPUSettingsWidget(QtHostInterface* host_interface, QWidget* p
   SettingWidgetBinder::BindWidgetToBoolSetting(m_host_interface, m_ui.pgxpTextureCorrection, "GPU",
                                                "PGXPTextureCorrection", true);
   SettingWidgetBinder::BindWidgetToBoolSetting(m_host_interface, m_ui.pgxpVertexCache, "GPU", "PGXPVertexCache", false);
+  SettingWidgetBinder::BindWidgetToBoolSetting(m_host_interface, m_ui.pgxpCPUMode, "GPU", "PGXPCPU", false);
 
   connect(m_ui.resolutionScale, QOverload<int>::of(&QComboBox::currentIndexChanged), this,
           &GPUSettingsWidget::updateScaledDitheringEnabled);
@@ -142,6 +143,9 @@ GPUSettingsWidget::GPUSettingsWidget(QtHostInterface* host_interface, QWidget* p
   dialog->registerWidgetHelp(m_ui.pgxpVertexCache, tr("Vertex Cache"), tr("Unchecked"),
                              tr("Uses screen coordinates as a fallback when tracking vertices through memory fails. "
                                 "May improve PGXP compatibility."));
+  dialog->registerWidgetHelp(m_ui.pgxpCPUMode, tr("CPU Mode"), tr("Unchecked"),
+    tr("Tries to track vertex manipulation through the CPU. Some games require this option for PGXP to be effective. "
+       "Very slow, and incompatible with the recompiler."));
 }
 
 GPUSettingsWidget::~GPUSettingsWidget() = default;
@@ -255,4 +259,5 @@ void GPUSettingsWidget::updatePGXPSettingsEnabled()
   m_ui.pgxpCulling->setEnabled(enabled);
   m_ui.pgxpTextureCorrection->setEnabled(enabled);
   m_ui.pgxpVertexCache->setEnabled(enabled);
+  m_ui.pgxpCPUMode->setEnabled(enabled);
 }
diff --git a/src/duckstation-qt/gpusettingswidget.ui b/src/duckstation-qt/gpusettingswidget.ui
index 33dcae1c2..0d858dde2 100644
--- a/src/duckstation-qt/gpusettingswidget.ui
+++ b/src/duckstation-qt/gpusettingswidget.ui
@@ -215,6 +215,13 @@
             </property>
            </widget>
           </item>
+          <item>
+           <widget class="QCheckBox" name="pgxpCPUMode">
+            <property name="text">
+             <string>CPU Mode</string>
+            </property>
+           </widget>
+          </item>
          </layout>
         </widget>
        </item>
diff --git a/src/duckstation-sdl/sdl_host_interface.cpp b/src/duckstation-sdl/sdl_host_interface.cpp
index 06dcb5bd2..762692a3a 100644
--- a/src/duckstation-sdl/sdl_host_interface.cpp
+++ b/src/duckstation-sdl/sdl_host_interface.cpp
@@ -874,6 +874,8 @@ void SDLHostInterface::DrawQuickSettingsMenu()
                                         &m_settings_copy.gpu_pgxp_texture_correction, m_settings_copy.gpu_pgxp_enable);
     settings_changed |= ImGui::MenuItem("PGXP Vertex Cache", nullptr, &m_settings_copy.gpu_pgxp_vertex_cache,
                                         m_settings_copy.gpu_pgxp_enable);
+    settings_changed |=
+      ImGui::MenuItem("PGXP CPU Instructions", nullptr, &m_settings_copy.gpu_pgxp_cpu, m_settings_copy.gpu_pgxp_enable);
     ImGui::EndMenu();
   }
 
@@ -1347,6 +1349,7 @@ void SDLHostInterface::DrawSettingsWindow()
         settings_changed |= ImGui::Checkbox("PGXP Culling", &m_settings_copy.gpu_pgxp_culling);
         settings_changed |= ImGui::Checkbox("PGXP Texture Correction", &m_settings_copy.gpu_pgxp_texture_correction);
         settings_changed |= ImGui::Checkbox("PGXP Vertex Cache", &m_settings_copy.gpu_pgxp_vertex_cache);
+        settings_changed |= ImGui::Checkbox("PGXP CPU", &m_settings_copy.gpu_pgxp_cpu);
       }
 
       ImGui::EndTabItem();