Merge pull request #5110 from MerryMage/const-pool

Jit64: Implement a constant pool
2017-03-20 13:29:57 -07:00 · 2017-03-20 13:29:57 -07:00 · d2690568f9
parent 26bb26fe54 4491e9b829
commit d2690568f9
14 changed files with 232 additions and 68 deletions
--- a/Source/Core/Common/CodeBlock.h
+++ b/Source/Core/Common/CodeBlock.h
@ -42,7 +42,7 @@ public:
  }

  // Call this before you generate any code.
-  void AllocCodeSpace(size_t size, bool need_low = true)
+  virtual void AllocCodeSpace(size_t size, bool need_low = true)
  {
    region_size = size;
    region = static_cast<u8*>(Common::AllocateExecutableMemory(region_size, need_low));
@ -51,7 +51,7 @@ public:

  // Always clear code space with breakpoints, so that if someone accidentally executes
  // uninitialized, it just breaks into the debugger.
-  void ClearCodeSpace()
+  virtual void ClearCodeSpace()
  {
    PoisonMemory();
    ResetCodePtr();
--- a/Source/Core/Core/CMakeLists.txt
+++ b/Source/Core/Core/CMakeLists.txt
@ -245,6 +245,7 @@ if(_M_X86)
    PowerPC/Jit64/JitRegCache.cpp
    PowerPC/Jit64/Jit_SystemRegisters.cpp
    PowerPC/Jit64Common/BlockCache.cpp
+    PowerPC/Jit64Common/ConstantPool.cpp
    PowerPC/Jit64Common/EmuCodeBlock.cpp
    PowerPC/Jit64Common/FarCodeCache.cpp
    PowerPC/Jit64Common/Jit64AsmCommon.cpp
--- a/Source/Core/Core/Core.vcxproj
+++ b/Source/Core/Core/Core.vcxproj
@ -244,6 +244,7 @@
    <ClCompile Include="PowerPC\Interpreter\Interpreter_Paired.cpp" />
    <ClCompile Include="PowerPC\Interpreter\Interpreter_SystemRegisters.cpp" />
    <ClCompile Include="PowerPC\Interpreter\Interpreter_Tables.cpp" />
+    <ClCompile Include="PowerPC\Jit64Common\ConstantPool.cpp" />
    <ClCompile Include="PowerPC\JitILCommon\IR.cpp" />
    <ClCompile Include="PowerPC\JitILCommon\JitILBase_Branch.cpp" />
    <ClCompile Include="PowerPC\JitILCommon\JitILBase_FloatingPoint.cpp" />
@ -486,6 +487,7 @@
    <ClInclude Include="PowerPC\CachedInterpreter\InterpreterBlockCache.h" />
    <ClInclude Include="PowerPC\Interpreter\Interpreter.h" />
    <ClInclude Include="PowerPC\Interpreter\Interpreter_FPUtils.h" />
+    <ClInclude Include="PowerPC\Jit64Common\ConstantPool.h" />
    <ClInclude Include="PowerPC\Jit64IL\JitIL.h" />
    <ClInclude Include="PowerPC\Jit64\FPURegCache.h" />
    <ClInclude Include="PowerPC\Jit64\GPRRegCache.h" />
--- a/Source/Core/Core/Core.vcxproj.filters
+++ b/Source/Core/Core/Core.vcxproj.filters
@ -870,6 +870,9 @@
    <ClCompile Include="IOS\USB\Bluetooth\WiimoteHIDAttr.cpp">
      <Filter>IOS\USB\Bluetooth</Filter>
    </ClCompile>
+    <ClCompile Include="PowerPC\Jit64Common\ConstantPool.cpp">
+      <Filter>PowerPC\Jit64Common</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="BootManager.h" />
@ -1493,6 +1496,9 @@
    <ClInclude Include="IOS\MIOS.h">
      <Filter>IOS</Filter>
    </ClInclude>
+    <ClInclude Include="PowerPC\Jit64Common\ConstantPool.h">
+      <Filter>PowerPC\Jit64Common</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <Text Include="CMakeLists.txt" />
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@ -108,7 +108,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm, X64Re
      UCOMISD(xmm, R(xmm));
      fixups.push_back(J_CC(CC_P));
    }
-    MOVDDUP(xmm, M(psGeneratedQNaN));
+    MOVDDUP(xmm, MConst(psGeneratedQNaN));
    for (FixupBranch fixup : fixups)
      SetJumpTarget(fixup);
    FixupBranch done = J(true);
@ -127,7 +127,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm, X64Re
      SwitchToFarCode();
      SetJumpTarget(handle_nan);
      _assert_msg_(DYNA_REC, clobber == XMM0, "BLENDVPD implicitly uses XMM0");
-      BLENDVPD(xmm, M(psGeneratedQNaN));
+      BLENDVPD(xmm, MConst(psGeneratedQNaN));
      for (u32 x : inputs)
      {
        avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, fpr.R(x), fpr.R(x), CMP_UNORD);
@ -151,7 +151,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm, X64Re
      SetJumpTarget(handle_nan);
      MOVAPD(tmp, R(clobber));
      ANDNPD(clobber, R(xmm));
-      ANDPD(tmp, M(psGeneratedQNaN));
+      ANDPD(tmp, MConst(psGeneratedQNaN));
      ORPD(tmp, R(clobber));
      MOVAPD(xmm, R(tmp));
      for (u32 x : inputs)
@ -350,7 +350,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
        ADDSD(XMM1, fpr.R(b));
    }
    if (inst.SUBOP5 == 31)  // nmadd
-      XORPD(XMM1, M(packed ? psSignBits2 : psSignBits));
+      XORPD(XMM1, MConst(packed ? psSignBits2 : psSignBits));
  }
  fpr.BindToRegister(d, !single);
  if (single)
@ -385,15 +385,15 @@ void Jit64::fsign(UGeckoInstruction inst)
  {
  case 40:  // neg
    avx_op(&XEmitter::VXORPD, &XEmitter::XORPD, fpr.RX(d), src,
-           M(packed ? psSignBits2 : psSignBits), packed);
+           MConst(packed ? psSignBits2 : psSignBits), packed);
    break;
  case 136:  // nabs
-    avx_op(&XEmitter::VORPD, &XEmitter::ORPD, fpr.RX(d), src, M(packed ? psSignBits2 : psSignBits),
-           packed);
+    avx_op(&XEmitter::VORPD, &XEmitter::ORPD, fpr.RX(d), src,
+           MConst(packed ? psSignBits2 : psSignBits), packed);
    break;
  case 264:  // abs
-    avx_op(&XEmitter::VANDPD, &XEmitter::ANDPD, fpr.RX(d), src, M(packed ? psAbsMask2 : psAbsMask),
-           packed);
+    avx_op(&XEmitter::VANDPD, &XEmitter::ANDPD, fpr.RX(d), src,
+           MConst(packed ? psAbsMask2 : psAbsMask), packed);
    break;
  default:
    PanicAlert("fsign bleh");
@ -608,7 +608,7 @@ void Jit64::fctiwx(UGeckoInstruction inst)
  // The upper 32 bits of the result are set to 0xfff80000,
  // except for -0.0 where they are set to 0xfff80001 (TODO).

-  MOVAPD(XMM0, M(half_qnan_and_s32_max));
+  MOVAPD(XMM0, MConst(half_qnan_and_s32_max));
  MINSD(XMM0, fpr.R(b));
  switch (inst.SUBOP10)
  {
--- a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp
@ -623,7 +623,7 @@ void Jit64::mcrfs(UGeckoInstruction inst)
  }
  AND(32, R(RSCRATCH), Imm32(mask));
  MOV(32, PPCSTATE(fpscr), R(RSCRATCH));
-  LEA(64, RSCRATCH, M(m_crTable.data()));
+  LEA(64, RSCRATCH, MConst(m_crTable));
  MOV(64, R(RSCRATCH), MComplex(RSCRATCH, RSCRATCH2, SCALE_8, 0));
  MOV(64, PPCSTATE(cr_val[inst.CRFD]), R(RSCRATCH));
 }
@ -657,14 +657,14 @@ void Jit64::mffsx(UGeckoInstruction inst)
 }

 // MXCSR = s_fpscr_to_mxcsr[FPSCR & 7]
-static const u32 s_fpscr_to_mxcsr[] = {
+static const u32 s_fpscr_to_mxcsr[8] = {
    0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80,
 };

 // Needs value of FPSCR in RSCRATCH.
 void Jit64::UpdateMXCSR()
 {
-  LEA(64, RSCRATCH2, M(&s_fpscr_to_mxcsr));
+  LEA(64, RSCRATCH2, MConst(s_fpscr_to_mxcsr));
  AND(32, R(RSCRATCH), Imm32(7));
  LDMXCSR(MComplex(RSCRATCH2, RSCRATCH, SCALE_4, 0));
 }
@ -730,7 +730,7 @@ void Jit64::mtfsfix(UGeckoInstruction inst)

  // Field 7 contains NI and RN.
  if (inst.CRFD == 7)
-    LDMXCSR(M(&s_fpscr_to_mxcsr[imm & 7]));
+    LDMXCSR(MConst(s_fpscr_to_mxcsr, imm & 7));
 }

 void Jit64::mtfsfx(UGeckoInstruction inst)
--- a/Source/Core/Core/PowerPC/Jit64Common/ConstantPool.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/ConstantPool.cpp
@ -0,0 +1,67 @@
+// Copyright 2017 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include <cstring>
+#include <memory>
+#include <utility>
+
+#include "Common/Assert.h"
+#include "Common/x64Emitter.h"
+#include "Core/PowerPC/Jit64Common/ConstantPool.h"
+
+ConstantPool::ConstantPool(Gen::X64CodeBlock* parent) : m_parent(parent)
+{
+}
+
+ConstantPool::~ConstantPool() = default;
+
+void ConstantPool::AllocCodeSpace()
+{
+  _assert_(!m_current_ptr);
+  Init();
+}
+
+void ConstantPool::ClearCodeSpace()
+{
+  Init();
+}
+
+Gen::OpArg ConstantPool::GetConstantOpArg(const void* value, size_t element_size,
+                                          size_t num_elements, size_t index)
+{
+  const size_t value_size = element_size * num_elements;
+  auto iter = m_const_info.find(value);
+
+  if (iter == m_const_info.end())
+  {
+    void* ptr = std::align(ALIGNMENT, value_size, m_current_ptr, m_remaining_size);
+    _assert_msg_(DYNA_REC, ptr, "Constant pool has run out of space.");
+
+    m_current_ptr = static_cast<u8*>(m_current_ptr) + value_size;
+    m_remaining_size -= value_size;
+
+    std::memcpy(ptr, value, value_size);
+    iter = m_const_info.emplace(std::make_pair(value, ConstantInfo{ptr, value_size})).first;
+  }
+
+  const ConstantInfo& info = iter->second;
+  _assert_msg_(DYNA_REC, info.m_size == value_size,
+               "Constant has incorrect size in constant pool.");
+  u8* location = static_cast<u8*>(info.m_location);
+  return Gen::M(location + element_size * index);
+}
+
+void ConstantPool::Init()
+{
+  // If execution happens to run to the start of the constant pool, halt.
+  m_parent->INT3();
+  m_parent->AlignCode16();
+
+  // Reserve a block of memory CONST_POOL_SIZE in size.
+  m_current_ptr = m_parent->GetWritableCodePtr();
+  m_parent->ReserveCodeSpace(CONST_POOL_SIZE);
+
+  m_remaining_size = CONST_POOL_SIZE;
+  m_const_info.clear();
+}
--- a/Source/Core/Core/PowerPC/Jit64Common/ConstantPool.h
+++ b/Source/Core/Core/PowerPC/Jit64Common/ConstantPool.h
@ -0,0 +1,52 @@
+// Copyright 2017 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <map>
+
+namespace Gen
+{
+struct OpArg;
+class X64CodeBlock;
+}
+
+// Constants are copied into this pool so that they live at a memory location
+// close to the code that references them. This ensures that the 32-bit
+// displacement limit of RIP-relative addressing is not an issue.
+class ConstantPool
+{
+public:
+  static constexpr size_t CONST_POOL_SIZE = 1024 * 32;
+  static constexpr size_t ALIGNMENT = 16;
+
+  explicit ConstantPool(Gen::X64CodeBlock* parent);
+  ~ConstantPool();
+
+  // ConstantPool reserves CONST_POOL_SIZE bytes from parent, and uses
+  // that space to store its constants.
+  void AllocCodeSpace();
+  void ClearCodeSpace();
+
+  // Copies the value into the pool if it isn't there yet, otherwise reuses
+  // the existing copy. Returns a memory operand for element `index` of the
+  // pooled copy. Two constants are the same if their source pointers are equal.
+  Gen::OpArg GetConstantOpArg(const void* value, size_t element_size, size_t num_elements,
+                              size_t index);
+
+private:
+  void Init();
+
+  struct ConstantInfo
+  {
+    void* m_location;
+    size_t m_size;
+  };
+
+  Gen::X64CodeBlock* m_parent;
+  void* m_current_ptr = nullptr;
+  size_t m_remaining_size = CONST_POOL_SIZE;
+  std::map<const void*, ConstantInfo> m_const_info;
+};
--- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
@ -40,6 +40,18 @@ OpArg FixImmediate(int access_size, OpArg arg)
 }
 }  // Anonymous namespace

+void EmuCodeBlock::ClearCodeSpace()
+{
+  X64CodeBlock::ClearCodeSpace();
+  m_const_pool.ClearCodeSpace();
+}
+
+void EmuCodeBlock::AllocCodeSpace(size_t size, bool need_low)
+{
+  X64CodeBlock::AllocCodeSpace(size + ConstantPool::CONST_POOL_SIZE, need_low);
+  m_const_pool.AllocCodeSpace();
+}
+
 void EmuCodeBlock::MemoryExceptionCheck()
 {
  // TODO: We really should untangle the trampolines, exception handlers and
@ -836,16 +848,16 @@ void EmuCodeBlock::Force25BitPrecision(X64Reg output, const OpArg& input, X64Reg
    // mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
    if (input.IsSimpleReg() && cpu_info.bAVX)
    {
-      VPAND(tmp, input.GetSimpleReg(), M(psRoundBit));
-      VPAND(output, input.GetSimpleReg(), M(psMantissaTruncate));
+      VPAND(tmp, input.GetSimpleReg(), MConst(psRoundBit));
+      VPAND(output, input.GetSimpleReg(), MConst(psMantissaTruncate));
      PADDQ(output, R(tmp));
    }
    else
    {
      if (!input.IsSimpleReg(output))
        MOVAPD(output, input);
-      avx_op(&XEmitter::VPAND, &XEmitter::PAND, tmp, R(output), M(psRoundBit), true, true);
-      PAND(output, M(psMantissaTruncate));
+      avx_op(&XEmitter::VPAND, &XEmitter::PAND, tmp, R(output), MConst(psRoundBit), true, true);
+      PAND(output, MConst(psMantissaTruncate));
      PADDQ(output, R(tmp));
    }
  }
@ -890,7 +902,7 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
  MOVSD(XMM1, R(src));

  // Grab Exponent
-  PAND(XMM1, M(&double_exponent));
+  PAND(XMM1, MConst(double_exponent));
  PSRLQ(XMM1, 52);
  MOVD_xmm(R(RSCRATCH), XMM1);

@ -909,15 +921,15 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)

  // xmm1 = fraction | 0x0010000000000000
  MOVSD(XMM1, R(src));
-  PAND(XMM1, M(&double_fraction));
-  POR(XMM1, M(&double_explicit_top_bit));
+  PAND(XMM1, MConst(double_fraction));
+  POR(XMM1, MConst(double_explicit_top_bit));

  // fraction >> shift
  PSRLQ(XMM1, R(XMM0));

  // OR the sign bit in.
  MOVSD(XMM0, R(src));
-  PAND(XMM0, M(&double_sign_bit));
+  PAND(XMM0, MConst(double_sign_bit));
  PSRLQ(XMM0, 32);
  POR(XMM1, R(XMM0));

@ -930,12 +942,12 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)

  // We want bits 0, 1
  MOVSD(XMM1, R(src));
-  PAND(XMM1, M(&double_top_two_bits));
+  PAND(XMM1, MConst(double_top_two_bits));
  PSRLQ(XMM1, 32);

  // And 5 through to 34
  MOVSD(XMM0, R(src));
-  PAND(XMM0, M(&double_bottom_bits));
+  PAND(XMM0, MConst(double_bottom_bits));
  PSRLQ(XMM0, 29);

  // OR them together
@ -967,8 +979,8 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
  // Here, check to see if the source is small enough that it will result in a denormal, and pass it
  // to the x87 unit
  // if it is.
-  avx_op(&XEmitter::VPAND, &XEmitter::PAND, XMM0, R(src), M(&double_sign_bit), true, true);
-  UCOMISD(XMM0, M(&min_norm_single));
+  avx_op(&XEmitter::VPAND, &XEmitter::PAND, XMM0, R(src), MConst(double_sign_bit), true, true);
+  UCOMISD(XMM0, MConst(min_norm_single));
  FixupBranch nanConversion = J_CC(CC_P, true);
  FixupBranch denormalConversion = J_CC(CC_B, true);
  CVTSD2SS(dst, R(src));
@ -982,7 +994,7 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
  FixupBranch continue1 = J_CC(CC_C, true);
  // Clear the quiet bit of the SNaN, which was 0 (signalling) but got set to 1 (quiet) by
  // conversion.
-  ANDPS(dst, M(&single_qnan_bit));
+  ANDPS(dst, MConst(single_qnan_bit));
  FixupBranch continue2 = J(true);

  SetJumpTarget(denormalConversion);
@ -1025,7 +1037,7 @@ void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr
  SetJumpTarget(nanConversion);
  TEST(32, R(gprsrc), Imm32(0x00400000));
  FixupBranch continue1 = J_CC(CC_NZ, true);
-  ANDPD(dst, M(&double_qnan_bit));
+  ANDPD(dst, MConst(double_qnan_bit));
  FixupBranch continue2 = J(true);
  SwitchToNearCode();

@ -1057,7 +1069,7 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
  {
    MOVQ_xmm(R(RSCRATCH), xmm);
    SHR(64, R(RSCRATCH), Imm8(63));  // Get the sign bit; almost all the branches need it.
-    PTEST(xmm, M(psDoubleExp));
+    PTEST(xmm, MConst(psDoubleExp));
    FixupBranch maxExponent = J_CC(CC_C);
    FixupBranch zeroExponent = J_CC(CC_Z);

@ -1067,7 +1079,7 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
    continue1 = J();

    SetJumpTarget(maxExponent);
-    PTEST(xmm, M(psDoubleFrac));
+    PTEST(xmm, MConst(psDoubleFrac));
    FixupBranch notNAN = J_CC(CC_Z);

    // Max exponent + mantissa: PPC_FPCLASS_QNAN
@ -1097,10 +1109,10 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
  else
  {
    MOVQ_xmm(R(RSCRATCH), xmm);
-    TEST(64, R(RSCRATCH), M(psDoubleExp));
+    TEST(64, R(RSCRATCH), MConst(psDoubleExp));
    FixupBranch zeroExponent = J_CC(CC_Z);
-    AND(64, R(RSCRATCH), M(psDoubleNoSign));
-    CMP(64, R(RSCRATCH), M(psDoubleExp));
+    AND(64, R(RSCRATCH), MConst(psDoubleNoSign));
+    CMP(64, R(RSCRATCH), MConst(psDoubleExp));
    FixupBranch nan =
        J_CC(CC_G);  // This works because if the sign bit is set, RSCRATCH is negative
    FixupBranch infinity = J_CC(CC_E);
--- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h
+++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h
@ -10,6 +10,7 @@
 #include "Common/CommonTypes.h"
 #include "Common/x64Emitter.h"

+#include "Core/PowerPC/Jit64Common/ConstantPool.h"
 #include "Core/PowerPC/Jit64Common/FarCodeCache.h"
 #include "Core/PowerPC/Jit64Common/TrampolineInfo.h"

@ -22,12 +23,27 @@ class Mapping;
 class EmuCodeBlock : public Gen::X64CodeBlock
 {
 public:
+  void ClearCodeSpace() override;
+  void AllocCodeSpace(size_t size, bool need_low = true) override;
+
  void MemoryExceptionCheck();

  // Simple functions to switch between near and far code emitting
  void SwitchToFarCode();
  void SwitchToNearCode();

+  template <typename T>
+  Gen::OpArg MConst(const T& value)
+  {
+    return m_const_pool.GetConstantOpArg(&value, sizeof(T), 1, 0);
+  }
+
+  template <typename T, size_t N>
+  Gen::OpArg MConst(const T (&value)[N], size_t index = 0)
+  {
+    return m_const_pool.GetConstantOpArg(&value, sizeof(T), N, index);
+  }
+
  Gen::FixupBranch CheckIfSafeAddress(const Gen::OpArg& reg_value, Gen::X64Reg reg_addr,
                                      BitSet32 registers_in_use);
  void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize,
@ -105,6 +121,7 @@ public:
  void Clear();

 protected:
+  ConstantPool m_const_pool{this};
  FarCodeCache m_far_code;
  u8* m_near_code;  // Backed up when we switch to far code.

--- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp
@ -180,8 +180,6 @@ void CommonAsmRoutines::GenMfcr()
  X64Reg tmp = RSCRATCH2;
  X64Reg cr_val = RSCRATCH_EXTRA;
  XOR(32, R(dst), R(dst));
-  // we only need to zero the high bits of tmp once
-  XOR(32, R(tmp), R(tmp));
  for (int i = 0; i < 8; i++)
  {
    static const u32 m_flagTable[8] = {0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9};
@ -190,9 +188,13 @@ void CommonAsmRoutines::GenMfcr()

    MOV(64, R(cr_val), PPCSTATE(cr_val[i]));

+    // Upper bits of tmp need to be zeroed.
+    // Note: tmp is reused later in this iteration for address calculation,
+    //       so it can't be zeroed just once before the loop. This also
+    //       prevents partial register stalls due to SETcc.
+    XOR(32, R(tmp), R(tmp));
    // EQ: Bits 31-0 == 0; set flag bit 1
    TEST(32, R(cr_val), R(cr_val));
-    // FIXME: is there a better way to do this without the partial register merging?
    SETcc(CC_Z, R(tmp));
    LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0));

@ -204,7 +206,8 @@ void CommonAsmRoutines::GenMfcr()
    // SO: Bit 61 set; set flag bit 0
    // LT: Bit 62 set; set flag bit 3
    SHR(64, R(cr_val), Imm8(61));
-    OR(32, R(dst), MScaled(cr_val, SCALE_4, PtrOffset(m_flagTable)));
+    LEA(64, tmp, MConst(m_flagTable));
+    OR(32, R(dst), MComplex(tmp, cr_val, SCALE_4, 0));
  }
  RET();

@ -297,11 +300,12 @@ void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type,
    if (quantize == -1)
    {
      SHR(32, R(RSCRATCH2), Imm8(5));
-      MULSS(XMM0, MDisp(RSCRATCH2, PtrOffset(m_quantizeTableS)));
+      LEA(64, RSCRATCH, MConst(m_quantizeTableS));
+      MULSS(XMM0, MRegSum(RSCRATCH2, RSCRATCH));
    }
    else if (quantize > 0)
    {
-      MULSS(XMM0, M(&m_quantizeTableS[quantize * 2]));
+      MULSS(XMM0, MConst(m_quantizeTableS, quantize * 2));
    }

    switch (type)
@ -309,20 +313,20 @@ void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type,
    case QUANTIZE_U8:
      XORPS(XMM1, R(XMM1));
      MAXSS(XMM0, R(XMM1));
-      MINSS(XMM0, M(&m_255));
+      MINSS(XMM0, MConst(m_255));
      break;
    case QUANTIZE_S8:
-      MAXSS(XMM0, M(&m_m128));
-      MINSS(XMM0, M(&m_127));
+      MAXSS(XMM0, MConst(m_m128));
+      MINSS(XMM0, MConst(m_127));
      break;
    case QUANTIZE_U16:
      XORPS(XMM1, R(XMM1));
      MAXSS(XMM0, R(XMM1));
-      MINSS(XMM0, M(m_65535));
+      MINSS(XMM0, MConst(m_65535));
      break;
    case QUANTIZE_S16:
-      MAXSS(XMM0, M(&m_m32768));
-      MINSS(XMM0, M(&m_32767));
+      MAXSS(XMM0, MConst(m_m32768));
+      MINSS(XMM0, MConst(m_32767));
      break;
    default:
      break;
@ -335,12 +339,13 @@ void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type,
    if (quantize == -1)
    {
      SHR(32, R(RSCRATCH2), Imm8(5));
-      MOVQ_xmm(XMM1, MDisp(RSCRATCH2, PtrOffset(m_quantizeTableS)));
+      LEA(64, RSCRATCH, MConst(m_quantizeTableS));
+      MOVQ_xmm(XMM1, MRegSum(RSCRATCH2, RSCRATCH));
      MULPS(XMM0, R(XMM1));
    }
    else if (quantize > 0)
    {
-      MOVQ_xmm(XMM1, M(&m_quantizeTableS[quantize * 2]));
+      MOVQ_xmm(XMM1, MConst(m_quantizeTableS, quantize * 2));
      MULPS(XMM0, R(XMM1));
    }

@ -358,7 +363,7 @@ void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type,
    // is out of int32 range while it's OK for large negatives, it isn't for positives
    // I don't know whether the overflow actually happens in any games but it potentially can
    // cause problems, so we need some clamping
-    MINPS(XMM0, M(m_65535));
+    MINPS(XMM0, MConst(m_65535));
    CVTTPS2DQ(XMM0, R(XMM0));

    switch (type)
@ -419,7 +424,7 @@ void QuantizedMemoryRoutines::GenQuantizedStoreFloat(bool single, bool isInline)
  {
    if (cpu_info.bSSSE3)
    {
-      PSHUFB(XMM0, M(pbswapShuffle2x4));
+      PSHUFB(XMM0, MConst(pbswapShuffle2x4));
      MOVQ_xmm(R(RSCRATCH), XMM0);
    }
    else
@ -492,13 +497,14 @@ void QuantizedMemoryRoutines::GenQuantizedLoad(bool single, EQuantizeType type,
    if (quantize == -1)
    {
      SHR(32, R(RSCRATCH2), Imm8(5));
-      MULSS(XMM0, MDisp(RSCRATCH2, PtrOffset(m_dequantizeTableS)));
+      LEA(64, RSCRATCH, MConst(m_dequantizeTableS));
+      MULSS(XMM0, MRegSum(RSCRATCH2, RSCRATCH));
    }
    else if (quantize > 0)
    {
-      MULSS(XMM0, M(&m_dequantizeTableS[quantize * 2]));
+      MULSS(XMM0, MConst(m_dequantizeTableS, quantize * 2));
    }
-    UNPCKLPS(XMM0, M(m_one));
+    UNPCKLPS(XMM0, MConst(m_one));
  }
  else
  {
@ -564,12 +570,13 @@ void QuantizedMemoryRoutines::GenQuantizedLoad(bool single, EQuantizeType type,
    if (quantize == -1)
    {
      SHR(32, R(RSCRATCH2), Imm8(5));
-      MOVQ_xmm(XMM1, MDisp(RSCRATCH2, PtrOffset(m_dequantizeTableS)));
+      LEA(64, RSCRATCH, MConst(m_dequantizeTableS));
+      MOVQ_xmm(XMM1, MRegSum(RSCRATCH2, RSCRATCH));
      MULPS(XMM0, R(XMM1));
    }
    else if (quantize > 0)
    {
-      MOVQ_xmm(XMM1, M(&m_dequantizeTableS[quantize * 2]));
+      MOVQ_xmm(XMM1, MConst(m_dequantizeTableS, quantize * 2));
      MULPS(XMM0, R(XMM1));
    }
  }
@ -597,7 +604,7 @@ void QuantizedMemoryRoutines::GenQuantizedLoadFloat(bool single, bool isInline)
    else if (cpu_info.bSSSE3)
    {
      MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
-      PSHUFB(XMM0, M(pbswapShuffle1x4));
+      PSHUFB(XMM0, MConst(pbswapShuffle1x4));
    }
    else
    {
@ -605,7 +612,7 @@ void QuantizedMemoryRoutines::GenQuantizedLoadFloat(bool single, bool isInline)
      MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
    }

-    UNPCKLPS(XMM0, M(m_one));
+    UNPCKLPS(XMM0, MConst(m_one));
  }
  else
  {
@ -623,7 +630,7 @@ void QuantizedMemoryRoutines::GenQuantizedLoadFloat(bool single, bool isInline)
    else if (cpu_info.bSSSE3)
    {
      MOVQ_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
-      PSHUFB(XMM0, M(pbswapShuffle2x4));
+      PSHUFB(XMM0, MConst(pbswapShuffle2x4));
    }
    else
    {
--- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
@ -1753,7 +1753,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)

      X64Reg reg = fregURegWithMov(RI, I);
      alignas(16) static const u32 ssSignBits[4] = {0x80000000};
-      Jit->PXOR(reg, M(ssSignBits));
+      Jit->PXOR(reg, Jit->MConst(ssSignBits));
      RI.fregs[reg] = I;
      fregNormalRegClear(RI, I);
      break;
@ -1765,7 +1765,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)

      X64Reg reg = fregURegWithMov(RI, I);
      alignas(16) static const u64 sdSignBits[2] = {0x8000000000000000ULL};
-      Jit->PXOR(reg, M(sdSignBits));
+      Jit->PXOR(reg, Jit->MConst(sdSignBits));
      RI.fregs[reg] = I;
      fregNormalRegClear(RI, I);
      break;
@ -1777,7 +1777,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)

      X64Reg reg = fregURegWithMov(RI, I);
      alignas(16) static const u32 psSignBits[4] = {0x80000000, 0x80000000};
-      Jit->PXOR(reg, M(psSignBits));
+      Jit->PXOR(reg, Jit->MConst(psSignBits));
      RI.fregs[reg] = I;
      fregNormalRegClear(RI, I);
      break;
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
@ -7,7 +7,7 @@
 alignas(16) const u8 pbswapShuffle1x4[16] = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 alignas(16) const u8 pbswapShuffle2x4[16] = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};

-alignas(16) const float m_quantizeTableS[] = {
+alignas(16) const float m_quantizeTableS[128] = {
    (1ULL << 0),        (1ULL << 0),        (1ULL << 1),        (1ULL << 1),
    (1ULL << 2),        (1ULL << 2),        (1ULL << 3),        (1ULL << 3),
    (1ULL << 4),        (1ULL << 4),        (1ULL << 5),        (1ULL << 5),
@ -42,7 +42,7 @@ alignas(16) const float m_quantizeTableS[] = {
    1.0 / (1ULL << 2),  1.0 / (1ULL << 2),  1.0 / (1ULL << 1),  1.0 / (1ULL << 1),
 };

-alignas(16) const float m_dequantizeTableS[] = {
+alignas(16) const float m_dequantizeTableS[128] = {
    1.0 / (1ULL << 0),  1.0 / (1ULL << 0),  1.0 / (1ULL << 1),  1.0 / (1ULL << 1),
    1.0 / (1ULL << 2),  1.0 / (1ULL << 2),  1.0 / (1ULL << 3),  1.0 / (1ULL << 3),
    1.0 / (1ULL << 4),  1.0 / (1ULL << 4),  1.0 / (1ULL << 5),  1.0 / (1ULL << 5),
@ -77,4 +77,4 @@ alignas(16) const float m_dequantizeTableS[] = {
    (1ULL << 2),        (1ULL << 2),        (1ULL << 1),        (1ULL << 1),
 };

-alignas(16) const float m_one[] = {1.0f, 0.0f, 0.0f, 0.0f};
+alignas(16) const float m_one[4] = {1.0f, 0.0f, 0.0f, 0.0f};
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
@ -8,9 +8,9 @@

 alignas(16) extern const u8 pbswapShuffle1x4[16];
 alignas(16) extern const u8 pbswapShuffle2x4[16];
-alignas(16) extern const float m_one[];
-alignas(16) extern const float m_quantizeTableS[];
-alignas(16) extern const float m_dequantizeTableS[];
+alignas(16) extern const float m_one[4];
+alignas(16) extern const float m_quantizeTableS[128];
+alignas(16) extern const float m_dequantizeTableS[128];

 class CommonAsmRoutinesBase
 {