diff --git a/Assets/dll/ares64.wbx.zst b/Assets/dll/ares64.wbx.zst
index c8bcda7e57..7a228052ba 100644
Binary files a/Assets/dll/ares64.wbx.zst and b/Assets/dll/ares64.wbx.zst differ
diff --git a/waterbox/ares64/BizInterface.cpp b/waterbox/ares64/BizInterface.cpp
index dd7074429c..9ff179d9b4 100644
--- a/waterbox/ares64/BizInterface.cpp
+++ b/waterbox/ares64/BizInterface.cpp
@@ -31,19 +31,13 @@ typedef enum
 	START   = 1 << 13,
 } Buttons_t;
 
-static u64 biztime = 0;
-
-static u64 GetBizTime()
-{
-	return biztime;
-}
-
 struct BizPlatform : ares::Platform
 {
 	auto attach(ares::Node::Object) -> void override;
 	auto pak(ares::Node::Object) -> ares::VFS::Pak override;
 	auto audio(ares::Node::Audio::Stream) -> void override;
 	auto input(ares::Node::Input::Input) -> void override;
+	auto time() -> n64 override;
 
 	ares::VFS::Pak bizpak = nullptr;
 	u16* soundbuf = alloc_invisible<u16>(1024 * 2);
@@ -51,6 +45,7 @@ struct BizPlatform : ares::Platform
 	bool hack = false;
 	void (*inputcb)() = nullptr;
 	bool lagged = true;
+	u64 biztime = 0;
 };
 
 auto BizPlatform::attach(ares::Node::Object node) -> void
@@ -88,7 +83,12 @@ auto BizPlatform::input(ares::Node::Input::Input node) -> void
 			if (inputcb) inputcb();
 		}
 	}
-};
+}
+
+auto BizPlatform::time() -> n64
+{
+	return biztime; // clock value handed in by the frontend (stored in Init and on every FrameAdvance)
+}
 
 static ares::Node::System root = nullptr;
 static BizPlatform* platform = nullptr;
@@ -98,6 +98,7 @@ static array_view<u8>* romData = nullptr;
 static array_view<u8>* diskData = nullptr;
 static array_view<u8>* diskErrorData = nullptr;
 static array_view<u8>* saveData = nullptr;
+static array_view<u8>* rtcData = nullptr;
 static array_view<u8>* gbRomData[4] = { nullptr, nullptr, nullptr, nullptr, };
 
 typedef enum
@@ -311,7 +312,8 @@ static inline SaveType DetectSaveType(u8* rom)
 	if (id == "NW4") ret = FLASH128KB;
 	if (id == "NDP") ret = FLASH128KB;
 
-	if(id[1] == 'E' && id[2] == 'D') {
+	if (id[1] == 'E' && id[2] == 'D')
+	{
 		n8 config = revision;
 		if (config.bit(4,7) == 1) ret = EEPROM512;
 		else if (config.bit(4,7) == 2) ret = EEPROM2KB;
@@ -324,6 +326,26 @@ static inline SaveType DetectSaveType(u8* rom)
 	return ret;
 }
 
+// Returns true when the ROM header ID is "NAF", or when the ID ends in "ED" and bit 0 of the byte at 0x3F is set.
+static inline bool DetectRtc(u8* rom)
+{
+	// three-character ID taken from header bytes 0x3B..0x3D
+	string id;
+	for (u32 offset = 0x3B; offset <= 0x3D; offset++)
+		id.append((char)rom[offset]);
+
+	if (id == "NAF") return true;
+
+	// for "?ED" IDs the byte at 0x3F is read as a config field; its lowest bit decides the result
+	if (id[1] == 'E' && id[2] == 'D')
+	{
+		n8 config = rom[0x3f];
+		return config.bit(0) == 1;
+	}
+
+	return false;
+}
+
 namespace ares::Nintendo64
 {
 	extern bool BobDeinterlace;
@@ -351,16 +373,6 @@ typedef struct
 	GbRom GbRoms[4];
 } LoadData;
 
-#define SET_RTC_CALLBACK(NUM) do { \
-	if (auto pad = dynamic_cast<ares::Nintendo64::Gamepad*>(ares::Nintendo64::controllerPort##NUM.device.data())) \
-	{ \
-		if (auto mbc3 = dynamic_cast<ares::Nintendo64::Mbc3*>(pad->transferPak.mbc.data())) \
-		{ \
-			mbc3->rtcCallback = GetBizTime; \
-		} \
-	} \
-} while (0)
-
 static bool LoadRom(LoadData* loadData, bool isPal)
 {
 	u8* data;
@@ -406,6 +418,15 @@ static bool LoadRom(LoadData* loadData, bool isPal)
 		platform->bizpak->append(name, *saveData);
 	}
 
+	if (DetectRtc(data))
+	{
+		len = 32, name = "save.rtc";
+		data = new u8[len];
+		memset(data, 0xFF, len);
+		rtcData = new array_view<u8>(data, len);
+		platform->bizpak->append(name, *rtcData);
+	}
+
 	if (auto port = root->find<ares::Node::Port>("Cartridge Slot"))
 	{
 		port->allocate();
@@ -464,8 +485,7 @@ ECL_EXPORT bool Init(LoadData* loadData, ControllerType* controllers, bool isPal
 	platform->bizpak = new vfs::directory;
 	ares::platform = platform;
 
-	biztime = initTime;
-	ares::Nintendo64::dd.rtcCallback = GetBizTime;
+	platform->biztime = initTime;
 
 	angrylion::OutFrameBuffer = NULL;
 	angrylion::OutHeight = isPal ? 576 : 480;
@@ -579,11 +599,6 @@ ECL_EXPORT bool Init(LoadData* loadData, ControllerType* controllers, bool isPal
 		}
 	}
 
-	SET_RTC_CALLBACK(1);
-	SET_RTC_CALLBACK(2);
-	SET_RTC_CALLBACK(3);
-	SET_RTC_CALLBACK(4);
-
 	root->power(false);
 	root->run(); // HACK, first frame dirties a ton of memory, so we emulate it then seal (this should be investigated, not sure why 60MBish of memory would be dirtied in a single frame?)
 	return true;
@@ -682,15 +697,17 @@ static u8 PeekFunc(u64 address)
 		}
 	}
 
-	return ares::Nintendo64::bus.read<ares::Nintendo64::Byte>(addr);
+	u32 unused = 0;
+	return ares::Nintendo64::bus.read<ares::Nintendo64::Byte>(addr, unused);
 }
 
 static void SysBusAccess(u8* buffer, u64 address, u64 count, bool write)
 {
 	if (write)
 	{
+		u32 unused = 0;
 		while (count--)
-			ares::Nintendo64::bus.write<ares::Nintendo64::Byte>(address++, *buffer++);
+			ares::Nintendo64::bus.write<ares::Nintendo64::Byte>(address++, *buffer++, unused);
 	}
 	else
 	{
@@ -791,7 +808,7 @@ ECL_EXPORT void FrameAdvance(MyFrameInfo* f)
 
 	angrylion::OutFrameBuffer = f->SkipDraw ? NULL : f->VideoBuffer;
 
-	biztime = f->Time;
+	platform->biztime = f->Time;
 
 	if (f->Power)
 	{
@@ -828,7 +845,10 @@ ECL_EXPORT void SetInputCallback(void (*callback)())
 
 ECL_EXPORT void PostLoadState()
 {
+	ares::Nintendo64::cpu.recompiler.allocator.release(bump_allocator::zero_fill); // release the CPU recompiler's allocator (zero-filled) before resetting it
 	ares::Nintendo64::cpu.recompiler.reset();
+	ares::Nintendo64::rsp.recompiler.allocator.release(bump_allocator::zero_fill); // same treatment for the RSP recompiler
+	ares::Nintendo64::rsp.recompiler.reset();
 }
 
 ECL_EXPORT void GetDisassembly(u32 address, u32 instruction, char* buf)
diff --git a/waterbox/ares64/Makefile b/waterbox/ares64/Makefile
index 4cfd2cd859..19696ec6a1 100644
--- a/waterbox/ares64/Makefile
+++ b/waterbox/ares64/Makefile
@@ -1,10 +1,13 @@
 NEED_LIBCO := 1
 
 ARES_PATH = $(ROOT_DIR)/ares/ares
+NALL_PATH = $(ROOT_DIR)/ares/nall
 THIRDPARTY_PATH = $(ROOT_DIR)/ares/thirdparty
 ANGRYLION_PATH = $(THIRDPARTY_PATH)/angrylion-rdp/mylittle-nocomment
 SLJIT_PATH = $(THIRDPARTY_PATH)/sljit/sljit_src
 
+CCFLAGS := -march=x86-64-v2 -I.$(THIRDPARTY_PATH) -DSLJIT_HAVE_CONFIG_PRE=1 -DSLJIT_HAVE_CONFIG_POST=1
+
 CXXFLAGS := -std=gnu++17 -march=x86-64-v2 \
 	-I../libco -I.$(ROOT_DIR)/ares -I.$(ARES_PATH) -I.$(THIRDPARTY_PATH) -I.$(ANGRYLION_PATH) \
 	-Werror=int-to-pointer-cast -Wno-unused-but-set-variable -Wno-format-security \
@@ -12,11 +15,14 @@ CXXFLAGS := -std=gnu++17 -march=x86-64-v2 \
 	-Wno-sign-compare -Wno-switch -Wno-unused-local-typedefs -Wno-bool-operation \
 	-Wno-mismatched-tags -Wno-missing-braces -Wno-overloaded-virtual \
 	-Wno-unused-private-field -Wno-sometimes-uninitialized \
-	-fno-strict-aliasing -fwrapv -fno-operator-names \
+	-fno-strict-aliasing -fwrapv \
 	-DSLJIT_HAVE_CONFIG_PRE=1 -DSLJIT_HAVE_CONFIG_POST=1
 
 TARGET = ares64.wbx
 
+SRCS_NALL = \
+	$(NALL_PATH)/nall.cpp
+
 SRCS_PROCESSORS = \
 	$(ARES_PATH)/component/processor/sm5k/sm5k.cpp
 
@@ -28,6 +34,7 @@ SRCS_N64 = \
 	$(ARES_PATH)/n64/memory/memory.cpp \
 	$(ARES_PATH)/n64/system/system.cpp \
 	$(ARES_PATH)/n64/cartridge/cartridge.cpp \
+	$(ARES_PATH)/n64/cic/cic.cpp \
 	$(ARES_PATH)/n64/controller/controller.cpp \
 	$(ARES_PATH)/n64/dd/dd.cpp \
 	$(ARES_PATH)/n64/mi/mi.cpp \
@@ -50,6 +57,6 @@ SRCS_SLJIT = \
 	$(SLJIT_PATH)/sljitLir.c \
 	$(THIRDPARTY_PATH)/sljitAllocator.cpp
 
-SRCS = $(SRCS_PROCESSORS) $(SRCS_ARES) $(SRCS_N64) $(SRCS_ANGRYLION) $(SRCS_SLJIT) BizInterface.cpp
+SRCS = $(SRCS_NALL) $(SRCS_PROCESSORS) $(SRCS_ARES) $(SRCS_N64) $(SRCS_ANGRYLION) $(SRCS_SLJIT) BizInterface.cpp
 
 include ../common.mak
diff --git a/waterbox/ares64/ares/ares/ares/ares.hpp b/waterbox/ares64/ares/ares/ares/ares.hpp
index f406d455f5..0bba0202c0 100644
--- a/waterbox/ares64/ares/ares/ares/ares.hpp
+++ b/waterbox/ares64/ares/ares/ares/ares.hpp
@@ -10,6 +10,7 @@
 #include <nall/any.hpp>
 #include <nall/array.hpp>
 #include <nall/bump-allocator.hpp>
+#include <nall/case-range.hpp>
 #include <nall/chrono.hpp>
 #include <nall/directory.hpp>
 #include <nall/dl.hpp>
@@ -38,10 +39,11 @@
 #include <nall/hash/crc32.hpp>
 #include <nall/hash/sha256.hpp>
 using namespace nall;
+using namespace nall::primitives;
 
 namespace ares {
   static const string Name       = "ares";
-  static const string Version    = "130.1";
+  static const string Version    = "132";
   static const string Copyright  = "ares team, Near";
   static const string License    = "ISC";
   static const string LicenseURI = "https://opensource.org/licenses/ISC";
@@ -50,7 +52,6 @@ namespace ares {
 
   //incremented only when serialization format changes
   static const u32    SerializerSignature = 0x31545342;  //"BST1" (little-endian)
-  static const string SerializerVersion   = "130.3";
 
   namespace VFS {
     using Pak = shared_pointer<vfs::directory>;
diff --git a/waterbox/ares64/ares/ares/ares/memory/fixed-allocator.cpp b/waterbox/ares64/ares/ares/ares/memory/fixed-allocator.cpp
index db5c2ff7b8..ee92ac771e 100644
--- a/waterbox/ares64/ares/ares/ares/memory/fixed-allocator.cpp
+++ b/waterbox/ares64/ares/ares/ares/memory/fixed-allocator.cpp
@@ -7,7 +7,7 @@ namespace ares::Memory {
 FixedAllocator::FixedAllocator() {
 }
 #else
-alignas(4096) u8 fixedBuffer[8_MiB];
+alignas(4096) u8 fixedBuffer[128_MiB] ECL_INVISIBLE;
 
 FixedAllocator::FixedAllocator() {
   _allocator.resize(sizeof(fixedBuffer), 0, fixedBuffer);
diff --git a/waterbox/ares64/ares/ares/ares/node/audio/stream.hpp b/waterbox/ares64/ares/ares/ares/node/audio/stream.hpp
index 13c3007751..a928c1d5dd 100644
--- a/waterbox/ares64/ares/ares/ares/node/audio/stream.hpp
+++ b/waterbox/ares64/ares/ares/ares/node/audio/stream.hpp
@@ -25,7 +25,7 @@ struct Stream : Audio {
   template<typename... P>
   auto frame(P&&... p) -> void {
     if(runAhead()) return;
-    f64 samples[sizeof...(p)] = {std::forward<P>(p)...};
+    f64 samples[sizeof...(p)] = {f64(std::forward<P>(p))...};  //convert every argument to f64 explicitly before it enters the braced initializer
     write(samples);
   }
 
diff --git a/waterbox/ares64/ares/ares/ares/node/debugger/tracer/instruction.hpp b/waterbox/ares64/ares/ares/ares/node/debugger/tracer/instruction.hpp
index 19b9545d37..0c354c3b2a 100644
--- a/waterbox/ares64/ares/ares/ares/node/debugger/tracer/instruction.hpp
+++ b/waterbox/ares64/ares/ares/ares/node/debugger/tracer/instruction.hpp
@@ -31,7 +31,7 @@ struct Instruction : Tracer {
   auto address(u64 address) -> bool {
     address &= ~0ull >> (64 - _addressBits);  //mask upper bits of address
     _address = address;
-    /*address >>= _addressMask;  //clip unneeded alignment bits (to reduce _masks size)
+    address >>= _addressMask;  //clip unneeded alignment bits (to reduce _masks size)
 
     if(_mask) {
       auto mask = _masks.find(address);
@@ -50,7 +50,7 @@ struct Instruction : Tracer {
         _history[index] = _history[index + 1];
       }
       _history.last() = _address;
-    }*/
+    }
 
     return true;
   }
@@ -58,13 +58,13 @@ struct Instruction : Tracer {
   //mark an already-executed address as not executed yet for trace masking.
   //call when writing to executable RAM to support self-modifying code.
   auto invalidate(u64 address) -> void {
-    /*if(unlikely(_mask)) {
+    if(unlikely(_mask)) {
       address &= ~0ull >> (64 - _addressBits);
       address >>= _addressMask;
 
       auto mask = _masks.find(address);
       if(mask) mask->unvisit(address);
-    }*/
+    }
   }
 
   auto notify(const string& instruction, const string& context, const string& extra = {}) -> void {
diff --git a/waterbox/ares64/ares/ares/ares/platform.hpp b/waterbox/ares64/ares/ares/ares/platform.hpp
index 5dd0c693b8..c2e670f4ac 100644
--- a/waterbox/ares64/ares/ares/ares/platform.hpp
+++ b/waterbox/ares64/ares/ares/ares/platform.hpp
@@ -20,6 +20,7 @@ struct Platform {
   virtual auto video(Node::Video::Screen, const u32* data, u32 pitch, u32 width, u32 height) -> void {}
   virtual auto audio(Node::Audio::Stream) -> void {}
   virtual auto input(Node::Input::Input) -> void {}
+  virtual auto time() -> n64 { return ::time(0); }
 };
 
 extern Platform* platform;
diff --git a/waterbox/ares64/ares/ares/component/processor/sm5k/disassembler.cpp b/waterbox/ares64/ares/ares/component/processor/sm5k/disassembler.cpp
index 5c84321eb3..c305ea8f97 100644
--- a/waterbox/ares64/ares/ares/component/processor/sm5k/disassembler.cpp
+++ b/waterbox/ares64/ares/ares/component/processor/sm5k/disassembler.cpp
@@ -12,54 +12,54 @@ auto SM5K::disassembleInstruction() -> string {
   string pc = {"0x", hex(n4(opcode) << 8 | operand, 3L)};
 
   switch(opcode) {
-  case 0x00 ... 0x0f: s = {"adx  ", p4}; break;
-  case 0x10 ... 0x1f: s = {"lax  ", p4}; break;
-  case 0x20 ... 0x2f: s = {"lblx ", p4}; break;
-  case 0x30 ... 0x3f: s = {"lbmx ", p4}; break;
-  case 0x40 ... 0x43: s = {"rm   ", p2}; break;
-  case 0x44 ... 0x47: s = {"sm   ", p2}; break;
-  case 0x48 ... 0x4b: s = {"tm   ", p2}; break;
-  case 0x4c ... 0x4f: s = {"tpb  ", p2}; break;
-  case 0x50 ... 0x53: s = {"lda  ", p2}; break;
-  case 0x54 ... 0x57: s = {"exc  ", p2}; break;
-  case 0x58 ... 0x5b: s = {"exci ", p2}; break;
-  case 0x5c ... 0x5f: s = {"excd ", p2}; break;
-  case 0x60:          s = {"rc   "    }; break;
-  case 0x61:          s = {"sc   "    }; break;
-  case 0x62:          s = {"id   "    }; break;
-  case 0x63:          s = {"ie   "    }; break;
-  case 0x64:          s = {"exax "    }; break;
-  case 0x65:          s = {"atx  "    }; break;
-  case 0x66:          s = {"exbm "    }; break;
-  case 0x67:          s = {"exbl "    }; break;
-  case 0x68:          s = {"ex   "    }; break;
-  case 0x69:          s = {"dta  ", p8}; break;
-  case 0x6a:          s = {"pat  ", p8}; break;
-  case 0x6b:          s = {"tabl "    }; break;
-  case 0x6c:          s = {"ta   "    }; break;
-  case 0x6d:          s = {"tb   "    }; break;
-  case 0x6e:          s = {"tc   "    }; break;
-  case 0x6f:          s = {"tam  "    }; break;
-  case 0x70:          s = {"inl  "    }; break;
-  case 0x71:          s = {"outl "    }; break;
-  case 0x72:          s = {"anp  "    }; break;
-  case 0x73:          s = {"orp  "    }; break;
-  case 0x74:          s = {"in   "    }; break;
-  case 0x75:          s = {"out  "    }; break;
-  case 0x76:          s = {"stop "    }; break;
-  case 0x77:          s = {"halt "    }; break;
-  case 0x78:          s = {"incb "    }; break;
-  case 0x79:          s = {"coma "    }; break;
-  case 0x7a:          s = {"add  "    }; break;
-  case 0x7b:          s = {"adc  "    }; break;
-  case 0x7c:          s = {"decb "    }; break;
-  case 0x7d:          s = {"rtn  "    }; break;
-  case 0x7e:          s = {"rtns "    }; break;
-  case 0x7f:          s = {"rtni "    }; break;
-  case 0x80 ... 0xbf: s = {"tr   ", p6}; break;
-  case 0xc0 ... 0xdf: s = {"trs  ", p5}; break;
-  case 0xe0 ... 0xef: s = {"tl   ", pc}; break;
-  case 0xf0 ... 0xff: s = {"call ", pc}; break;
+  case range16(0x00, 0x0f): s = {"adx  ", p4}; break;
+  case range16(0x10, 0x1f): s = {"lax  ", p4}; break;
+  case range16(0x20, 0x2f): s = {"lblx ", p4}; break;
+  case range16(0x30, 0x3f): s = {"lbmx ", p4}; break;
+  case range4 (0x40, 0x43): s = {"rm   ", p2}; break;
+  case range4 (0x44, 0x47): s = {"sm   ", p2}; break;
+  case range4 (0x48, 0x4b): s = {"tm   ", p2}; break;
+  case range4 (0x4c, 0x4f): s = {"tpb  ", p2}; break;
+  case range4 (0x50, 0x53): s = {"lda  ", p2}; break;
+  case range4 (0x54, 0x57): s = {"exc  ", p2}; break;
+  case range4 (0x58, 0x5b): s = {"exci ", p2}; break;
+  case range4 (0x5c, 0x5f): s = {"excd ", p2}; break;
+  case 0x60:                s = {"rc   "    }; break;
+  case 0x61:                s = {"sc   "    }; break;
+  case 0x62:                s = {"id   "    }; break;
+  case 0x63:                s = {"ie   "    }; break;
+  case 0x64:                s = {"exax "    }; break;
+  case 0x65:                s = {"atx  "    }; break;
+  case 0x66:                s = {"exbm "    }; break;
+  case 0x67:                s = {"exbl "    }; break;
+  case 0x68:                s = {"ex   "    }; break;
+  case 0x69:                s = {"dta  ", p8}; break;
+  case 0x6a:                s = {"pat  ", p8}; break;
+  case 0x6b:                s = {"tabl "    }; break;
+  case 0x6c:                s = {"ta   "    }; break;
+  case 0x6d:                s = {"tb   "    }; break;
+  case 0x6e:                s = {"tc   "    }; break;
+  case 0x6f:                s = {"tam  "    }; break;
+  case 0x70:                s = {"inl  "    }; break;
+  case 0x71:                s = {"outl "    }; break;
+  case 0x72:                s = {"anp  "    }; break;
+  case 0x73:                s = {"orp  "    }; break;
+  case 0x74:                s = {"in   "    }; break;
+  case 0x75:                s = {"out  "    }; break;
+  case 0x76:                s = {"stop "    }; break;
+  case 0x77:                s = {"halt "    }; break;
+  case 0x78:                s = {"incb "    }; break;
+  case 0x79:                s = {"coma "    }; break;
+  case 0x7a:                s = {"add  "    }; break;
+  case 0x7b:                s = {"adc  "    }; break;
+  case 0x7c:                s = {"decb "    }; break;
+  case 0x7d:                s = {"rtn  "    }; break;
+  case 0x7e:                s = {"rtns "    }; break;
+  case 0x7f:                s = {"rtni "    }; break;
+  case range64(0x80, 0xbf): s = {"tr   ", p6}; break;
+  case range32(0xc0, 0xdf): s = {"trs  ", p5}; break;
+  case range16(0xe0, 0xef): s = {"tl   ", pc}; break;
+  case range16(0xf0, 0xff): s = {"call ", pc}; break;
   }
 
   while(s.size() < 10) s.append(" ");
diff --git a/waterbox/ares64/ares/ares/component/processor/sm5k/instruction.cpp b/waterbox/ares64/ares/ares/component/processor/sm5k/instruction.cpp
index dc38b50f3d..b992f1f4fe 100644
--- a/waterbox/ares64/ares/ares/component/processor/sm5k/instruction.cpp
+++ b/waterbox/ares64/ares/ares/component/processor/sm5k/instruction.cpp
@@ -20,54 +20,54 @@ auto SM5K::instruction() -> void {
 
   n8 opcode = fetch();
   switch(opcode) {
-  op(0x00 ... 0x0f, ADX,  n4(opcode));
-  op(0x10 ... 0x1f, LAX,  n4(opcode));
-  op(0x20 ... 0x2f, LBLX, n4(opcode));
-  op(0x30 ... 0x3f, LBMX, n4(opcode));
-  op(0x40 ... 0x43, RM,   n2(opcode));
-  op(0x44 ... 0x47, SM,   n2(opcode));
-  op(0x48 ... 0x4b, TM,   n2(opcode));
-  op(0x4c ... 0x4f, TPB,  n2(opcode));
-  op(0x50 ... 0x53, LDA,  n2(opcode));
-  op(0x54 ... 0x57, EXC,  n2(opcode));
-  op(0x58 ... 0x5b, EXCI, n2(opcode));
-  op(0x5c ... 0x5f, EXCD, n2(opcode));
-  op(0x60,          RC    );
-  op(0x61,          SC    );
-  op(0x62,          ID    );
-  op(0x63,          IE    );
-  op(0x64,          EXAX  );
-  op(0x65,          ATX   );
-  op(0x66,          EXBM  );
-  op(0x67,          EXBL  );
-  op(0x68,          EX    );
-  op(0x69,          DTA,  fetch());
-  op(0x6a,          PAT,  fetch());
-  op(0x6b,          TABL  );
-  op(0x6c,          TA    );
-  op(0x6d,          TB    );
-  op(0x6e,          TC    );
-  op(0x6f,          TAM   );
-  op(0x70,          INL   );
-  op(0x71,          OUTL  );
-  op(0x72,          ANP   );
-  op(0x73,          ORP   );
-  op(0x74,          IN    );
-  op(0x75,          OUT   );
-  op(0x76,          STOP  );
-  op(0x77,          HALT  );
-  op(0x78,          INCB  );
-  op(0x79,          COMA  );
-  op(0x7a,          ADD   );
-  op(0x7b,          ADC   );
-  op(0x7c,          DECB  );
-  op(0x7d,          RTN   );
-  op(0x7e,          RTNS  );
-  op(0x7f,          RTNI  );
-  op(0x80 ... 0xbf, TR,   n6(opcode));
-  op(0xc0 ... 0xdf, TRS,  n5(opcode));
-  op(0xe0 ... 0xef, TL,   n4(opcode) << 8 | fetch());
-  op(0xf0 ... 0xff, CALL, n4(opcode) << 8 | fetch());
+  op(range16(0x00, 0x0f), ADX,  n4(opcode));
+  op(range16(0x10, 0x1f), LAX,  n4(opcode));
+  op(range16(0x20, 0x2f), LBLX, n4(opcode));
+  op(range16(0x30, 0x3f), LBMX, n4(opcode));
+  op(range4 (0x40, 0x43), RM,   n2(opcode));
+  op(range4 (0x44, 0x47), SM,   n2(opcode));
+  op(range4 (0x48, 0x4b), TM,   n2(opcode));
+  op(range4 (0x4c, 0x4f), TPB,  n2(opcode));
+  op(range4 (0x50, 0x53), LDA,  n2(opcode));
+  op(range4 (0x54, 0x57), EXC,  n2(opcode));
+  op(range4 (0x58, 0x5b), EXCI, n2(opcode));
+  op(range4 (0x5c, 0x5f), EXCD, n2(opcode));
+  op(0x60,                RC    );
+  op(0x61,                SC    );
+  op(0x62,                ID    );
+  op(0x63,                IE    );
+  op(0x64,                EXAX  );
+  op(0x65,                ATX   );
+  op(0x66,                EXBM  );
+  op(0x67,                EXBL  );
+  op(0x68,                EX    );
+  op(0x69,                DTA,  fetch());
+  op(0x6a,                PAT,  fetch());
+  op(0x6b,                TABL  );
+  op(0x6c,                TA    );
+  op(0x6d,                TB    );
+  op(0x6e,                TC    );
+  op(0x6f,                TAM   );
+  op(0x70,                INL   );
+  op(0x71,                OUTL  );
+  op(0x72,                ANP   );
+  op(0x73,                ORP   );
+  op(0x74,                IN    );
+  op(0x75,                OUT   );
+  op(0x76,                STOP  );
+  op(0x77,                HALT  );
+  op(0x78,                INCB  );
+  op(0x79,                COMA  );
+  op(0x7a,                ADD   );
+  op(0x7b,                ADC   );
+  op(0x7c,                DECB  );
+  op(0x7d,                RTN   );
+  op(0x7e,                RTNS  );
+  op(0x7f,                RTNI  );
+  op(range64(0x80, 0xbf), TR,   n6(opcode));
+  op(range32(0xc0, 0xdf), TRS,  n5(opcode));
+  op(range16(0xe0, 0xef), TL,   n4(opcode) << 8 | fetch());
+  op(range16(0xf0, 0xff), CALL, n4(opcode) << 8 | fetch());
   }
 }
 
diff --git a/waterbox/ares64/ares/ares/n64/accuracy.hpp b/waterbox/ares64/ares/ares/n64/accuracy.hpp
index 76875d61b0..007ecd4641 100644
--- a/waterbox/ares64/ares/ares/n64/accuracy.hpp
+++ b/waterbox/ares64/ares/ares/n64/accuracy.hpp
@@ -3,7 +3,7 @@ struct Accuracy {
   static constexpr bool Reference = 0;
 
   struct CPU {
-    static constexpr bool Interpreter = 0 | Reference;
+    static constexpr bool Interpreter = 0 | Reference | !recompiler::generic::supported;
     static constexpr bool Recompiler = !Interpreter;
 
     //exceptions when the CPU accesses unaligned memory addresses
@@ -11,7 +11,7 @@ struct Accuracy {
   };
 
   struct RSP {
-    static constexpr bool Interpreter = 0 | Reference;
+    static constexpr bool Interpreter = 0 | Reference | !recompiler::generic::supported;
     static constexpr bool Recompiler = !Interpreter;
 
     //VU instructions
@@ -22,4 +22,9 @@ struct Accuracy {
   struct RDRAM {
     static constexpr bool Broadcasting = 0;
   };
+
+  struct PIF {
+    // Emulate a region-locked console
+    static constexpr bool RegionLock = false;
+  };
 };
diff --git a/waterbox/ares64/ares/ares/n64/ai/ai.hpp b/waterbox/ares64/ares/ares/n64/ai/ai.hpp
index 5f3dee5313..a3b5a13f33 100644
--- a/waterbox/ares64/ares/ares/n64/ai/ai.hpp
+++ b/waterbox/ares64/ares/ares/n64/ai/ai.hpp
@@ -1,6 +1,6 @@
 //Audio Interface
 
-struct AI : Thread, Memory::IO<AI> {
+struct AI : Thread, Memory::RCP<AI> {
   Node::Object node;
   Node::Audio::Stream stream;
 
@@ -23,8 +23,8 @@ struct AI : Thread, Memory::IO<AI> {
   auto power(bool reset) -> void;
 
   //io.cpp
-  auto readWord(u32 address) -> u32;
-  auto writeWord(u32 address, u32 data) -> void;
+  auto readWord(u32 address, u32& cycles) -> u32;
+  auto writeWord(u32 address, u32 data, u32& cycles) -> void;
 
   //serialization.cpp
   auto serialize(serializer&) -> void;
diff --git a/waterbox/ares64/ares/ares/n64/ai/io.cpp b/waterbox/ares64/ares/ares/n64/ai/io.cpp
index 3f293a7e78..011d203033 100644
--- a/waterbox/ares64/ares/ares/n64/ai/io.cpp
+++ b/waterbox/ares64/ares/ares/n64/ai/io.cpp
@@ -1,4 +1,4 @@
-auto AI::readWord(u32 address) -> u32 {
+auto AI::readWord(u32 address, u32& cycles) -> u32 {
   address = (address & 0xfffff) >> 2;
   n32 data;
 
@@ -21,7 +21,7 @@ auto AI::readWord(u32 address) -> u32 {
   return data;
 }
 
-auto AI::writeWord(u32 address, u32 data_) -> void {
+auto AI::writeWord(u32 address, u32 data_, u32& cycles) -> void {
   address = (address & 0xfffff) >> 2;
   n32 data = data_;
 
diff --git a/waterbox/ares64/ares/ares/n64/cartridge/cartridge.cpp b/waterbox/ares64/ares/ares/n64/cartridge/cartridge.cpp
index de58f54f1f..9826499ec4 100644
--- a/waterbox/ares64/ares/ares/n64/cartridge/cartridge.cpp
+++ b/waterbox/ares64/ares/ares/n64/cartridge/cartridge.cpp
@@ -5,6 +5,8 @@ namespace ares::Nintendo64 {
 Cartridge& cartridge = cartridgeSlot.cartridge;
 #include "slot.cpp"
 #include "flash.cpp"
+#include "rtc.cpp"
+#include "joybus.cpp"
 #include "isviewer.cpp"
 #include "debugger.cpp"
 #include "serialization.cpp"
@@ -43,6 +45,8 @@ auto Cartridge::connect() -> void {
     flash.load(fp);
   }
 
+  rtc.load();
+
   isviewer.ram.allocate(64_KiB);
 
   debugger.load(node);
@@ -77,6 +81,8 @@ auto Cartridge::save() -> void {
   if(auto fp = pak->write("save.flash")) {
     flash.save(fp);
   }
+
+  rtc.save();
 }
 
 auto Cartridge::power(bool reset) -> void {
@@ -85,6 +91,7 @@ auto Cartridge::power(bool reset) -> void {
   flash.source = 0;
   flash.offset = 0;
   isviewer.ram.fill(0);
+  rtc.power(reset);
 }
 
 }
diff --git a/waterbox/ares64/ares/ares/n64/cartridge/cartridge.hpp b/waterbox/ares64/ares/ares/n64/cartridge/cartridge.hpp
index 190a9526b3..22825ba5e3 100644
--- a/waterbox/ares64/ares/ares/n64/cartridge/cartridge.hpp
+++ b/waterbox/ares64/ares/ares/n64/cartridge/cartridge.hpp
@@ -38,14 +38,38 @@ struct Cartridge {
     u32  source = 0;
     u32  offset = 0;
   } flash;
-  struct ISViewer : Memory::IO<ISViewer> {
+  struct ISViewer : Memory::PI<ISViewer> {
     Memory::Writable ram;  //unserialized
 
     //isviewer.cpp
+    auto readHalf(u32 address) -> u16;
+    auto writeHalf(u32 address, u16 data) -> void;
     auto readWord(u32 address) -> u32;
     auto writeWord(u32 address, u32 data) -> void;
   } isviewer;
 
+  struct RTC {
+    Cartridge& self;  //owning cartridge; load() reads "save.rtc" through self.pak
+    RTC(Cartridge &self) : self(self) {}
+
+    Memory::Writable ram;  //contents of "save.rtc"; advance() keeps BCD clock fields in bytes 16-23, load() reads a timestamp at byte 24
+    n1 present;  //set by load() once a "save.rtc" file was found
+    n8 status;  //bit 7 set means the clock is stopped (see run()/running())
+    n3 writeLock;
+
+    // rtc.cpp
+    auto power(bool reset) -> void;
+    auto run(bool run) -> void;  //start/stop the clock and (re)schedule the tick event
+    auto running() -> bool;
+    auto load() -> void;
+    auto save() -> void;
+    auto tick(int nsec=1) -> void;  //advance by nsec seconds, then keep the clock running
+    auto advance(int nsec) -> void;  //add nsec seconds to the BCD clock fields
+    auto serialize(serializer& s) -> void;
+    auto read(u2 block, n8 *data) -> void;  //block selects an 8-byte group of ram
+    auto write(u2 block, n8 *data) -> void;
+  } rtc{*this};
+
   struct Debugger {
     //debugger.cpp
     auto load(Node::Object) -> void;
@@ -67,10 +91,12 @@ struct Cartridge {
   auto allocate(Node::Port) -> Node::Peripheral;
   auto connect() -> void;
   auto disconnect() -> void;
-
   auto save() -> void;
   auto power(bool reset) -> void;
 
+  //joybus.cpp
+  auto joybusComm(n8 send, n8 recv, n8 input[], n8 output[]) -> n2;
+
   //serialization.cpp
   auto serialize(serializer&) -> void;
 
diff --git a/waterbox/ares64/ares/ares/n64/cartridge/isviewer.cpp b/waterbox/ares64/ares/ares/n64/cartridge/isviewer.cpp
index 496ad4d00a..f7df89abfa 100644
--- a/waterbox/ares64/ares/ares/n64/cartridge/isviewer.cpp
+++ b/waterbox/ares64/ares/ares/n64/cartridge/isviewer.cpp
@@ -1,22 +1,39 @@
-auto Cartridge::ISViewer::readWord(u32 address) -> u32 {
-  u32 data = ram.read<Word>(address);
-  address = (address & 0xffff) >> 2;
-
-  if(address == 0) {
-    data = 0x49533634;  //'IS64'
-  }
-
-  return data;
+auto Cartridge::ISViewer::readHalf(u32 address) -> u16 {
+  address = (address & 0xffff);
+  return ram.read<Half>(address);
 }
 
-auto Cartridge::ISViewer::writeWord(u32 address, u32 data) -> void {
-  ram.write<Word>(address, data);
-  address = (address & 0xffff) >> 2;
+auto Cartridge::ISViewer::readWord(u32 address) -> u32 {
+  address = (address & 0xffff);
+  return ram.read<Word>(address);
+}
 
-  if(address == 5) {
-    for(auto address : range(u16(data))) {
+auto Cartridge::ISViewer::writeHalf(u32 address, u16 data) -> void {
+  address = (address & 0xffff);
+
+  if(address == 0x16) {
+    // HACK: allow printf output to work for both libultra and libdragon
+    // Libultra expects a real IS-Viewer device and treats this address as a
+    // pointer to the end of the buffer, reading the current value, writing N
+    // bytes, then updating the buffer pointer.
+    // libdragon instead treats this as a "number of bytes" register, only
+    // writing an "output byte count"
+    // In order to satisfy both libraries, we assume it behaves as libdragon
+    // expects, and by forcing the write to never hit ram, libultra remains
+    // functional.
+    for(auto address : range(data)) {
       char c = ram.read<Byte>(0x20 + address);
       fputc(c, stdout);
     }
+    return;
   }
+
+  ram.write<Half>(address, data);
 }
+
+auto Cartridge::ISViewer::writeWord(u32 address, u32 data) -> void {
+  //a word write is two half writes, upper 16 bits first, so the special case in writeHalf still applies
+  writeHalf((address & 0xffff) + 0, u16(data >> 16));
+  writeHalf((address & 0xffff) + 2, u16(data));
+}
+
diff --git a/waterbox/ares64/ares/ares/n64/cartridge/joybus.cpp b/waterbox/ares64/ares/ares/n64/cartridge/joybus.cpp
new file mode 100644
index 0000000000..a2461d6b65
--- /dev/null
+++ b/waterbox/ares64/ares/ares/n64/cartridge/joybus.cpp
@@ -0,0 +1,75 @@
+//Handles one joybus command for the cartridge: input[0] is the command byte; returns bit 0 = valid, bit 1 = over.
+auto Cartridge::joybusComm(n8 send, n8 recv, n8 input[], n8 output[]) -> n2 {
+  n1 valid = 0, over = 0;  //over is never raised in this function
+  
+  //status (command 0x00 or 0xff): the 3-byte reply depends on the fitted EEPROM size
+  if(input[0] == 0x00 || input[0] == 0xff) {
+    //cartridge EEPROM (4kbit)
+    if(cartridge.eeprom.size == 512) {
+      output[0] = 0x00;
+      output[1] = 0x80;
+      output[2] = 0x00;
+      valid = 1;
+    }
+
+    //cartridge EEPROM (16kbit)
+    if(cartridge.eeprom.size == 2048) {
+      output[0] = 0x00;
+      output[1] = 0xc0;
+      output[2] = 0x00;
+      valid = 1;
+    }
+  }
+
+  //read EEPROM (command 0x04): input[1] selects an 8-byte block, recv bytes are returned from there
+  if(input[0] == 0x04 && send >= 2) {
+    u32 address = input[1] * 8;
+    for(u32 index : range(recv)) {
+      output[index] = cartridge.eeprom.read<Byte>(address++);
+    }
+    valid = 1;
+  }
+
+  //write EEPROM (command 0x05): the bytes after the 2-byte header are stored starting at block input[1]
+  if(input[0] == 0x05 && send >= 2 && recv >= 1) {
+    u32 address = input[1] * 8;
+    for(u32 index : range(send - 2)) {
+      cartridge.eeprom.write<Byte>(address++, input[2 + index]);
+    }
+    output[0] = 0x00;
+    valid = 1;
+  }
+
+  //RTC status (command 0x06): only answered when an RTC is present
+  if(input[0] == 0x06 && send >= 1 && recv >= 3) {
+    if(cartridge.rtc.present) {
+      output[0] = 0x00;
+      output[1] = 0x10;
+      output[2] = rtc.status;
+      valid = 1;
+    }
+  }
+
+  //RTC read (command 0x07): block input[1] is read into output, followed by a 0x00 byte at output[8]
+  if(input[0] == 0x07 && send >= 2 && recv >= 9) {
+    if(cartridge.rtc.present) {
+      rtc.read(input[1], &output[0]);
+      output[8] = 0x00;
+      valid = 1;
+    }
+  }
+
+  //RTC write (command 0x08): data starting at input[2] is written to block input[1]; replies 0x00
+  if(input[0] == 0x08 && send >= 10 && recv >= 1) {
+    if(cartridge.rtc.present) {
+      rtc.write(input[1], &input[2]);
+      output[0] = 0x00;
+      valid = 1;
+    }
+  }
+
+  n2 status;
+  status.bit(0) = valid;
+  status.bit(1) = over;
+  return status;
+}
diff --git a/waterbox/ares64/ares/ares/n64/cartridge/rtc.cpp b/waterbox/ares64/ares/ares/n64/cartridge/rtc.cpp
new file mode 100644
index 0000000000..66530fa593
--- /dev/null
+++ b/waterbox/ares64/ares/ares/n64/cartridge/rtc.cpp
@@ -0,0 +1,102 @@
+
+auto Cartridge::RTC::power(bool reset) -> void {  //reset is unused
+  if(present) run(running());  //re-applies the current run/stop state, which re-queues or cancels the tick event
+}
+
+auto Cartridge::RTC::load() -> void {
+  //no "save.rtc" in the pak: nothing is loaded and present is left untouched
+  auto fp = self.pak->read("save.rtc");
+  if(!fp) return;
+  ram.allocate(fp->size());
+  ram.load(fp);
+  present = 1;
+
+  //ram[24..31] is a saved timestamp; all bits set marks a blank image: zero it, set byte 21 to 1
+  n64 saved = ram.read<Dual>(24);
+  if(!~saved) { ram.fill(0); ram.write<Byte>(21, 1); }
+
+  //catch up by host time minus the saved timestamp (saved keeps its pre-clear value)
+  n64 elapsed = platform->time() - saved;
+  advance(elapsed);
+}
+
+auto Cartridge::RTC::save() -> void {  //currently does nothing: the body below is compiled out by #if false
+#if false
+  if(auto fp = self.pak->write("save.rtc")) {
+    ram.write<Dual>(24, time(0));
+    ram.save(fp);
+  }
+#endif
+}
+
+auto Cartridge::RTC::tick(int nsec) -> void {  //advances the clock, then re-arms the next tick event
+  advance(nsec);  //nsec is added directly to a time_t inside advance
+  run(true);      //also clears the stop flag (status bit 7)
+}
+
+auto Cartridge::RTC::run(bool run) -> void {  //starts (true) or stops (false) the ticking clock
+  status.bit(7) = !run;  //status bit 7 is set while the clock is stopped
+  queue.remove(Queue::RTC_Tick);  //drop any pending tick before possibly queuing a fresh one
+  if(run) queue.insert(Queue::RTC_Tick, 187'500'000);  //NOTE(review): presumably one second of queue time - confirm
+}
+
+auto Cartridge::RTC::running() -> bool {  //true while the stop flag (status bit 7) is clear
+  return !status.bit(7);
+}
+
+auto Cartridge::RTC::advance(int nsec) -> void {  //adds nsec (a time_t delta) to the BCD calendar stored in ram[16..23]
+  struct tm tmm = {};
+  tmm.tm_sec = BCD::decode(ram.read<Byte>(16));
+  tmm.tm_min = BCD::decode(ram.read<Byte>(17));
+  tmm.tm_hour = BCD::decode(ram.read<Byte>(18) & 0x7f);  //bit 7 is always set on store below; strip it
+  tmm.tm_mday = BCD::decode(ram.read<Byte>(19));
+  tmm.tm_mon = BCD::decode(ram.read<Byte>(21)) - 1;  //stored month is 1-based, tm_mon is 0-based
+  tmm.tm_year = BCD::decode(ram.read<Byte>(22)) + 100 * BCD::decode(ram.read<Byte>(23));  //byte 23 counts centuries since 1900
+  tmm.tm_isdst = -1;  //let mktime decide on DST; a hard 0 makes the mktime/localtime round trip gain an hour for dates inside DST
+  time_t t = mktime(&tmm) + nsec;
+  auto local = localtime(&t);
+  if(!local) return;  //localtime may return null for an unrepresentable time; keep the stored clock as it was
+  tmm = *local;
+  ram.write<Byte>(16, BCD::encode(tmm.tm_sec));
+  ram.write<Byte>(17, BCD::encode(tmm.tm_min));
+  ram.write<Byte>(18, BCD::encode(tmm.tm_hour) | 0x80);
+  ram.write<Byte>(19, BCD::encode(tmm.tm_mday));
+  ram.write<Byte>(20, BCD::encode(tmm.tm_wday));
+  ram.write<Byte>(21, BCD::encode(tmm.tm_mon + 1));
+  ram.write<Byte>(22, BCD::encode(tmm.tm_year % 100));
+  ram.write<Byte>(23, BCD::encode(tmm.tm_year / 100));
+}
+
+auto Cartridge::RTC::read(u2 block, n8* data) -> void {
+  //copies one 8-byte block of RTC ram into the caller's buffer
+  //  block: block number; being a u2 it can only select blocks 0..3
+  //  data:  destination, written at data[0] through data[7]
+  const u32 base = block*8;
+
+  for(u32 offset : range(8)) {
+    data[offset] = ram.read<Byte>(base + offset);
+  }
+}
+
+auto Cartridge::RTC::write(u2 block, n8* data) -> void {
+  //stores data[0..7] into the selected 8-byte block unless that block is write-locked;
+  //writing block 0 also re-applies the 16-bit control word read back from ram offset 0
+  if(writeLock.bit(block)) return;
+
+  const u32 base = block*8;
+  for(u32 offset : range(8)) {
+    ram.write<Byte>(base + offset, data[offset]);
+  }
+
+  if(block != 0) return;
+
+  n16 control = ram.read<Half>(0);
+  writeLock.bit(1,2) = control.bit(8,9);  //control bits 8..9 become the locks for blocks 1..2
+  run(!control.bit(2));                   //control bit 2 set stops the clock, clear starts it
+}
+
+auto Cartridge::RTC::serialize(serializer& s) -> void {  //passes ram, status and writeLock through the serializer; present is not included
+  s(ram);
+  s(status);
+  s(writeLock);
+}
diff --git a/waterbox/ares64/ares/ares/n64/cartridge/serialization.cpp b/waterbox/ares64/ares/ares/n64/cartridge/serialization.cpp
index 49a70aac66..72c96f2d97 100644
--- a/waterbox/ares64/ares/ares/n64/cartridge/serialization.cpp
+++ b/waterbox/ares64/ares/ares/n64/cartridge/serialization.cpp
@@ -2,4 +2,5 @@ auto Cartridge::serialize(serializer& s) -> void {
   s(ram);
   s(eeprom);
   s(flash);
+  s(rtc);
 }
diff --git a/waterbox/ares64/ares/ares/n64/cic/cic.cpp b/waterbox/ares64/ares/ares/n64/cic/cic.cpp
new file mode 100644
index 0000000000..c0d85195de
--- /dev/null
+++ b/waterbox/ares64/ares/ares/n64/cic/cic.cpp
@@ -0,0 +1,93 @@
+#include <n64/n64.hpp>
+
+namespace ares::Nintendo64 {
+
+CIC cic;
+#include "io.cpp"
+#include "commands.cpp"
+#include "serialization.cpp"
+
+auto CIC::power(bool reset) -> void {  //selects the per-model parameters and restarts the boot sequence; reset is unused
+  model = cartridge.node ? cartridge.cic() : dd.cic();  //no cartridge node: take the model from dd instead
+  type = Cartridge;  //defaults; individual models below override type and challengeAlgo
+  challengeAlgo = DummyChallenge;  //a model matching none of the lines below keeps its previous region/seed/checksum
+  if(model == "CIC-NUS-6101") region = NTSC, seed = 0x3f, checksum = 0x45cc73ee317aull;
+  if(model == "CIC-NUS-6102") region = NTSC, seed = 0x3f, checksum = 0xa536c0f1d859ull;
+  if(model == "CIC-NUS-7101") region = PAL,  seed = 0x3f, checksum = 0xa536c0f1d859ull;
+  if(model == "CIC-NUS-7102") region = PAL,  seed = 0x3f, checksum = 0x44160ec5d9afull;
+  if(model == "CIC-NUS-6103") region = NTSC, seed = 0x78, checksum = 0x586fd4709867ull;
+  if(model == "CIC-NUS-7103") region = PAL,  seed = 0x78, checksum = 0x586fd4709867ull;
+  if(model == "CIC-NUS-6105") region = NTSC, seed = 0x91, checksum = 0x8618a45bc2d3ull, challengeAlgo = RealChallenge;
+  if(model == "CIC-NUS-7105") region = PAL,  seed = 0x91, checksum = 0x8618a45bc2d3ull, challengeAlgo = RealChallenge;
+  if(model == "CIC-NUS-6106") region = NTSC, seed = 0x85, checksum = 0x2bbad4e6eb74ull;
+  if(model == "CIC-NUS-7106") region = PAL,  seed = 0x85, checksum = 0x2bbad4e6eb74ull;
+  if(model == "CIC-NUS-8303") region = NTSC, seed = 0xdd, checksum = 0x32b294e2ab90ull, type = DD64;
+  if(model == "CIC-NUS-8401") region = NTSC, seed = 0xdd, checksum = 0x6ee8d9e84970ull, type = DD64;
+  if(model == "CIC-NUS-5167") region = NTSC, seed = 0xdd, checksum = 0x083c6c77e0b1ull;
+  if(model == "CIC-NUS-DDUS") region = NTSC, seed = 0xde, checksum = 0x05ba2ef0a5f1ull, type = DD64;
+  state = BootRegion;  //poll() starts again from the first boot stage
+  fifo.bits.resize(32*4);  //bit queue sized for 128 entries
+}
+
+auto CIC::scramble(n4 *buf, int size) -> void {  //one in-place pass: each entry absorbs the (already updated) entry before it, plus one
+  for(int i : range(1,size)) buf[i] += buf[i-1] + 1;  //NOTE(review): presumably visits i = 1..size-1 with n4 sums wrapping to 4 bits - confirm
+}
+
+auto CIC::poll() -> void {
+  //runs one step of the CIC state machine; the three boot states each queue a reply
+  //into the fifo and move on, Run decodes 2-bit commands, Challenge resumes a pending
+  //challenge and Dead does nothing
+  switch(state) {
+  case BootRegion: {
+    //four bits: type, PAL flag, then a constant 0 and 1
+    fifo.write(type);
+    fifo.write(region == PAL);
+    fifo.write(0);
+    fifo.write(1);
+    state = BootSeed;
+    return;
+  }
+  case BootSeed: {
+    //six nibbles: 0xB, 0x5, then the seed's high and low nibble twice; scrambled twice
+    n4 nibbles[6];
+    nibbles[0] = 0xB;
+    nibbles[1] = 0x5;
+    nibbles[2] = seed.bit(4,7);
+    nibbles[3] = seed.bit(0,3);
+    nibbles[4] = seed.bit(4,7);
+    nibbles[5] = seed.bit(0,3);
+    for(auto pass : range(2)) scramble(nibbles, 6);
+    for(auto n : range(6)) fifo.writeNibble(nibbles[n]);
+    state = BootChecksum;
+    return;
+  }
+  case BootChecksum: {
+    //sixteen nibbles: four fixed ones, then the checksum from its top nibble down; scrambled four times
+    n4 nibbles[16];
+    nibbles[0] = 0x4;  //true random
+    nibbles[1] = 0x7;  //true random
+    nibbles[2] = 0xA;  //true random
+    nibbles[3] = 0x1;  //true random
+    for(auto n : range(12)) nibbles[n+4] = checksum.bit(44-n*4,47-n*4);
+    for(auto pass : range(4)) scramble(nibbles, 16);
+    for(auto n : range(16)) fifo.writeNibble(nibbles[n]);
+    state = Run;
+    return;
+  }
+  case Run: {
+    if(fifo.size() < 2) return;  //a command needs two bits
+    n2 command;
+    command.bit(1) = fifo.read();
+    command.bit(0) = fifo.read();
+    if(command == 0b00) return cmdCompare();
+    if(command == 0b01) return cmdDie();
+    if(command == 0b10) return cmdChallenge();
+    if(command == 0b11) return cmdReset();
+    return;
+  }
+  case Challenge: return cmdChallenge();
+  case Dead: return;
+  }
+}
+
+}
\ No newline at end of file
diff --git a/waterbox/ares64/ares/ares/n64/cic/cic.hpp b/waterbox/ares64/ares/ares/n64/cic/cic.hpp
new file mode 100644
index 0000000000..95b2e57f0c
--- /dev/null
+++ b/waterbox/ares64/ares/ares/n64/cic/cic.hpp
@@ -0,0 +1,60 @@
+
+struct CIC {  //state and interface of the emulated CIC chip
+  enum State : u32 { BootRegion, BootSeed, BootChecksum, Run, Challenge, Dead };  //values held by the state member
+  enum Region : u32 { NTSC, PAL };
+  enum ChallengeAlgo : bool { DummyChallenge, RealChallenge };
+  enum Type : u32 { Cartridge, DD64 };
+
+  struct {  //single bit queue; both the write* and read* entry points below operate on it
+    nall::queue<n1> bits;
+
+    auto empty() -> bool { return bits.empty(); }
+    auto size() -> u32 { return bits.size(); }
+    auto write(n1 data) -> void { bits.write(data); }
+    auto read() -> n1 { return bits.read(); }
+    auto writeNibble(n4 data) -> void {  //queues bit 3 first and bit 0 last
+      write(data.bit(3));
+      write(data.bit(2));
+      write(data.bit(1));
+      write(data.bit(0));
+    }
+    auto readNibble() -> n4 {  //the first bit read becomes bit 3, the fourth becomes bit 0
+      n4 data;
+      data.bit(3) = read();
+      data.bit(2) = read();
+      data.bit(1) = read();
+      data.bit(0) = read();
+      return data;
+    }
+  } fifo;
+  n8 seed;
+  n48 checksum; //ipl2 checksum
+  n1 type;  //holds a Type value
+  n1 region;  //holds a Region value
+  n1 challengeAlgo;  //holds a ChallengeAlgo value
+  u32 state;  //holds a State value
+  string model;  //model name such as "CIC-NUS-6102"
+
+  //cic.cpp
+  auto power(bool reset) -> void;
+  auto poll() -> void;
+  auto scramble(n4 *buf, int size) -> void;
+
+  //io.cpp
+  auto readBit() -> n1;
+  auto readNibble() -> n4;
+  auto writeBit(n1 cmd) -> void;
+  auto writeNibble(n4 cmd) -> void;
+
+  //commands.cpp
+  auto cmdCompare() -> void;
+  auto cmdDie() -> void;
+  auto cmdChallenge() -> void;
+  auto cmdReset() -> void;
+  auto challenge(n4 data[30]) -> void;  //transforms the 30 nibbles in place
+
+  //serialization.cpp
+  auto serialize(serializer&) -> void;
+};
+
+extern CIC cic;
diff --git a/waterbox/ares64/ares/ares/n64/cic/commands.cpp b/waterbox/ares64/ares/ares/n64/cic/commands.cpp
new file mode 100644
index 0000000000..bb7fee96ea
--- /dev/null
+++ b/waterbox/ares64/ares/ares/n64/cic/commands.cpp
@@ -0,0 +1,65 @@
+
+auto CIC::cmdCompare() -> void {   //compare command: accepted, but no action is taken
+}
+
+auto CIC::challenge(n4 mem[30]) -> void {
+  //rewrites the 30 nibbles in mem in place using the algorithm selected by challengeAlgo
+  if(challengeAlgo == DummyChallenge) {
+    //dummy variant: each nibble is replaced by its bitwise complement
+    for(u32 n : range(30)) mem[n] = ~mem[n];
+    return;
+  }
+
+  if(!(challengeAlgo == RealChallenge)) return;
+
+  //CIC-NUS-6105 anti-piracy challenge
+  static n4 lut[32] = {
+    0x4, 0x7, 0xa, 0x7, 0xe, 0x5, 0xe, 0x1,
+    0xc, 0xf, 0x8, 0xf, 0x6, 0x3, 0x6, 0x9,
+    0x4, 0x1, 0xa, 0x7, 0xe, 0x5, 0xe, 0x1,
+    0xc, 0x9, 0x8, 0x5, 0x6, 0x3, 0xc, 0x9,
+  };
+
+  n4 key = 0xb;   //running key, reloaded from lut after every nibble
+  n1 bank = 0;    //selects the lower (0) or upper (1) half of lut
+  for(u32 n : range(30)) {
+    n4 value = key + 5 * mem[n];
+    mem[n] = value;
+    key = lut[bank << 4 | value];
+    n1 flip = value >> 3;
+    n3 low = value >> 0;
+    if(flip) low = ~low;
+    if(low % 3 != 1) flip = !flip;
+    if(bank) {
+      if(value == 0x1 || value == 0x9) flip = 1;
+      if(value == 0xb || value == 0xe) flip = 0;
+    }
+    bank = flip;
+  }
+}
+
+auto CIC::cmdChallenge() -> void {  //challenge command: queue the 0xa 0xa reply, collect 30 nibbles, answer with the transformed nibbles
+  if(state == Run) {
+    fifo.writeNibble(0xa);  //two 0xa nibbles are queued as the immediate reply
+    fifo.writeNibble(0xa);
+    state = Challenge;
+  }
+  if(state == Challenge && fifo.size() == 30*4) {  //proceed only once the fifo holds exactly 120 bits
+    n4 data[30];
+    for (auto i : range(30)) data[i] = fifo.readNibble();
+    challenge(data);
+    fifo.write(0); // write 0 bit
+    for (auto i : range(30)) fifo.writeNibble(data[i]);
+    state = Run;
+    printf("CIC challenge complete %u\n", fifo.size());  //fifo.size() returns u32, hence %u rather than %d
+  }
+}
+
+auto CIC::cmdDie() -> void {  //die command: report it and enter the Dead state
+  debug(unusual, "[CIC::cmdDie] die command received by PIF");
+  state = Dead;
+}
+
+auto CIC::cmdReset() -> void {  //reset command: not implemented, only reported through debug()
+  debug(unimplemented, "[CIC::cmdReset]");
+}
diff --git a/waterbox/ares64/ares/ares/n64/cic/io.cpp b/waterbox/ares64/ares/ares/n64/cic/io.cpp
new file mode 100644
index 0000000000..78c2df701b
--- /dev/null
+++ b/waterbox/ares64/ares/ares/n64/cic/io.cpp
@@ -0,0 +1,21 @@
+
+auto CIC::writeBit(n1 data) -> void {  //queues one bit, then lets the state machine react to it
+  fifo.write(data);
+  poll();
+}
+
+auto CIC::writeNibble(n4 data) -> void {  //queues four bits (bit 3 first), then polls once for the whole nibble
+  fifo.writeNibble(data);
+  poll();
+}
+
+auto CIC::readBit() -> n1 {  //pops one bit; an empty fifo is first given a chance to refill via poll()
+  if(fifo.empty()) poll();  //poll this instance, as writeBit does, instead of reaching for the global cic
+  return fifo.read();
+}
+
+auto CIC::readNibble() -> n4 {  //pops four bits (bit 3 first); poll() only runs when the fifo is completely empty
+  if(fifo.empty()) poll();  //poll this instance, as writeNibble does, instead of reaching for the global cic
+  return fifo.readNibble();
+}
+
diff --git a/waterbox/ares64/ares/ares/n64/cic/serialization.cpp b/waterbox/ares64/ares/ares/n64/cic/serialization.cpp
new file mode 100644
index 0000000000..3f9b50877d
--- /dev/null
+++ b/waterbox/ares64/ares/ares/n64/cic/serialization.cpp
@@ -0,0 +1,10 @@
+
+auto CIC::serialize(serializer& s) -> void {  //passes the fields below through the serializer; the model string is not included
+  s(fifo.bits);
+  s(seed);
+  s(checksum);
+  s(type);
+  s(region);
+  s(challengeAlgo);
+  s(state);
+}
diff --git a/waterbox/ares64/ares/ares/n64/controller/controller.hpp b/waterbox/ares64/ares/ares/n64/controller/controller.hpp
index 3dc1f706b6..64b9db84c7 100644
--- a/waterbox/ares64/ares/ares/n64/controller/controller.hpp
+++ b/waterbox/ares64/ares/ares/n64/controller/controller.hpp
@@ -4,6 +4,7 @@ struct Controller {
   virtual ~Controller() = default;
   virtual auto save() -> void {}
   virtual auto comm(n8 send, n8 recv, n8 input[], n8 output[]) -> n2 { return 1; }
+  virtual auto reset() -> void {}
   virtual auto read() -> n32 { return 0; }
   virtual auto serialize(serializer&) -> void {}
 };
diff --git a/waterbox/ares64/ares/ares/n64/controller/gamepad/gamepad.cpp b/waterbox/ares64/ares/ares/n64/controller/gamepad/gamepad.cpp
index 98b203a2f9..25f2aeecda 100644
--- a/waterbox/ares64/ares/ares/n64/controller/gamepad/gamepad.cpp
+++ b/waterbox/ares64/ares/ares/n64/controller/gamepad/gamepad.cpp
@@ -33,7 +33,7 @@ Gamepad::~Gamepad() {
 }
 
 auto Gamepad::save() -> void {
-/*
+#if false
   if(!slot) return;
   if(slot->name() == "Controller Pak") {
     ram.save(pak->write("save.pak"));
@@ -43,7 +43,7 @@ auto Gamepad::save() -> void {
       transferPak.ram.save(pak->write("gbram.pak"));
     }
   }
-*/
+#endif
 }
 
 auto Gamepad::allocate(string name) -> Node::Peripheral {
@@ -135,36 +135,41 @@ auto Gamepad::comm(n8 send, n8 recv, n8 input[], n8 output[]) -> n2 {
   if(input[0] == 0x02 && send >= 3 && recv >= 1) {
     //controller pak
     if(ram) {
-      u32 address = (input[1] << 8 | input[2] << 0) & ~31;
+      u16 address = (input[1] << 8 | input[2] << 0) & ~31;
       if(pif.addressCRC(address) == (n5)input[2]) {
         for(u32 index : range(recv - 1)) {
-          output[index] = ram.read<Byte>(address++);
+          if(address <= 0x7FFF) output[index] = ram.read<Byte>(address);
+          else output[index] = 0;
+          address++;
         }
-        output[recv - 1] = pif.dataCRC({&output[0], recv - 1});
+        output[recv - 1] = pif.dataCRC({&output[0], recv - 1u});
         valid = 1;
       }
     }
 
     //rumble pak
     if(motor) {
-      u32 address = (input[1] << 8 | input[2] << 0) & ~31;
+      u16 address = (input[1] << 8 | input[2] << 0) & ~31;
       if(pif.addressCRC(address) == (n5)input[2]) {
         for(u32 index : range(recv - 1)) {
-          output[index] = 0x80;
+          if(address <= 0x7FFF) output[index] = 0;
+          else if(address <= 0x8FFF) output[index] = 0x80;
+          else output[index] = motor->enable() ? 0xFF : 0x00;
+          address++;
         }
-        output[recv - 1] = pif.dataCRC({&output[0], recv - 1});
+        output[recv - 1] = pif.dataCRC({&output[0], recv - 1u});
         valid = 1;
       }
     }
 
     //transfer pak
     if(transferPak) {
-      u32 address = (input[1] << 8 | input[2] << 0) & ~31;
+      u16 address = (input[1] << 8 | input[2] << 0) & ~31;
       if(pif.addressCRC(address) == (n5)input[2]) {
         for(u32 index : range(recv - 1)) {
           output[index] = transferPak.read(address++);
         }
-        output[recv - 1] = pif.dataCRC({&output[0], recv - 1});
+        output[recv - 1] = pif.dataCRC({&output[0], recv - 1u});
         valid = 1;
       }
     }
@@ -174,34 +179,35 @@ auto Gamepad::comm(n8 send, n8 recv, n8 input[], n8 output[]) -> n2 {
   if(input[0] == 0x03 && send >= 3 && recv >= 1) {
     //controller pak
     if(ram) {
-      u32 address = (input[1] << 8 | input[2] << 0) & ~31;
+      u16 address = (input[1] << 8 | input[2] << 0) & ~31;
       if(pif.addressCRC(address) == (n5)input[2]) {
         for(u32 index : range(send - 3)) {
-          ram.write<Byte>(address++, input[3 + index]);
+          if(address <= 0x7FFF) ram.write<Byte>(address, input[3 + index]);
+          address++;
         }
-        output[0] = pif.dataCRC({&input[3], send - 3});
+        output[0] = pif.dataCRC({&input[3], send - 3u});
         valid = 1;
       }
     }
 
     //rumble pak
     if(motor) {
-      u32 address = (input[1] << 8 | input[2] << 0) & ~31;
+      u16 address = (input[1] << 8 | input[2] << 0) & ~31;
       if(pif.addressCRC(address) == (n5)input[2]) {
-        output[0] = pif.dataCRC({&input[3], send - 3});
+        output[0] = pif.dataCRC({&input[3], send - 3u});
         valid = 1;
-        rumble(input[3] & 1);
+        if(address >= 0xC000) rumble(input[3] & 1);
       }
     }
 
     //transfer pak
     if(transferPak) {
-      u32 address = (input[1] << 8 | input[2] << 0) & ~31;
+      u16 address = (input[1] << 8 | input[2] << 0) & ~31;
       if(pif.addressCRC(address) == (n5)input[2]) {
         for(u32 index : range(send - 3)) {
           transferPak.write(address++, input[3 + index]);
         }
-        output[0] = pif.dataCRC({&input[3], send - 3});
+        output[0] = pif.dataCRC({&input[3], send - 3u});
         valid = 1;
       }
     }
@@ -231,6 +237,47 @@ auto Gamepad::read() -> n32 {
   platform->input(z);
   platform->input(start);
 
+#if false
+  //scale {-32768 ... +32767} to {-85 ... +85}
+  auto ax = x->value() * 85.0 / 32767.0;
+  auto ay = y->value() * 85.0 / 32767.0;
+
+  //create inner axial dead-zone in range {-7 ... +7} and scale from it up to outer circular dead-zone of radius 85
+  auto length = sqrt(ax * ax + ay * ay);
+  if(length <= 85.0) {
+    auto lengthAbsoluteX = abs(ax);
+    auto lengthAbsoluteY = abs(ay);
+    if(lengthAbsoluteX <= 7.0) {
+      lengthAbsoluteX = 0.0;
+    } else {
+      lengthAbsoluteX = (lengthAbsoluteX - 7.0) * 85.0 / (85.0 - 7.0) / lengthAbsoluteX;
+    }
+    ax *= lengthAbsoluteX;
+    if(lengthAbsoluteY <= 7.0) {
+      lengthAbsoluteY = 0.0;
+    } else {
+      lengthAbsoluteY = (lengthAbsoluteY - 7.0) * 85.0 / (85.0 - 7.0) / lengthAbsoluteY;
+    }
+    ay *= lengthAbsoluteY;
+  } else {
+    length = 85.0 / length;
+    ax *= length;
+    ay *= length;
+  }
+
+  //bound diagonals to an octagonal range {-69 ... +69}
+  if(ax != 0.0 && ay != 0.0) {
+    auto slope = ay / ax;
+    auto edgex = copysign(85.0 / (abs(slope) + 16.0 / 69.0), ax);
+    auto edgey = copysign(min(abs(edgex * slope), 85.0 / (1.0 / abs(slope) + 16.0 / 69.0)), ay);
+    edgex = edgey / slope;
+
+    auto scale = sqrt(edgex * edgex + edgey * edgey) / 85.0;
+    ax *= scale;
+    ay *= scale;
+  }
+#endif
+
   n32 data;
   data.byte(0) = y->value();
   data.byte(1) = x->value();
diff --git a/waterbox/ares64/ares/ares/n64/controller/gamepad/mbc/mbc.hpp b/waterbox/ares64/ares/ares/n64/controller/gamepad/mbc/mbc.hpp
index 6bcff0a22e..39863d9a74 100644
--- a/waterbox/ares64/ares/ares/n64/controller/gamepad/mbc/mbc.hpp
+++ b/waterbox/ares64/ares/ares/n64/controller/gamepad/mbc/mbc.hpp
@@ -10,6 +10,11 @@ protected:
   Memory::Writable& ram;
 };
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wgnu-case-range"
+
 #include "mbc1.hpp"
 #include "mbc3.hpp"
 #include "mbc5.hpp"
+
+#pragma clang diagnostic pop
diff --git a/waterbox/ares64/ares/ares/n64/controller/gamepad/mbc/mbc3.hpp b/waterbox/ares64/ares/ares/n64/controller/gamepad/mbc/mbc3.hpp
index ca70a63a3b..36f0b2511d 100644
--- a/waterbox/ares64/ares/ares/n64/controller/gamepad/mbc/mbc3.hpp
+++ b/waterbox/ares64/ares/ares/n64/controller/gamepad/mbc/mbc3.hpp
@@ -2,7 +2,7 @@ struct Mbc3 : Mbc {
   explicit Mbc3(Memory::Readable& rom_, Memory::Writable& ram_, bool rtc) : Mbc(rom_, ram_), hasRtc(rtc) { reset(); }
 
   inline auto rtcUpdate() -> void {
-    u64 diff = rtcCallback() - lastTime;
+    u64 diff = platform->time() - lastTime;
     lastTime += diff;
     if(!rtcHalt) {
       s8 seconds = rtcSeconds;
@@ -146,6 +146,4 @@ private:
   n8 rtcLatches[5] = {};
 
   u64 lastTime = 0;
-public:
-  std::function<u64()> rtcCallback = []() { return 0; };
 };
diff --git a/waterbox/ares64/ares/ares/n64/controller/port.cpp b/waterbox/ares64/ares/ares/n64/controller/port.cpp
index 580a94f9b0..9cd9bc6ac4 100644
--- a/waterbox/ares64/ares/ares/n64/controller/port.cpp
+++ b/waterbox/ares64/ares/ares/n64/controller/port.cpp
@@ -12,6 +12,7 @@ auto ControllerPort::load(Node::Object parent) -> void {
   port->setType("Controller");
   port->setHotSwappable(true);
   port->setAllocate([&](auto name) { return allocate(name); });
+  port->setDisconnect([&] { device.reset(); });
   port->setSupported({"Gamepad", "Mouse"});
 }
 
@@ -25,7 +26,6 @@ auto ControllerPort::save() -> void {
 }
 
 auto ControllerPort::allocate(string name) -> Node::Peripheral {
-  device = {};
   if(name == "Gamepad") device = new Gamepad(port);
   if(name == "Mouse"  ) device = new Mouse(port);
   if(device) return device->node;
diff --git a/waterbox/ares64/ares/ares/n64/cpu/algorithms.cpp b/waterbox/ares64/ares/ares/n64/cpu/algorithms.cpp
index 7eec4187a4..37a2ece7dd 100644
--- a/waterbox/ares64/ares/ares/n64/cpu/algorithms.cpp
+++ b/waterbox/ares64/ares/ares/n64/cpu/algorithms.cpp
@@ -2,7 +2,7 @@ template <typename T>
 auto CPU::roundNearest(f32 f) -> T {
 #if defined(ARCHITECTURE_ARM64)
   return vrndns_f32(f);
-#elif defined(ARCHITECTURE_AMD64)
+#elif defined(ARCHITECTURE_AMD64) && ARCHITECTURE_SUPPORTS_SSE4_1
   __m128 t = _mm_set_ss(f);
   t = _mm_round_ss(t, t, _MM_FROUND_TO_NEAREST_INT);
   return _mm_cvtss_f32(t);
@@ -14,9 +14,9 @@ auto CPU::roundNearest(f32 f) -> T {
 template <typename T>
 auto CPU::roundNearest(f64 f) -> T {
 #if defined(ARCHITECTURE_ARM64)
-  float64x1_t vf = {f};
-  return vrndn_f64(vf)[0];
-#elif defined(ARCHITECTURE_AMD64)
+  float64x1_t vf = vdup_n_f64(f);
+  return vget_lane_f64(vrndn_f64(vf), 0);
+#elif defined(ARCHITECTURE_AMD64) && ARCHITECTURE_SUPPORTS_SSE4_1
   __m128d t = _mm_set_sd(f);
   t = _mm_round_sd(t, t, _MM_FROUND_TO_NEAREST_INT);
   return _mm_cvtsd_f64(t);
@@ -27,7 +27,7 @@ auto CPU::roundNearest(f64 f) -> T {
 
 template <typename T>
 auto CPU::roundCeil(f32 f) -> T {
-#if defined(ARCHITECTURE_AMD64)
+#if defined(ARCHITECTURE_AMD64) && ARCHITECTURE_SUPPORTS_SSE4_1
   __m128 t = _mm_set_ss(f);
   t = _mm_round_ss(t, t, _MM_FROUND_TO_POS_INF);
   return _mm_cvtss_f32(t);
@@ -38,7 +38,7 @@ auto CPU::roundCeil(f32 f) -> T {
 
 template <typename T>
 auto CPU::roundCeil(f64 f) -> T {
-#if defined(ARCHITECTURE_AMD64)
+#if defined(ARCHITECTURE_AMD64) && ARCHITECTURE_SUPPORTS_SSE4_1
   __m128d t = _mm_set_sd(f);
   t = _mm_round_sd(t, t, _MM_FROUND_TO_POS_INF);
   return _mm_cvtsd_f64(t);
@@ -49,7 +49,7 @@ auto CPU::roundCeil(f64 f) -> T {
 
 template<typename T>
 auto CPU::roundCurrent(f32 f) -> T {
-#if defined(ARCHITECTURE_AMD64)
+#if defined(ARCHITECTURE_AMD64) && ARCHITECTURE_SUPPORTS_SSE4_1
   auto t = _mm_set_ss(f);
   t = _mm_round_ss(t, t, _MM_FROUND_CUR_DIRECTION);
   return _mm_cvtss_f32(t);
@@ -60,7 +60,7 @@ auto CPU::roundCurrent(f32 f) -> T {
 
 template<typename T>
 auto CPU::roundCurrent(f64 f) -> T {
-#if defined(ARCHITECTURE_AMD64)
+#if defined(ARCHITECTURE_AMD64) && ARCHITECTURE_SUPPORTS_SSE4_1
   auto t = _mm_set_sd(f);
   t = _mm_round_sd(t, t, _MM_FROUND_CUR_DIRECTION);
   return _mm_cvtsd_f64(t);
@@ -71,7 +71,7 @@ auto CPU::roundCurrent(f64 f) -> T {
 
 template <typename T>
 auto CPU::roundFloor(f32 f) -> T {
-#if defined(ARCHITECTURE_AMD64)
+#if defined(ARCHITECTURE_AMD64) && ARCHITECTURE_SUPPORTS_SSE4_1
   __m128 t = _mm_set_ss(f);
   t = _mm_round_ss(t, t, _MM_FROUND_TO_NEG_INF);
   return _mm_cvtss_f32(t);
@@ -82,7 +82,7 @@ auto CPU::roundFloor(f32 f) -> T {
 
 template <typename T>
 auto CPU::roundFloor(f64 f) -> T {
-#if defined(ARCHITECTURE_AMD64)
+#if defined(ARCHITECTURE_AMD64) && ARCHITECTURE_SUPPORTS_SSE4_1
   __m128d t = _mm_set_sd(f);
   t = _mm_round_sd(t, t, _MM_FROUND_TO_NEG_INF);
   return _mm_cvtsd_f64(t);
@@ -93,7 +93,7 @@ auto CPU::roundFloor(f64 f) -> T {
 
 template <typename T>
 auto CPU::roundTrunc(f32 f) -> T {
-#if defined(ARCHITECTURE_AMD64)
+#if defined(ARCHITECTURE_AMD64) && ARCHITECTURE_SUPPORTS_SSE4_1
   __m128 t = _mm_set_ss(f);
   t = _mm_round_ss(t, t, _MM_FROUND_TO_ZERO);
   return _mm_cvtss_f32(t);
@@ -104,7 +104,7 @@ auto CPU::roundTrunc(f32 f) -> T {
 
 template <typename T>
 auto CPU::roundTrunc(f64 f) -> T {
-#if defined(ARCHITECTURE_AMD64)
+#if defined(ARCHITECTURE_AMD64) && ARCHITECTURE_SUPPORTS_SSE4_1
   __m128d t = _mm_set_sd(f);
   t = _mm_round_sd(t, t, _MM_FROUND_TO_ZERO);
   return _mm_cvtsd_f64(t);
diff --git a/waterbox/ares64/ares/ares/n64/cpu/cpu.cpp b/waterbox/ares64/ares/ares/n64/cpu/cpu.cpp
index bb7e9e1b4a..956c0ec479 100644
--- a/waterbox/ares64/ares/ares/n64/cpu/cpu.cpp
+++ b/waterbox/ares64/ares/ares/n64/cpu/cpu.cpp
@@ -46,10 +46,12 @@ auto CPU::synchronize() -> void {
    ai.clock -= clocks;
   rsp.clock -= clocks;
   rdp.clock -= clocks;
+  pif.clock -= clocks;
   while( vi.clock < 0)  vi.main();
   while( ai.clock < 0)  ai.main();
   while(rsp.clock < 0) rsp.main();
   while(rdp.clock < 0) rdp.main();
+  while(pif.clock < 0) pif.main();
 
   queue.step(clocks, [](u32 event) {
     switch(event) {
@@ -59,6 +61,8 @@ auto CPU::synchronize() -> void {
     case Queue::PI_BUS_Write:  return pi.writeFinished();
     case Queue::SI_DMA_Read:   return si.dmaRead();
     case Queue::SI_DMA_Write:  return si.dmaWrite();
+    case Queue::SI_BUS_Write:  return si.writeFinished();
+    case Queue::RTC_Tick:      return cartridge.rtc.tick();
     case Queue::DD_Clock_Tick:  return dd.rtcTickClock();
     case Queue::DD_MECHA_Response:  return dd.mechaResponse();
     case Queue::DD_BM_Request:  return dd.bmRequest();
@@ -81,16 +85,24 @@ auto CPU::instruction() -> void {
       return exception.interrupt();
     }
   }
+  if (scc.nmiPending) {
+    debugger.nmi();
+    step(1);
+    return exception.nmi();
+  }
 
   if constexpr(Accuracy::CPU::Recompiler) {
-    auto address = devirtualize(ipu.pc)(0);
-    auto& block = recompiler.block(address);
-    block.execute(*this);
+    if (auto address = devirtualize(ipu.pc)) {
+      auto block = recompiler.block(*address);
+      block->execute(*this);
+    }
   }
 
   if constexpr(Accuracy::CPU::Interpreter) {
     pipeline.address = ipu.pc;
-    pipeline.instruction = fetch(ipu.pc);
+    auto data = fetch(ipu.pc);
+    if (!data) return;
+    pipeline.instruction = *data;
     debugger.instruction();
     decoderEXECUTE();
     instructionEpilogue();
@@ -143,8 +155,8 @@ auto CPU::power(bool reset) -> void {
   context.setMode();
 
   if constexpr(Accuracy::CPU::Recompiler) {
-    auto buffer = ares::Memory::FixedAllocator::get().tryAcquire(4_MiB);
-    recompiler.allocator.resize(4_MiB, bump_allocator::executable | bump_allocator::zero_fill, buffer);
+    auto buffer = ares::Memory::FixedAllocator::get().tryAcquire(64_MiB);
+    recompiler.allocator.resize(64_MiB, bump_allocator::executable | bump_allocator::zero_fill, buffer);
     recompiler.reset();
   }
 }
diff --git a/waterbox/ares64/ares/ares/n64/cpu/cpu.hpp b/waterbox/ares64/ares/ares/n64/cpu/cpu.hpp
index 437c49c671..722c2af5f4 100644
--- a/waterbox/ares64/ares/ares/n64/cpu/cpu.hpp
+++ b/waterbox/ares64/ares/ares/n64/cpu/cpu.hpp
@@ -10,6 +10,7 @@ struct CPU : Thread {
     auto instruction() -> void;
     auto exception(u8 code) -> void;
     auto interrupt(u8 mask) -> void;
+    auto nmi() -> void;
     auto tlbWrite(u32 index) -> void;
     auto tlbModification(u64 address) -> void;
     auto tlbLoad(u64 address, u64 physical) -> void;
@@ -83,7 +84,7 @@ struct CPU : Thread {
 
     enum Endian : bool { Little, Big };
     enum Mode : u32 { Kernel, Supervisor, User };
-    enum Segment : u32 { Unused, Mapped, Cached, Direct, Kernel64, Supervisor64, User64 };
+    enum Segment : u32 { Unused, Mapped, Cached, Direct, Cached32, Direct32, Kernel64, Supervisor64, User64 };
 
     auto littleEndian() const -> bool { return endian == Endian::Little; }
     auto bigEndian() const -> bool { return endian == Endian::Big; }
@@ -147,26 +148,26 @@ struct CPU : Thread {
         cpu.step(48);
         valid = 1;
         tag   = address & ~0x0000'0fff;
-        words[0] = bus.read<Word>(tag | index | 0x00);
-        words[1] = bus.read<Word>(tag | index | 0x04);
-        words[2] = bus.read<Word>(tag | index | 0x08);
-        words[3] = bus.read<Word>(tag | index | 0x0c);
-        words[4] = bus.read<Word>(tag | index | 0x10);
-        words[5] = bus.read<Word>(tag | index | 0x14);
-        words[6] = bus.read<Word>(tag | index | 0x18);
-        words[7] = bus.read<Word>(tag | index | 0x1c);
+        words[0] = cpu.busRead<Word>(tag | index | 0x00);
+        words[1] = cpu.busRead<Word>(tag | index | 0x04);
+        words[2] = cpu.busRead<Word>(tag | index | 0x08);
+        words[3] = cpu.busRead<Word>(tag | index | 0x0c);
+        words[4] = cpu.busRead<Word>(tag | index | 0x10);
+        words[5] = cpu.busRead<Word>(tag | index | 0x14);
+        words[6] = cpu.busRead<Word>(tag | index | 0x18);
+        words[7] = cpu.busRead<Word>(tag | index | 0x1c);
       }
 
       auto writeBack(CPU& cpu) -> void {
         cpu.step(48);
-        bus.write<Word>(tag | index | 0x00, words[0]);
-        bus.write<Word>(tag | index | 0x04, words[1]);
-        bus.write<Word>(tag | index | 0x08, words[2]);
-        bus.write<Word>(tag | index | 0x0c, words[3]);
-        bus.write<Word>(tag | index | 0x10, words[4]);
-        bus.write<Word>(tag | index | 0x14, words[5]);
-        bus.write<Word>(tag | index | 0x18, words[6]);
-        bus.write<Word>(tag | index | 0x1c, words[7]);
+        cpu.busWrite<Word>(tag | index | 0x00, words[0]);
+        cpu.busWrite<Word>(tag | index | 0x04, words[1]);
+        cpu.busWrite<Word>(tag | index | 0x08, words[2]);
+        cpu.busWrite<Word>(tag | index | 0x0c, words[3]);
+        cpu.busWrite<Word>(tag | index | 0x10, words[4]);
+        cpu.busWrite<Word>(tag | index | 0x14, words[5]);
+        cpu.busWrite<Word>(tag | index | 0x18, words[6]);
+        cpu.busWrite<Word>(tag | index | 0x1c, words[7]);
       }
 
       auto read(u32 address) const -> u32 { return words[address >> 2 & 7]; }
@@ -259,9 +260,12 @@ struct CPU : Thread {
 
   auto segment(u64 vaddr) -> Context::Segment;
   auto devirtualize(u64 vaddr) -> maybe<u64>;
-  auto fetch(u64 vaddr) -> u32;
+  auto fetch(u64 vaddr) -> maybe<u32>;
+  template<u32 Size> auto busWrite(u32 address, u64 data) -> void;
+  template<u32 Size> auto busRead(u32 address) -> u64;
   template<u32 Size> auto read(u64 vaddr) -> maybe<u64>;
   template<u32 Size> auto write(u64 vaddr, u64 data) -> bool;
+  template<u32 Size> auto vaddrAlignedError(u64 vaddr, bool write) -> bool;
   auto addressException(u64 vaddr) -> void;
 
   //serialization.cpp
@@ -296,6 +300,7 @@ struct CPU : Thread {
     auto trap() -> void;
     auto floatingPoint() -> void;
     auto watchAddress() -> void;
+    auto nmi() -> void;
   } exception{*this};
 
   enum Interrupt : u32 {
@@ -317,8 +322,6 @@ struct CPU : Thread {
     struct {   int64_t s64; };
     struct {  uint64_t u64; };
     struct { float64_t f64; };
-    auto s128() const ->  int128_t { return  (int128_t)s64; }
-    auto u128() const -> uint128_t { return (uint128_t)u64; }
   };
   using cr64 = const r64;
 
@@ -594,6 +597,7 @@ struct CPU : Thread {
 
     //other
     n64 latch;
+    n1 nmiPending;
   } scc;
 
   //interpreter-scc.cpp
@@ -821,8 +825,7 @@ struct CPU : Thread {
     };
 
     struct Pool {
-      Block blocks[1 << 6];
-      bool dirty;
+      Block* blocks[1 << 6];
     };
 
     auto reset() -> void {
@@ -842,14 +845,13 @@ struct CPU : Thread {
       auto pool = pools[address >> 8 & 0x1fffff];
       if(!pool) return;
       memory::jitprotect(false);
-      pool->blocks[address >> 2 & 0x3f].code = nullptr;
+      pool->blocks[address >> 2 & 0x3f] = nullptr;
       memory::jitprotect(true);
       #endif
     }
 
     auto invalidatePool(u32 address) -> void {
-      auto pool = pools[address >> 8 & 0x1fffff];
-      if(pool && pool->dirty) memory::fill(pool, sizeof(Pool));
+      pools[address >> 8 & 0x1fffff] = nullptr;
     }
 
     auto invalidateRange(u32 address, u32 length) -> void {
@@ -859,9 +861,9 @@ struct CPU : Thread {
     }
 
     auto pool(u32 address) -> Pool*;
-    auto block(u32 address) -> Block&;
+    auto block(u32 address) -> Block*;
 
-    auto emit(u32 address, Block& block) -> void;
+    auto emit(u32 address) -> Block*;
     auto emitEXECUTE(u32 instruction) -> bool;
     auto emitSPECIAL(u32 instruction) -> bool;
     auto emitREGIMM(u32 instruction) -> bool;
diff --git a/waterbox/ares64/ares/ares/n64/cpu/dcache.cpp b/waterbox/ares64/ares/ares/n64/cpu/dcache.cpp
index 4ec2409691..34d3d0ff71 100644
--- a/waterbox/ares64/ares/ares/n64/cpu/dcache.cpp
+++ b/waterbox/ares64/ares/ares/n64/cpu/dcache.cpp
@@ -11,21 +11,21 @@ template<u32 Size> auto CPU::DataCache::Line::fill(u32 address, u64 data) -> voi
   switch(address & 8) {
   case 0:
     if constexpr(Size != Dual) {
-      words[0] = bus.read<Word>(tag | index | 0x0);
-      words[1] = bus.read<Word>(tag | index | 0x4);
+      words[0] = cpu.busRead<Word>(tag | index | 0x0);
+      words[1] = cpu.busRead<Word>(tag | index | 0x4);
     }
     write<Size>(address, data);
-    words[2] = bus.read<Word>(tag | index | 0x8);
-    words[3] = bus.read<Word>(tag | index | 0xc);
+    words[2] = cpu.busRead<Word>(tag | index | 0x8);
+    words[3] = cpu.busRead<Word>(tag | index | 0xc);
     break;
   case 8:
     if constexpr(Size != Dual) {
-      words[2] = bus.read<Word>(tag | index | 0x8);
-      words[3] = bus.read<Word>(tag | index | 0xc);
+      words[2] = cpu.busRead<Word>(tag | index | 0x8);
+      words[3] = cpu.busRead<Word>(tag | index | 0xc);
     }
     write<Size>(address, data);
-    words[0] = bus.read<Word>(tag | index | 0x0);
-    words[1] = bus.read<Word>(tag | index | 0x4);
+    words[0] = cpu.busRead<Word>(tag | index | 0x0);
+    words[1] = cpu.busRead<Word>(tag | index | 0x4);
     break;
   }
 }
@@ -38,16 +38,16 @@ auto CPU::DataCache::Line::fill(u32 address) -> void {
   //read words according to critical doubleword first scheme
   switch(address & 8) {
   case 0:
-    words[0] = bus.read<Word>(tag | index | 0x0);
-    words[1] = bus.read<Word>(tag | index | 0x4);
-    words[2] = bus.read<Word>(tag | index | 0x8);
-    words[3] = bus.read<Word>(tag | index | 0xc);
+    words[0] = cpu.busRead<Word>(tag | index | 0x0);
+    words[1] = cpu.busRead<Word>(tag | index | 0x4);
+    words[2] = cpu.busRead<Word>(tag | index | 0x8);
+    words[3] = cpu.busRead<Word>(tag | index | 0xc);
     break;
   case 8:
-    words[2] = bus.read<Word>(tag | index | 0x8);
-    words[3] = bus.read<Word>(tag | index | 0xc);
-    words[0] = bus.read<Word>(tag | index | 0x0);
-    words[1] = bus.read<Word>(tag | index | 0x4);
+    words[2] = cpu.busRead<Word>(tag | index | 0x8);
+    words[3] = cpu.busRead<Word>(tag | index | 0xc);
+    words[0] = cpu.busRead<Word>(tag | index | 0x0);
+    words[1] = cpu.busRead<Word>(tag | index | 0x4);
     break;
   }
 }
@@ -55,10 +55,10 @@ auto CPU::DataCache::Line::fill(u32 address) -> void {
 auto CPU::DataCache::Line::writeBack() -> void {
   cpu.step(40);
   dirty = 0;
-  bus.write<Word>(tag | index | 0x0, words[0]);
-  bus.write<Word>(tag | index | 0x4, words[1]);
-  bus.write<Word>(tag | index | 0x8, words[2]);
-  bus.write<Word>(tag | index | 0xc, words[3]);
+  cpu.busWrite<Word>(tag | index | 0x0, words[0]);  //write the four words of this line back through cpu.busWrite, lowest offset first
+  cpu.busWrite<Word>(tag | index | 0x4, words[1]);
+  cpu.busWrite<Word>(tag | index | 0x8, words[2]);
+  cpu.busWrite<Word>(tag | index | 0xc, words[3]);  //last word of the line (offset 0xc)
 }
 
 auto CPU::DataCache::line(u32 address) -> Line& {
diff --git a/waterbox/ares64/ares/ares/n64/cpu/debugger.cpp b/waterbox/ares64/ares/ares/n64/cpu/debugger.cpp
index 40a0580bc6..dc284e5b15 100644
--- a/waterbox/ares64/ares/ares/n64/cpu/debugger.cpp
+++ b/waterbox/ares64/ares/ares/n64/cpu/debugger.cpp
@@ -69,6 +69,12 @@ auto CPU::Debugger::interrupt(u8 mask) -> void {
   }
 }
 
+auto CPU::Debugger::nmi() -> void {  //debugger hook for NMI: sends the text NMI to the exception tracer
+  if(unlikely(tracer.exception->enabled())) {  //nothing is reported unless that tracer is enabled
+    tracer.exception->notify("NMI");
+  }
+}
+
 auto CPU::Debugger::tlbWrite(u32 index) -> void {
   if(unlikely(tracer.tlb->enabled())) {
     auto entry = cpu.tlb.entry[index & 31];
diff --git a/waterbox/ares64/ares/ares/n64/cpu/disassembler.cpp b/waterbox/ares64/ares/ares/n64/cpu/disassembler.cpp
index 4321bb62c9..e176191de2 100644
--- a/waterbox/ares64/ares/ares/n64/cpu/disassembler.cpp
+++ b/waterbox/ares64/ares/ares/n64/cpu/disassembler.cpp
@@ -476,6 +476,6 @@ auto CPU::Disassembler::ccrRegisterValue(u32 index) const -> string {
 
 template<typename... P>
 auto CPU::Disassembler::hint(P&&... p) const -> string {
-  if(showColors) return {"\e[0m\e[37m", std::forward<P>(p)..., "\e[0m"};
+  if(showColors) return {terminal::csi, "0m", terminal::csi, "37m", std::forward<P>(p)..., terminal::csi, "0m"};  //colored form: csi 0m, csi 37m, the forwarded text, then csi 0m
   return {std::forward<P>(p)...};
 }
diff --git a/waterbox/ares64/ares/ares/n64/cpu/exceptions.cpp b/waterbox/ares64/ares/ares/n64/cpu/exceptions.cpp
index 475f043c6d..310662f496 100644
--- a/waterbox/ares64/ares/ares/n64/cpu/exceptions.cpp
+++ b/waterbox/ares64/ares/ares/n64/cpu/exceptions.cpp
@@ -1,8 +1,7 @@
 auto CPU::Exception::trigger(u32 code, u32 coprocessor, bool tlbMiss) -> void {
   self.debugger.exception(code);
 
-  u64 vectorBase = !self.scc.status.vectorLocation ? 0x8000'0000 : 0xbfc0'0200;
-  if(self.context.bits == 64) vectorBase = (s32)vectorBase;
+  u64 vectorBase = !self.scc.status.vectorLocation ? (s32)0x8000'0000 : (s32)0xbfc0'0200;
 
   u16 vectorOffset = 0x0180;
   if(tlbMiss) {
@@ -51,3 +50,12 @@ auto CPU::Exception::arithmeticOverflow()      -> void { trigger(12); }
 auto CPU::Exception::trap()                    -> void { trigger(13); }
 auto CPU::Exception::floatingPoint()           -> void { trigger(15); }
 auto CPU::Exception::watchAddress()            -> void { trigger(23); }
+
+auto CPU::Exception::nmi() -> void {  //NMI entry: updates four status flags, saves PC into epcError, then redirects PC
+  self.scc.status.vectorLocation = 1;
+  self.scc.status.tlbShutdown = 0;
+  self.scc.status.softReset = 0;  //NOTE(review): cleared here; confirm against the CPU manual whether NMI should set this flag
+  self.scc.status.errorLevel = 1;
+  self.scc.epcError = self.ipu.pc;  //capture the interrupted PC before the next line overwrites it
+  self.ipu.pc = 0xffff'ffff'bfc0'0000;
+}
diff --git a/waterbox/ares64/ares/ares/n64/cpu/interpreter-fpu.cpp b/waterbox/ares64/ares/ares/n64/cpu/interpreter-fpu.cpp
index 5f15d295da..7283d816c1 100644
--- a/waterbox/ares64/ares/ares/n64/cpu/interpreter-fpu.cpp
+++ b/waterbox/ares64/ares/ares/n64/cpu/interpreter-fpu.cpp
@@ -208,15 +208,13 @@ auto CPU::checkFPUExceptions() -> bool {
   return raise;
 }
 
-#define CHECK_FPE_IMPL(type, operation, convert) ({ \
+#define CHECK_FPE_IMPL(type, res, operation, convert) \
   fenv.clearExcept(); \
-  type res = [&]() noinline { return type(operation); }(); \
-  if (checkFPUExceptions<convert>()) return; \
-  (res); \
-})
+  type res = [&]() noinline -> type { return operation; }(); \
+  if (checkFPUExceptions<convert>()) return;  //expands to bare statements: declares res in the enclosing scope and may return from the enclosing function, so never use it as the body of an unbraced if
 
-#define CHECK_FPE(type, operation)      CHECK_FPE_IMPL(type, operation, false)
-#define CHECK_FPE_CONV(type, operation) CHECK_FPE_IMPL(type, operation, true)
+#define CHECK_FPE(type, res, operation)      CHECK_FPE_IMPL(type, res, operation, false)  //checks with checkFPUExceptions<false>
+#define CHECK_FPE_CONV(type, res, operation) CHECK_FPE_IMPL(type, res, operation, true)  //checks with checkFPUExceptions<true>
 
 auto f32repr(f32 f) -> n32 {
   uint32_t v; memcpy(&v, &f, 4);
@@ -420,7 +418,7 @@ auto CPU::FADD_S(u8 fd, u8 fs, u8 ft) -> void {
   f32 ffs = FS(f32), fft = FT(f32);
   if(!fpuCheckInput(ffs)) return;
   if(!fpuCheckInput(fft)) return;
-  float ffd = CHECK_FPE(f32, FS(f32) + FT(f32));
+  CHECK_FPE(f32, ffd, FS(f32) + FT(f32));
   if(!fpuCheckOutput(ffd)) return;
   FD(f32) = ffd;
 }
@@ -430,7 +428,7 @@ auto CPU::FADD_D(u8 fd, u8 fs, u8 ft) -> void {
   auto ffs = FS(f64), fft = FT(f64);
   if(!fpuCheckInput(ffs)) return;
   if(!fpuCheckInput(fft)) return;
-  auto ffd = CHECK_FPE(f64, ffs + fft);
+  CHECK_FPE(f64, ffd, ffs + fft);
   if(!fpuCheckOutput(ffd)) return;
   FD(f64) = ffd;
 }
@@ -439,7 +437,7 @@ auto CPU::FCEIL_L_S(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f32);
   if(!fpuCheckInputConv<s64>(ffs)) return;
-  auto ffd = CHECK_FPE(s64, roundCeil<s64>(ffs));
+  CHECK_FPE(s64, ffd, roundCeil<s64>(ffs));
   FD(s64) = ffd;
 }
 
@@ -447,7 +445,7 @@ auto CPU::FCEIL_L_D(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f64);
   if(!fpuCheckInputConv<s64>(ffs)) return;
-  auto ffd = CHECK_FPE(s64, roundCeil<s64>(ffs));
+  CHECK_FPE(s64, ffd, roundCeil<s64>(ffs));
   FD(s64) = ffd;
 }
 
@@ -455,7 +453,7 @@ auto CPU::FCEIL_W_S(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f32);
   if(!fpuCheckInputConv<s32>(ffs)) return;
-  auto ffd = CHECK_FPE_CONV(s32, roundCeil<s32>(ffs));
+  CHECK_FPE_CONV(s32, ffd, roundCeil<s32>(ffs));
   FD(s32) = ffd;
 }
 
@@ -463,7 +461,7 @@ auto CPU::FCEIL_W_D(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f64);
   if(!fpuCheckInputConv<s32>(ffs)) return;
-  auto ffd = CHECK_FPE_CONV(s32, roundCeil<s32>(ffs));
+  CHECK_FPE_CONV(s32, ffd, roundCeil<s32>(ffs));
   FD(s32) = ffd;
 }
 
@@ -651,7 +649,7 @@ auto CPU::FCVT_S_D(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f64);
   if(!fpuCheckInput(ffs)) return;
-  auto ffd = CHECK_FPE(f32, (f32)ffs);
+  CHECK_FPE(f32, ffd, (f32)ffs);
   if(!fpuCheckOutput(ffd)) return;
   FD(f32) = ffd;
 }
@@ -659,7 +657,7 @@ auto CPU::FCVT_S_D(u8 fd, u8 fs) -> void {
 auto CPU::FCVT_S_W(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(s32);
-  auto ffd = CHECK_FPE(f32, ffs);
+  CHECK_FPE(f32, ffd, ffs);  //declares f32 ffd initialized from ffs; returns from this function if checkFPUExceptions<false>() is true
   if(!fpuCheckOutput(ffd)) return;
   FD(f32) = ffd;
 }
@@ -671,7 +669,7 @@ auto CPU::FCVT_S_L(u8 fd, u8 fs) -> void {
     if (fpeUnimplemented()) return exception.floatingPoint();
     return;
   }
-  auto ffd = CHECK_FPE(f32, (f32)ffs);
+  CHECK_FPE(f32, ffd, (f32)ffs);
   if(!fpuCheckOutput(ffd)) return;
   FD(f32) = ffd;
 }
@@ -680,7 +678,7 @@ auto CPU::FCVT_D_S(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f32);
   if(!fpuCheckInput(ffs)) return;
-  auto ffd = CHECK_FPE(f64, ffs);
+  CHECK_FPE(f64, ffd, ffs);
   if(!fpuCheckOutput(ffd)) return;
   FD(f64) = ffd;
 }
@@ -693,7 +691,7 @@ auto CPU::FCVT_D_D(u8 fd, u8 fs) -> void {
 auto CPU::FCVT_D_W(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(s32);
-  auto ffd = CHECK_FPE(f64, (f64)ffs);
+  CHECK_FPE(f64, ffd, (f64)ffs);  //declares f64 ffd initialized from (f64)ffs; returns from this function if checkFPUExceptions<false>() is true
   if(!fpuCheckOutput(ffd)) return;
   FD(f64) = ffd;
 }
@@ -705,7 +703,7 @@ auto CPU::FCVT_D_L(u8 fd, u8 fs) -> void {
     if (fpeUnimplemented()) return exception.floatingPoint();
     return;
   }
-  auto ffd = CHECK_FPE(f64, (f64)ffs);
+  CHECK_FPE(f64, ffd, (f64)ffs);
   if(!fpuCheckOutput(ffd)) return;
   FD(f64) = ffs;
 }
@@ -714,7 +712,7 @@ auto CPU::FCVT_L_S(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f32);
   if(!fpuCheckInputConv<s64>(ffs)) return;
-  auto ffd = CHECK_FPE(s64, roundCurrent<s64>(ffs));
+  CHECK_FPE(s64, ffd, roundCurrent<s64>(ffs));
   FD(s64) = ffd;
 }
 
@@ -722,7 +720,7 @@ auto CPU::FCVT_L_D(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f64);
   if(!fpuCheckInputConv<s64>(ffs)) return;
-  auto ffd = CHECK_FPE(s64, roundCurrent<s64>(ffs));
+  CHECK_FPE(s64, ffd, roundCurrent<s64>(ffs));
   FD(s64) = ffd;
 }
 
@@ -730,7 +728,7 @@ auto CPU::FCVT_W_S(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f32);
   if(!fpuCheckInputConv<s32>(ffs)) return;
-  auto ffd = CHECK_FPE_CONV(s32, roundCurrent<s32>(ffs));
+  CHECK_FPE_CONV(s32, ffd, roundCurrent<s32>(ffs));
   FD(s32) = ffd;
 }
 
@@ -738,7 +736,7 @@ auto CPU::FCVT_W_D(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f64);
   if(!fpuCheckInputConv<s32>(ffs)) return;
-  auto ffd = CHECK_FPE_CONV(s32, roundCurrent<s32>(ffs));
+  CHECK_FPE_CONV(s32, ffd, roundCurrent<s32>(ffs));
   FD(s32) = ffd;
 }
 
@@ -747,7 +745,7 @@ auto CPU::FDIV_S(u8 fd, u8 fs, u8 ft) -> void {
   auto ffs = FS(f32), fft = FT(f32);
   if(!fpuCheckInput(ffs)) return;
   if(!fpuCheckInput(fft)) return;
-  auto ffd = CHECK_FPE(f32, ffs / fft);
+  CHECK_FPE(f32, ffd, ffs / fft);
   if(!fpuCheckOutput(ffd)) return;
   FD(f32) = ffd;
 }
@@ -757,7 +755,7 @@ auto CPU::FDIV_D(u8 fd, u8 fs, u8 ft) -> void {
   auto ffs = FS(f64), fft = FT(f64);
   if(!fpuCheckInput(ffs)) return;
   if(!fpuCheckInput(fft)) return;
-  auto ffd = CHECK_FPE(f64, ffs / fft);
+  CHECK_FPE(f64, ffd, ffs / fft);
   if(!fpuCheckOutput(ffd)) return;
   FD(f64) = ffd;
 }
@@ -766,7 +764,7 @@ auto CPU::FFLOOR_L_S(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f32);
   if(!fpuCheckInputConv<s64>(ffs)) return;
-  auto ffd = CHECK_FPE(s64, roundFloor<s64>(ffs));
+  CHECK_FPE(s64, ffd, roundFloor<s64>(ffs));
   FD(s64) = ffd;
 }
 
@@ -774,7 +772,7 @@ auto CPU::FFLOOR_L_D(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f64);
   if(!fpuCheckInputConv<s64>(ffs)) return;
-  auto ffd = CHECK_FPE(s64, roundFloor<s64>(ffs));
+  CHECK_FPE(s64, ffd, roundFloor<s64>(ffs));
   FD(s64) = ffd;
 }
 
@@ -782,7 +780,7 @@ auto CPU::FFLOOR_W_S(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f32);
   if(!fpuCheckInputConv<s32>(ffs)) return;
-  auto ffd = CHECK_FPE_CONV(s32, roundFloor<s32>(ffs));
+  CHECK_FPE_CONV(s32, ffd, roundFloor<s32>(ffs));
   FD(s32) = ffd;
 }
 
@@ -790,7 +788,7 @@ auto CPU::FFLOOR_W_D(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f64);
   if(!fpuCheckInputConv<s32>(ffs)) return;
-  auto ffd = CHECK_FPE_CONV(s32, roundFloor<s32>(ffs));
+  CHECK_FPE_CONV(s32, ffd, roundFloor<s32>(ffs));
   FD(s32) = ffd;
 }
 
@@ -809,7 +807,7 @@ auto CPU::FMUL_S(u8 fd, u8 fs, u8 ft) -> void {
   auto ffs = FS(f32), fft = FT(f32);
   if(!fpuCheckInput(ffs)) return;
   if(!fpuCheckInput(fft)) return;
-  auto ffd = CHECK_FPE(f32, ffs * fft);
+  CHECK_FPE(f32, ffd, ffs * fft);
   if(!fpuCheckOutput(ffd)) return;
   FD(f32) = ffd;
 }
@@ -819,7 +817,7 @@ auto CPU::FMUL_D(u8 fd, u8 fs, u8 ft) -> void {
   auto ffs = FS(f64), fft = FT(f64);
   if(!fpuCheckInput(ffs)) return;
   if(!fpuCheckInput(fft)) return;
-  auto ffd = CHECK_FPE(f64, ffs * fft);
+  CHECK_FPE(f64, ffd, ffs * fft);
   if(!fpuCheckOutput(ffd)) return;
   FD(f64) = ffd;
 }
@@ -828,7 +826,7 @@ auto CPU::FNEG_S(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f32);
   if(!fpuCheckInput(ffs)) return;
-  auto ffd = CHECK_FPE(f32, -ffs);
+  CHECK_FPE(f32, ffd, -ffs);
   if(!fpuCheckOutput(ffd)) return;
   FD(f32) = ffd;
 }
@@ -837,7 +835,7 @@ auto CPU::FNEG_D(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f64);
   if(!fpuCheckInput(ffs)) return;
-  auto ffd = CHECK_FPE(f64, -ffs);
+  CHECK_FPE(f64, ffd, -ffs);
   if(!fpuCheckOutput(ffd)) return;
   FD(f64) = ffd;
 }
@@ -846,7 +844,7 @@ auto CPU::FROUND_L_S(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f32);
   if(!fpuCheckInputConv<s64>(ffs)) return;
-  auto ffd = CHECK_FPE(s64, roundNearest<s64>(ffs));
+  CHECK_FPE(s64, ffd, roundNearest<s64>(ffs));
   if(ffd != ffs && fpeInexact()) return exception.floatingPoint();
   FD(s64) = ffd;
 }
@@ -855,7 +853,7 @@ auto CPU::FROUND_L_D(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f64);
   if(!fpuCheckInputConv<s64>(ffs)) return;
-  auto ffd = CHECK_FPE(s64, roundNearest<s64>(ffs));
+  CHECK_FPE(s64, ffd, roundNearest<s64>(ffs));
   if(ffd != ffs && fpeInexact()) return exception.floatingPoint();
   FD(s64) = ffd;
 }
@@ -864,7 +862,7 @@ auto CPU::FROUND_W_S(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f32);
   if(!fpuCheckInputConv<s32>(ffs)) return;
-  auto ffd = CHECK_FPE_CONV(s32, roundNearest<s32>(ffs));
+  CHECK_FPE_CONV(s32, ffd, roundNearest<s32>(ffs));
   if(ffd != ffs && fpeInexact()) return exception.floatingPoint();
   FD(s32) = ffd;
 }
@@ -873,7 +871,7 @@ auto CPU::FROUND_W_D(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f64);
   if(!fpuCheckInputConv<s32>(ffs)) return;
-  auto ffd = CHECK_FPE_CONV(s32, roundNearest<s32>(ffs));
+  CHECK_FPE_CONV(s32, ffd, roundNearest<s32>(ffs));
   if(ffd != ffs && fpeInexact()) return exception.floatingPoint();
   FD(s32) = ffd;
 }
@@ -882,7 +880,7 @@ auto CPU::FSQRT_S(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f32);
   if(!fpuCheckInput(ffs)) return;
-  auto ffd = CHECK_FPE(f32, squareRoot(ffs));
+  CHECK_FPE(f32, ffd, squareRoot(ffs));
   if(!fpuCheckOutput(ffd)) return;
   FD(f32) = ffd;
 }
@@ -891,7 +889,7 @@ auto CPU::FSQRT_D(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f64);
   if(!fpuCheckInput(ffs)) return;
-  auto ffd = CHECK_FPE(f64, squareRoot(ffs));
+  CHECK_FPE(f64, ffd, squareRoot(ffs));
   if(!fpuCheckOutput(ffd)) return;
   FD(f64) = ffd;
 }
@@ -901,7 +899,7 @@ auto CPU::FSUB_S(u8 fd, u8 fs, u8 ft) -> void {
   auto ffs = FS(f32), fft = FT(f32);
   if(!fpuCheckInput(ffs)) return;
   if(!fpuCheckInput(fft)) return;
-  auto ffd = CHECK_FPE(f32, ffs - fft);
+  CHECK_FPE(f32, ffd, ffs - fft);
   if(!fpuCheckOutput(ffd)) return;
   FD(f32) = ffd;
 }
@@ -911,7 +909,7 @@ auto CPU::FSUB_D(u8 fd, u8 fs, u8 ft) -> void {
   auto ffs = FS(f64), fft = FT(f64);
   if(!fpuCheckInput(ffs)) return;
   if(!fpuCheckInput(fft)) return;
-  auto ffd = CHECK_FPE(f64, ffs - fft);
+  CHECK_FPE(f64, ffd, ffs - fft);
   if(!fpuCheckOutput(ffd)) return;
   FD(f64) = ffd;
 }
@@ -920,7 +918,7 @@ auto CPU::FTRUNC_L_S(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f32);
   if(!fpuCheckInputConv<s64>(ffs)) return;
-  auto ffd = CHECK_FPE(s64, roundTrunc<s64>(ffs));
+  CHECK_FPE(s64, ffd, roundTrunc<s64>(ffs));
   if((f32)ffd != ffs && fpeInexact()) return exception.floatingPoint();
   FD(s64) = ffd;
 }
@@ -929,7 +927,7 @@ auto CPU::FTRUNC_L_D(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f64);
   if(!fpuCheckInputConv<s64>(ffs)) return;
-  auto ffd = CHECK_FPE(s64, roundTrunc<s64>(ffs));
+  CHECK_FPE(s64, ffd, roundTrunc<s64>(ffs));
   if((f64)ffd != ffs && fpeInexact()) return exception.floatingPoint();
   FD(s64) = ffd;
 }
@@ -938,7 +936,7 @@ auto CPU::FTRUNC_W_S(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f32);
   if(!fpuCheckInputConv<s32>(ffs)) return;
-  auto ffd = CHECK_FPE_CONV(s32, roundTrunc<s32>(ffs));
+  CHECK_FPE_CONV(s32, ffd, roundTrunc<s32>(ffs));
   if((f32)ffd != ffs && fpeInexact()) return exception.floatingPoint();
   FD(s32) = ffd;
 }
@@ -947,7 +945,7 @@ auto CPU::FTRUNC_W_D(u8 fd, u8 fs) -> void {
   if(!fpuCheckStart()) return;
   auto ffs = FS(f64);
   if(!fpuCheckInputConv<s32>(ffs)) return;
-  auto ffd = CHECK_FPE_CONV(s32, roundTrunc<s32>(ffs));
+  CHECK_FPE_CONV(s32, ffd, roundTrunc<s32>(ffs));
   if((f64)ffd != ffs && fpeInexact()) return exception.floatingPoint();
   FD(s32) = ffd;
 }
diff --git a/waterbox/ares64/ares/ares/n64/cpu/interpreter-ipu.cpp b/waterbox/ares64/ares/ares/n64/cpu/interpreter-ipu.cpp
index 08cc7bedc3..c50c6c9fab 100644
--- a/waterbox/ares64/ares/ares/n64/cpu/interpreter-ipu.cpp
+++ b/waterbox/ares64/ares/ares/n64/cpu/interpreter-ipu.cpp
@@ -252,9 +252,19 @@ auto CPU::DADDU(r64& rd, cr64& rs, cr64& rt) -> void {
 auto CPU::DDIV(cr64& rs, cr64& rt) -> void {
   if(!context.kernelMode() && context.bits == 32) return exception.reservedInstruction();
   if(rt.s64) {
+    #if defined(_MSC_VER) || !defined(__SIZEOF_INT128__)
+    if(rs.s64 != (-1LL << 63) || rt.s64 != -1LL) {
+      LO.u64 = rs.s64 / rt.s64;
+      HI.u64 = rs.s64 % rt.s64;
+    } else {
+      LO.u64 = rs.s64;
+      HI.u64 = 0;
+    }
+    #else
     //cast to i128 to prevent exception on INT64_MIN / -1
     LO.u64 = s128(rs.s64) / s128(rt.s64);
     HI.u64 = s128(rs.s64) % s128(rt.s64);
+    #endif
   } else {
     LO.u64 = rs.s64 < 0 ? +1 : -1;
     HI.u64 = rs.s64;
@@ -301,17 +311,41 @@ auto CPU::DIVU(cr64& rs, cr64& rt) -> void {
 
 auto CPU::DMULT(cr64& rs, cr64& rt) -> void {
   if(!context.kernelMode() && context.bits == 32) return exception.reservedInstruction();
-  u128 result = rs.s128() * rt.s128();
+#if defined(COMPILER_MICROSOFT) && (defined(ARCHITECTURE_AMD64) || defined(ARCHITECTURE_ARM64))  //signed 64x64 multiply: LO takes the low 64 bits of the product, HI the high 64 bits
+  #if defined(ARCHITECTURE_AMD64)
+  LO.s64 = _mul128(rs.s64, rt.s64, &HI.s64);  //low half is the return value; the high half is stored through the pointer
+  #else
+  LO.u64 = rs.u64 * rt.u64;  //low half via unsigned multiply: identical bits, and no signed-overflow undefined behaviour
+  HI.s64 = __mulh(rs.s64, rt.s64);  //high half of the signed product
+  #endif
+#else
+  #if defined(__SIZEOF_INT128__)
+  u128 result = s128(rs.s64) * s128(rt.s64);  //widen first so the full signed product is kept
+  #else
+  u128 result = u128(rs.u64) * u128(rt.u64);  //no native 128-bit type: start from the unsigned product
+  if(rs.s64 < 0) result -= u128(rt.u64) << 64;  //then correct the high half once for each negative operand
+  if(rt.s64 < 0) result -= u128(rs.u64) << 64;
+  #endif
   LO.u64 = result >>  0;
   HI.u64 = result >> 64;
+#endif
   step(8);
 }
 
 auto CPU::DMULTU(cr64& rs, cr64& rt) -> void {
   if(!context.kernelMode() && context.bits == 32) return exception.reservedInstruction();
-  u128 result = rs.u128() * rt.u128();
+#if defined(COMPILER_MICROSOFT) && (defined(ARCHITECTURE_AMD64) || defined(ARCHITECTURE_ARM64))  //unsigned 64x64 multiply: LO takes the low 64 bits of the product, HI the high 64 bits
+  #if defined(ARCHITECTURE_AMD64)
+  LO.u64 = _umul128(rs.u64, rt.u64, &HI.u64);  //LO takes the return value; HI is written through the pointer
+  #else
+	LO.u64 = rs.u64 * rt.u64;  //low 64 bits of the product (unsigned arithmetic wraps)
+	HI.u64 = __umulh(rs.u64, rt.u64);  //NOTE(review): presumably the high 64 bits of the unsigned product; confirm against the intrinsic documentation
+  #endif
+#else
+  u128 result = u128(rs.u64) * u128(rt.u64);  //widen both operands first so the full product is kept
   LO.u64 = result >>  0;
   HI.u64 = result >> 64;
+#endif
   step(8);
 }
 
diff --git a/waterbox/ares64/ares/ares/n64/cpu/memory.cpp b/waterbox/ares64/ares/ares/n64/cpu/memory.cpp
index 9b139a6782..80ce9bb1eb 100644
--- a/waterbox/ares64/ares/ares/n64/cpu/memory.cpp
+++ b/waterbox/ares64/ares/ares/n64/cpu/memory.cpp
@@ -30,21 +30,21 @@ auto CPU::kernelSegment64(u64 vaddr) const -> Context::Segment {
   if(vaddr <= 0x3fff'ffff'ffff'ffffull) return Context::Segment::Unused;
   if(vaddr <= 0x4000'00ff'ffff'ffffull) return Context::Segment::Mapped;  //xksseg
   if(vaddr <= 0x7fff'ffff'ffff'ffffull) return Context::Segment::Unused;
-  if(vaddr <= 0x8000'0000'ffff'ffffull) return Context::Segment::Cached;  //xkphys*
+  if(vaddr <= 0x8000'0000'ffff'ffffull) return Context::Segment::Cached32;  //xkphys*
   if(vaddr <= 0x87ff'ffff'ffff'ffffull) return Context::Segment::Unused;
-  if(vaddr <= 0x8800'0000'ffff'ffffull) return Context::Segment::Cached;  //xkphys*
+  if(vaddr <= 0x8800'0000'ffff'ffffull) return Context::Segment::Cached32;  //xkphys*
   if(vaddr <= 0x8fff'ffff'ffff'ffffull) return Context::Segment::Unused;
-  if(vaddr <= 0x9000'0000'ffff'ffffull) return Context::Segment::Direct;  //xkphys*
+  if(vaddr <= 0x9000'0000'ffff'ffffull) return Context::Segment::Direct32;  //xkphys*
   if(vaddr <= 0x97ff'ffff'ffff'ffffull) return Context::Segment::Unused;
-  if(vaddr <= 0x9800'0000'ffff'ffffull) return Context::Segment::Cached;  //xkphys*
+  if(vaddr <= 0x9800'0000'ffff'ffffull) return Context::Segment::Cached32;  //xkphys*
   if(vaddr <= 0x9fff'ffff'ffff'ffffull) return Context::Segment::Unused;
-  if(vaddr <= 0xa000'0000'ffff'ffffull) return Context::Segment::Cached;  //xkphys*
+  if(vaddr <= 0xa000'0000'ffff'ffffull) return Context::Segment::Cached32;  //xkphys*
   if(vaddr <= 0xa7ff'ffff'ffff'ffffull) return Context::Segment::Unused;
-  if(vaddr <= 0xa800'0000'ffff'ffffull) return Context::Segment::Cached;  //xkphys*
+  if(vaddr <= 0xa800'0000'ffff'ffffull) return Context::Segment::Cached32;  //xkphys*
   if(vaddr <= 0xafff'ffff'ffff'ffffull) return Context::Segment::Unused;
-  if(vaddr <= 0xb000'0000'ffff'ffffull) return Context::Segment::Cached;  //xkphys*
+  if(vaddr <= 0xb000'0000'ffff'ffffull) return Context::Segment::Cached32;  //xkphys*
   if(vaddr <= 0xb7ff'ffff'ffff'ffffull) return Context::Segment::Unused;
-  if(vaddr <= 0xb800'0000'ffff'ffffull) return Context::Segment::Cached;  //xkphys*
+  if(vaddr <= 0xb800'0000'ffff'ffffull) return Context::Segment::Cached32;  //xkphys*
   if(vaddr <= 0xbfff'ffff'ffff'ffffull) return Context::Segment::Unused;
   if(vaddr <= 0xc000'00ff'7fff'ffffull) return Context::Segment::Mapped;  //xkseg
   if(vaddr <= 0xffff'ffff'7fff'ffffull) return Context::Segment::Unused;
@@ -89,6 +89,7 @@ auto CPU::segment(u64 vaddr) -> Context::Segment {
 }
 
 auto CPU::devirtualize(u64 vaddr) -> maybe<u64> {
+  if(vaddrAlignedError<Word>(vaddr, false)) return nothing;
   switch(segment(vaddr)) {
   case Context::Segment::Unused:
     addressException(vaddr);
@@ -100,32 +101,55 @@ auto CPU::devirtualize(u64 vaddr) -> maybe<u64> {
     return nothing;
   case Context::Segment::Cached:
   case Context::Segment::Direct:
-    return vaddr & context.physMask;
+    return vaddr & 0x1fff'ffff;
+  case Context::Segment::Cached32:
+  case Context::Segment::Direct32:
+    return vaddr & 0xffff'ffff;
   }
   unreachable;
 }
 
-auto CPU::fetch(u64 vaddr) -> u32 {
+template<u32 Size>
+inline auto CPU::busWrite(u32 address, u64 data) -> void {  //performs bus.write for address/data, then calls step() with the cycle count
+  u32 cycles = 0;  //NOTE(review): presumably bus.write takes this by reference and fills it in; confirm against the bus declaration
+  bus.write<Size>(address, data, cycles);
+  step(cycles);
+}
+
+template<u32 Size>
+inline auto CPU::busRead(u32 address) -> u64 {  //performs bus.read for address, calls step() with the cycle count, and returns the value read
+  u32 cycles = 0; u64 data;  //NOTE(review): presumably bus.read takes cycles by reference and fills it in; confirm against the bus declaration
+  data = bus.read<Size>(address, cycles);
+  return step(cycles), data;  //comma expression: step(cycles) is evaluated first, data is the returned value
+}
+
+auto CPU::fetch(u64 vaddr) -> maybe<u32> {
+  if(vaddrAlignedError<Word>(vaddr, false)) return nothing;
   switch(segment(vaddr)) {
   case Context::Segment::Unused:
     step(1);
     addressException(vaddr);
     exception.addressLoad();
-    return 0;  //nop
+    return nothing;
   case Context::Segment::Mapped:
     if(auto match = tlb.load(vaddr)) {
       if(match.cache) return icache.fetch(match.address & context.physMask, cpu);
       step(1);
-      return bus.read<Word>(match.address & context.physMask);
+      return busRead<Word>(match.address & context.physMask);
     }
     step(1);
     addressException(vaddr);
-    return 0;  //nop
+    return nothing;
   case Context::Segment::Cached:
-    return icache.fetch(vaddr & context.physMask, cpu);
+    return icache.fetch(vaddr & 0x1fff'ffff, cpu);
+  case Context::Segment::Cached32:
+    return icache.fetch(vaddr & 0xffff'ffff, cpu);
   case Context::Segment::Direct:
     step(1);
-    return bus.read<Word>(vaddr & context.physMask);
+    return busRead<Word>(vaddr & 0x1fff'ffff);
+  case Context::Segment::Direct32:
+    step(1);
+    return busRead<Word>(vaddr & 0xffff'ffff);
   }
 
   unreachable;
@@ -133,21 +157,7 @@ auto CPU::fetch(u64 vaddr) -> u32 {
 
 template<u32 Size>
 auto CPU::read(u64 vaddr) -> maybe<u64> {
-  if constexpr(Accuracy::CPU::AddressErrors) {
-    if(unlikely(vaddr & Size - 1)) {
-      step(1);
-      addressException(vaddr);
-      exception.addressLoad();
-      return nothing;
-    }
-    if (context.bits == 32 && unlikely((s32)vaddr != vaddr)) {
-      step(1);
-      addressException(vaddr);
-      exception.addressLoad();
-      return nothing;      
-    }
-  }
-
+  if(vaddrAlignedError<Size>(vaddr, false)) return nothing;
   switch(segment(vaddr)) {
   case Context::Segment::Unused:
     step(1);
@@ -158,16 +168,21 @@ auto CPU::read(u64 vaddr) -> maybe<u64> {
     if(auto match = tlb.load(vaddr)) {
       if(match.cache) return dcache.read<Size>(match.address & context.physMask);
       step(1);
-      return bus.read<Size>(match.address & context.physMask);
+      return busRead<Size>(match.address & context.physMask);
     }
     step(1);
     addressException(vaddr);
     return nothing;
   case Context::Segment::Cached:
-    return dcache.read<Size>(vaddr & context.physMask);
+    return dcache.read<Size>(vaddr & 0x1fff'ffff);
+  case Context::Segment::Cached32:
+    return dcache.read<Size>(vaddr & 0xffff'ffff);
   case Context::Segment::Direct:
     step(1);
-    return bus.read<Size>(vaddr & context.physMask);
+    return busRead<Size>(vaddr & 0x1fff'ffff);
+  case Context::Segment::Direct32:
+    step(1);
+    return busRead<Size>(vaddr & 0xffff'ffff);
   }
 
   unreachable;
@@ -175,21 +190,7 @@ auto CPU::read(u64 vaddr) -> maybe<u64> {
 
 template<u32 Size>
 auto CPU::write(u64 vaddr, u64 data) -> bool {
-  if constexpr(Accuracy::CPU::AddressErrors) {
-    if(unlikely(vaddr & Size - 1)) {
-      step(1);
-      addressException(vaddr);
-      exception.addressStore();
-      return false;
-    }
-    if (context.bits == 32 && unlikely((s32)vaddr != vaddr)) {
-      step(1);
-      addressException(vaddr);
-      exception.addressStore();
-      return false;
-    }
-  }
-
+  if(vaddrAlignedError<Size>(vaddr, true)) return false;
   switch(segment(vaddr)) {
   case Context::Segment::Unused:
     step(1);
@@ -200,21 +201,47 @@ auto CPU::write(u64 vaddr, u64 data) -> bool {
     if(auto match = tlb.store(vaddr)) {
       if(match.cache) return dcache.write<Size>(match.address & context.physMask, data), true;
       step(1);
-      return bus.write<Size>(match.address & context.physMask, data), true;
+      return busWrite<Size>(match.address & context.physMask, data), true;
     }
     step(1);
     addressException(vaddr);
     return false;
   case Context::Segment::Cached:
-    return dcache.write<Size>(vaddr & context.physMask, data), true;
+    return dcache.write<Size>(vaddr & 0x1fff'ffff, data), true;
+  case Context::Segment::Cached32:
+    return dcache.write<Size>(vaddr & 0xffff'ffff, data), true;
   case Context::Segment::Direct:
     step(1);
-    return bus.write<Size>(vaddr & context.physMask, data), true;
+    return busWrite<Size>(vaddr & 0x1fff'ffff, data), true;
+  case Context::Segment::Direct32:
+    step(1);
+    return busWrite<Size>(vaddr & 0xffff'ffff, data), true;
   }
 
   unreachable;
 }
 
+template<u32 Size>
+auto CPU::vaddrAlignedError(u64 vaddr, bool write) -> bool {  //returns true after raising an address error for vaddr, false when no error was raised
+  if constexpr(Accuracy::CPU::AddressErrors) {  //both checks are compiled in only when this accuracy flag is set
+    if(unlikely(vaddr & Size - 1)) {  //parses as vaddr & (Size - 1); nonzero low bits mean misaligned (assumes Size is a power of two)
+      step(1);
+      addressException(vaddr);
+      if(write) exception.addressStore();  //write selects the store exception, otherwise the load exception
+      else exception.addressLoad();
+      return true;
+    }
+    if (context.bits == 32 && unlikely((s32)vaddr != vaddr)) {  //when context.bits is 32: vaddr must equal its low 32 bits sign-extended
+      step(1);
+      addressException(vaddr);
+      if(write) exception.addressStore();  //same store/load selection as the alignment case
+      else exception.addressLoad();
+      return true;
+    }
+  }
+  return false;  //no error raised (always the result when the accuracy flag is off)
+}
+
 auto CPU::addressException(u64 vaddr) -> void {
   scc.badVirtualAddress = vaddr;
   scc.tlb.virtualAddress.bit(13,39) = vaddr >> 13;
diff --git a/waterbox/ares64/ares/ares/n64/cpu/recompiler.cpp b/waterbox/ares64/ares/ares/n64/cpu/recompiler.cpp
index 3e660414d9..fb0fc61319 100644
--- a/waterbox/ares64/ares/ares/n64/cpu/recompiler.cpp
+++ b/waterbox/ares64/ares/ares/n64/cpu/recompiler.cpp
@@ -4,17 +4,15 @@ auto CPU::Recompiler::pool(u32 address) -> Pool* {
   return pool;
 }
 
-auto CPU::Recompiler::block(u32 address) -> Block& {
-  auto pool = this->pool(address);
-  auto& block = pool->blocks[address >> 2 & 0x3f];
-  if(block.code) return block;
-  emit(address, block);
-  pool->dirty = true;
+auto CPU::Recompiler::block(u32 address) -> Block* {  //returns the block stored for address, emitting and storing a new one when the slot is empty
+  if(auto block = pool(address)->blocks[address >> 2 & 0x3f]) return block;  //slot already holds a block: return it as is
+  auto block = emit(address);  //new variable; the one declared in the if condition is out of scope here
+  pool(address)->blocks[address >> 2 & 0x3f] = block;  //pool(address) is looked up again after emit(); NOTE(review): presumably because emit() can flush and reset - confirm
   memory::jitprotect(true);
   return block;
 }
 
-auto CPU::Recompiler::emit(u32 address, Block& block) -> void {
+auto CPU::Recompiler::emit(u32 address) -> Block* {
   if(unlikely(allocator.available() < 1_MiB)) {
     print("CPU allocator flush\n");
     memory::jitprotect(false);
@@ -23,11 +21,13 @@ auto CPU::Recompiler::emit(u32 address, Block& block) -> void {
     reset();
   }
 
+  auto block = (Block*)allocator.acquire(sizeof(Block));
   beginFunction(3);
 
+  u32 memCycles;
   bool hasBranched = 0;
   while(true) {
-    u32 instruction = bus.read<Word>(address);
+    u32 instruction = bus.read<Word>(address, memCycles);
     bool branched = emitEXECUTE(instruction);
     if(unlikely(instruction == 0x1000'ffff)) {
       //accelerate idle loops
@@ -43,9 +43,10 @@ auto CPU::Recompiler::emit(u32 address, Block& block) -> void {
   jumpEpilog();
 
   memory::jitprotect(false);
-  block.code = endFunction();
+  block->code = endFunction();
 
 //print(hex(PC, 8L), " ", instructions, " ", size(), "\n");
+  return block;
 }
 
 #define Sa  (instruction >>  6 & 31)
@@ -284,7 +285,7 @@ auto CPU::Recompiler::emitEXECUTE(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x1c ... 0x1f: {
+  case range4(0x1c, 0x1f): {
     call(&CPU::INVALID);
     return 1;
   }
@@ -599,9 +600,7 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //SLLV Rd,Rt,Rs
   case 0x04: {
-    mov32(reg(0), mem(Rt32));
-    and32(reg(1), mem(Rs32), imm(31));
-    shl32(reg(0), reg(0), reg(1));
+    mshl32(reg(0), mem(Rt32), mem(Rs32));
     mov64_s32(reg(0), reg(0));
     mov64(mem(Rd), reg(0));
     return 0;
@@ -615,9 +614,7 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //SRLV Rd,Rt,RS
   case 0x06: {
-    mov32(reg(0), mem(Rt32));
-    and32(reg(1), mem(Rs32), imm(31));
-    lshr32(reg(0), reg(0), reg(1));
+    mlshr32(reg(0), mem(Rt32), mem(Rs32));
     mov64_s32(reg(0), reg(0));
     mov64(mem(Rd), reg(0));
     return 0;
@@ -625,7 +622,7 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //SRAV Rd,Rt,Rs
   case 0x07: {
-    and32(reg(1), mem(Rs32), imm(31));
+    and64(reg(1), mem(Rs), imm(31));
     ashr64(reg(0), mem(Rt), reg(1));
     mov64_s32(reg(0), reg(0));
     mov64(mem(Rd), reg(0));
@@ -648,7 +645,7 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x0a ... 0x0b: {
+  case range2(0x0a, 0x0b): {
     call(&CPU::INVALID);
     return 1;
   }
@@ -853,13 +850,13 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
   //NOR Rd,Rs,Rt
   case 0x27: {
     or64(reg(0), mem(Rs), mem(Rt));
-    not64(reg(0), reg(0));
+    xor64(reg(0), reg(0), imm(-1));
     mov64(mem(Rd), reg(0));
     return 0;
   }
 
   //INVALID
-  case 0x28 ... 0x29: {
+  case range2(0x28, 0x29): {
     call(&CPU::INVALID);
     return 1;
   }
@@ -1081,7 +1078,7 @@ auto CPU::Recompiler::emitREGIMM(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x04 ... 0x07: {
+  case range4(0x04, 0x07): {
     call(&CPU::INVALID);
     return 1;
   }
@@ -1179,7 +1176,7 @@ auto CPU::Recompiler::emitREGIMM(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x14 ... 0x1f: {
+  case range12(0x14, 0x1f): {
     call(&CPU::INVALID);
     return 1;
   }
@@ -1209,7 +1206,7 @@ auto CPU::Recompiler::emitSCC(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x02 ... 0x03: {
+  case range2(0x02, 0x03): {
     call(&CPU::INVALID);
     return 1;
   }
@@ -1231,7 +1228,7 @@ auto CPU::Recompiler::emitSCC(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x06 ... 0x0f: {
+  case range10(0x06, 0x0f): {
     call(&CPU::INVALID);
     return 1;
   }
@@ -1348,7 +1345,7 @@ auto CPU::Recompiler::emitFPU(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x09 ... 0x0f: {
+  case range7(0x09, 0x0f): {
     call(&CPU::INVALID);
     return 1;
   }
@@ -1951,12 +1948,12 @@ auto CPU::Recompiler::emitFPU(u32 instruction) -> bool {
 
   if((instruction >> 21 & 31) == 20)
   switch(instruction & 0x3f) {    
-  case 0x08 ... 0x0f: {
+  case range8(0x08, 0x0f): {
     call(&CPU::COP1UNIMPLEMENTED);
     return 1;
   }
 
-  case 0x24 ... 0x25: {
+  case range2(0x24, 0x25): {
     call(&CPU::COP1UNIMPLEMENTED);
     return 1;
   }
@@ -1981,11 +1978,11 @@ auto CPU::Recompiler::emitFPU(u32 instruction) -> bool {
 
   if((instruction >> 21 & 31) == 21)
   switch(instruction & 0x3f) {
-  case 0x08 ... 0x0f: {
+  case range8(0x08, 0x0f): {
     call(&CPU::COP1UNIMPLEMENTED);
     return 1;
   }
-  case 0x24 ... 0x25: {
+  case range2(0x24, 0x25): {
     call(&CPU::COP1UNIMPLEMENTED);
     return 1;
   }
@@ -2069,7 +2066,7 @@ auto CPU::Recompiler::emitCOP2(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x07 ... 0x0f: {
+  case range9(0x07, 0x0f): {
     call(&CPU::COP2INVALID);
     return 1;
   }
diff --git a/waterbox/ares64/ares/ares/n64/cpu/serialization.cpp b/waterbox/ares64/ares/ares/n64/cpu/serialization.cpp
index 56442a6476..48f9d03cfd 100644
--- a/waterbox/ares64/ares/ares/n64/cpu/serialization.cpp
+++ b/waterbox/ares64/ares/ares/n64/cpu/serialization.cpp
@@ -116,6 +116,7 @@ auto CPU::serialize(serializer& s) -> void {
   s(scc.tagLo.physicalAddress);
   s(scc.epcError);
   s(scc.latch);
+  s(scc.nmiPending);
 
   for(auto& r : fpu.r) s(r.u64);
   s(fpu.csr.roundMode);
diff --git a/waterbox/ares64/ares/ares/n64/dd/dd.cpp b/waterbox/ares64/ares/ares/n64/dd/dd.cpp
index b291be4593..316e53903e 100644
--- a/waterbox/ares64/ares/ares/n64/dd/dd.cpp
+++ b/waterbox/ares64/ares/ares/n64/dd/dd.cpp
@@ -93,10 +93,12 @@ auto DD::disconnect() -> void {
 }
 
 auto DD::save() -> void {
-  /*if(disk)
+#if false
+  if(disk)
   if(auto fp = pak->write("program.disk")) {
     disk.save(fp);
-  }*/
+  }
+#endif
   
   rtcSave();
 }
diff --git a/waterbox/ares64/ares/ares/n64/dd/dd.hpp b/waterbox/ares64/ares/ares/n64/dd/dd.hpp
index a6abe053e2..d6b9f2a7e8 100644
--- a/waterbox/ares64/ares/ares/n64/dd/dd.hpp
+++ b/waterbox/ares64/ares/ares/n64/dd/dd.hpp
@@ -1,6 +1,8 @@
 //Disk Drive
 
-struct DD : Memory::IO<DD> {
+#include <nall/bcd.hpp>
+
+struct DD : Memory::PI<DD> {
   Node::Object obj;
   Node::Port port;
   Node::Peripheral node;
@@ -65,6 +67,8 @@ struct DD : Memory::IO<DD> {
   auto rtcTickSecond() -> void;
 
   //io.cpp
+  auto readHalf(u32 address) -> u16;
+  auto writeHalf(u32 address, u16 data) -> void;
   auto readWord(u32 address) -> u32;
   auto writeWord(u32 address, u32 data) -> void;
 
@@ -76,13 +80,6 @@ struct DD : Memory::IO<DD> {
     string cic;
   } information;
 
-  struct BCD {
-    static auto encode(u8 value) -> u8 { return value / 10 << 4 | value % 10; }
-    static auto decode(u8 value) -> u8 { return (value >> 4) * 10 + (value & 15); }
-  };
-
-  std::function<u64()> rtcCallback = []() { return 0; };
-
 private:
   struct Interrupt {
     b1 line = 0;
diff --git a/waterbox/ares64/ares/ares/n64/dd/drive.cpp b/waterbox/ares64/ares/ares/n64/dd/drive.cpp
index 7eef152641..33768765a7 100644
--- a/waterbox/ares64/ares/ares/n64/dd/drive.cpp
+++ b/waterbox/ares64/ares/ares/n64/dd/drive.cpp
@@ -20,7 +20,7 @@ auto DD::seekSector(n8 sector) -> u32 {
   n1 headCalc = io.currentTrack.bit(12);
 
   u32 startOffsetTable[16] = {0x0,0x5F15E0,0xB79D00,0x10801A0,0x1523720,0x1963D80,0x1D414C0,0x20BBCE0,
-	                            0x23196E0,0x28A1E00,0x2DF5DC0,0x3299340,0x36D99A0,0x3AB70E0,0x3E31900,0x4149200};
+                              0x23196E0,0x28A1E00,0x2DF5DC0,0x3299340,0x36D99A0,0x3AB70E0,0x3E31900,0x4149200};
   
   u16 trackPhysicalTable[] = {0x000, 0x09E, 0x13C, 0x1D1, 0x266, 0x2FB, 0x390, 0x425};
   u16 blockSizeTable[] = {0x4D08, 0x47B8, 0x4510, 0x3FC0, 0x3A70, 0x3520, 0x2FD0, 0x2A80, 
diff --git a/waterbox/ares64/ares/ares/n64/dd/io.cpp b/waterbox/ares64/ares/ares/n64/dd/io.cpp
index 1ed95c4d19..a85269edd9 100644
--- a/waterbox/ares64/ares/ares/n64/dd/io.cpp
+++ b/waterbox/ares64/ares/ares/n64/dd/io.cpp
@@ -1,31 +1,31 @@
-auto DD::readWord(u32 address) -> u32 {
-  address = (address & 0x7f) >> 2;
-  n32 data;
+auto DD::readHalf(u32 address) -> u16 {
+  address = (address & 0x7f) >> 1;
+  n16 data = 0;
 
   //ASIC_DATA
   if(address == 0) {
-    data.bit(16,31) = io.data;
+    data.bit(0,15) = io.data;
   }
 
   //ASIC_MISC_REG
-  if(address == 1) {
+  if(address == 2) {
   }
 
   //ASIC_STATUS
-  if(address == 2) {
-    data.bit(16) = io.status.diskChanged;
-    data.bit(17) = io.status.mechaError;
-    data.bit(18) = io.status.writeProtect;
-    data.bit(19) = io.status.headRetracted;
-    data.bit(20) = io.status.spindleMotorStopped;
-    data.bit(22) = io.status.resetState;
-    data.bit(23) = io.status.busyState;
-    data.bit(24) = (bool)disk; //disk present
-    data.bit(25) = irq.mecha.line;
-    data.bit(26) = irq.bm.line;
-    data.bit(27) = io.bm.error;
-    data.bit(28) = io.status.requestC2Sector;
-    data.bit(30) = io.status.requestUserSector;
+  if(address == 4) {
+    data.bit(0) = io.status.diskChanged;
+    data.bit(1) = io.status.mechaError;
+    data.bit(2) = io.status.writeProtect;
+    data.bit(3) = io.status.headRetracted;
+    data.bit(4) = io.status.spindleMotorStopped;
+    data.bit(6) = io.status.resetState;
+    data.bit(7) = io.status.busyState;
+    data.bit(8) = (bool)disk; //disk present
+    data.bit(9) = irq.mecha.line;
+    data.bit(10) = irq.bm.line;
+    data.bit(11) = io.bm.error;
+    data.bit(12) = io.status.requestC2Sector;
+    data.bit(14) = io.status.requestUserSector;
 
     //acknowledge bm interrupt (tested on real hardware)
     if(irq.bm.line) {
@@ -36,133 +36,133 @@ auto DD::readWord(u32 address) -> u32 {
   }
 
   //ASIC_CUR_TK
-  if(address == 3) {
-    data.bit(16,31) = io.currentTrack;
+  if(address == 6) {
+    data.bit(0,15) = io.currentTrack;
   }
 
   //ASIC_BM_STATUS
-  if(address == 4) {
-    data.bit(16) = io.bm.c1Error;
-    data.bit(21) = io.bm.c1Single;
-    data.bit(22) = io.bm.c1Double;
-    data.bit(23) = io.bm.c1Correct;
-    data.bit(24) = io.bm.blockTransfer;
-    data.bit(25) = io.micro.error;
-    data.bit(26) = io.bm.error;
-    data.bit(31) = io.bm.start;
+  if(address == 8) {
+    data.bit(0) = io.bm.c1Error;
+    data.bit(5) = io.bm.c1Single;
+    data.bit(6) = io.bm.c1Double;
+    data.bit(7) = io.bm.c1Correct;
+    data.bit(8) = io.bm.blockTransfer;
+    data.bit(9) = io.micro.error;
+    data.bit(10) = io.bm.error;
+    data.bit(15) = io.bm.start;
   }
 
   //ASIC_ERR_SECTOR
-  if(address == 5) {
-    data.bit(16,23) = io.error.sector;
-    data.bit(24) = io.error.selfStop;
-    data.bit(25) = io.error.clockUnlock;
-    data.bit(26) = ~(bool)disk; //no disk
-    data.bit(27) = io.error.offTrack;
-    data.bit(28) = io.error.overrun;
-    data.bit(29) = io.error.spindle;
-    data.bit(30) = io.micro.error;
-    data.bit(31) = io.error.am;
+  if(address == 10) {
+    data.bit(0,7) = io.error.sector;
+    data.bit(8) = io.error.selfStop;
+    data.bit(9) = io.error.clockUnlock;
+    data.bit(10) = ~(bool)disk; //no disk
+    data.bit(11) = io.error.offTrack;
+    data.bit(12) = io.error.overrun;
+    data.bit(13) = io.error.spindle;
+    data.bit(14) = io.micro.error;
+    data.bit(15) = io.error.am;
   }
 
   //ASIC_SEQ_STATUS
-  if(address == 6) {
+  if(address == 12) {
   }
 
   //ASIC_CUR_SECTOR
-  if(address == 7) {
-    data.bit(24,31) = io.currentSector;
-    data.bit(16,23) = 0xc3;
+  if(address == 14) {
+    data.bit(8,15) = io.currentSector;
+    data.bit(0,7) = 0xc3;
   }
 
   //ASIC_HARD_RESET
-  if(address == 8) {
+  if(address == 16) {
   }
 
   //ASIC_C1_S0
-  if(address == 9) {
+  if(address == 18) {
   }
 
   //ASIC_HOST_SECBYTE
-  if(address == 10) {
-    data.bit(16,23) = io.sectorSizeBuf;
+  if(address == 20) {
+    data.bit(0,7) = io.sectorSizeBuf;
   }
 
   //ASIC_C1_S2
-  if(address == 11) {
+  if(address == 22) {
   }
 
   //ASIC_SEC_BYTE
-  if(address == 12) {
-    data.bit(16,23) = io.sectorSize;
-    data.bit(24,31) = io.sectorBlock;
+  if(address == 24) {
+    data.bit(0,7) = io.sectorSize;
+    data.bit(8,15) = io.sectorBlock;
   }
 
   //ASIC_C1_S4
-  if(address == 13) {
+  if(address == 26) {
   }
 
   //ASIC_C1_S6
-  if(address == 14) {
+  if(address == 28) {
   }
 
   //ASIC_CUR_ADDRESS
-  if(address == 15) {
+  if(address == 30) {
   }
 
   //ASIC_ID_REG
-  if(address == 16) {
-    data.bit(16,31) = io.id;
+  if(address == 32) {
+    data.bit(0,15) = io.id;
   }
 
   //ASIC_TEST_REG
-  if(address == 17) {
+  if(address == 34) {
   }
 
   //ASIC_TEST_PIN_SEL
-  if(address == 18) {
+  if(address == 36) {
   }
 
   debugger.io(Read, address, data);
   return data;
 }
 
-auto DD::writeWord(u32 address, u32 data_) -> void {
-  address = (address & 0x7f) >> 2;
-  n32 data = data_;
+auto DD::writeHalf(u32 address, u16 data_) -> void {
+  address = (address & 0x7f) >> 1;
+  n16 data = data_;
 
   //ASIC_DATA
   if(address == 0) {
-    io.data = data.bit(16,31);
+    io.data = data.bit(0,15);
   }
 
   //ASIC_MISC_REG
-  if(address == 1) {
+  if(address == 2) {
   }
 
   //ASIC_CMD
-  if(address == 2) {
-    command(data.bit(16,31));
+  if(address == 4) {
+    command(data.bit(0,15));
   }
 
   //ASIC_CUR_TK
-  if(address == 3) {
+  if(address == 6) {
   }
 
   //ASIC_BM_CTL
-  if(address == 4) {
-    io.bm.reset |= data.bit(28);
-    io.bm.readMode = data.bit(30);
-    //irq.bm.mask = ~data.bit(29);
-    io.bm.disableORcheck = data.bit(27);
-    io.bm.disableC1Correction = data.bit(26);
-    io.bm.blockTransfer = data.bit(25);
-    if (data.bit(24)) {
+  if(address == 8) {
+    io.bm.reset |= data.bit(12);
+    io.bm.readMode = data.bit(14);
+    //irq.bm.mask = ~data.bit(13);
+    io.bm.disableORcheck = data.bit(11);
+    io.bm.disableC1Correction = data.bit(10);
+    io.bm.blockTransfer = data.bit(9);
+    if (data.bit(8)) {
       //mecha int reset
       lower(IRQ::MECHA);
     }
-    io.currentSector = data.bit(16,23);
-    if (!data.bit(28) && io.bm.reset) {
+    io.currentSector = data.bit(0,7);
+    if (!data.bit(12) && io.bm.reset) {
       //BM reset
       io.bm.start = 0;
       io.bm.error = 0;
@@ -172,76 +172,88 @@ auto DD::writeWord(u32 address, u32 data_) -> void {
       lower(IRQ::BM);
     }
 
-    if(data.bit(31) && disk) {
+    if(data.bit(15) && disk) {
       //start BM
-      io.bm.start |= data.bit(31);
+      io.bm.start |= data.bit(15);
       //TODO: proper research into seek and access times
       queue.insert(Queue::DD_BM_Request, 50'000 + (io.currentTrack.bit(0,11) / 15));
     }
   }
 
   //ASIC_ERR_SECTOR
-  if(address == 5) {
+  if(address == 10) {
   }
 
   //ASIC_SEQ_CTL
-  if(address == 6) {
-    io.micro.enable = data.bit(30);
+  if(address == 12) {
+    io.micro.enable = data.bit(14);
   }
 
   //ASIC_CUR_SECTOR
-  if(address == 7) {
+  if(address == 14) {
   }
 
   //ASIC_HARD_RESET
-  if(address == 8) {
-    if((data >> 16) == 0xAAAA) {
+  if(address == 16) {
+    if(data == 0xAAAA) {  //data is the 16-bit register value here; shifting it by 16 would always yield 0
       power(true);
     }
   }
 
   //ASIC_C1_S0
-  if(address == 9) {
+  if(address == 18) {
   }
 
   //ASIC_HOST_SECBYTE
-  if(address == 10) {
-    io.sectorSizeBuf = data.bit(16,23);
+  if(address == 20) {
+    io.sectorSizeBuf = data.bit(0,7);
   }
 
   //ASIC_C1_S2
-  if(address == 11) {
-    io.sectorSize = data.bit(16,23);
-    io.sectorBlock = data.bit(24,31);
+  if(address == 22) {
+    io.sectorSize = data.bit(0,7);
+    io.sectorBlock = data.bit(8,15);
   }
 
   //ASIC_SEC_BYTE
-  if(address == 12) {
+  if(address == 24) {
   }
 
   //ASIC_C1_S4
-  if(address == 13) {
+  if(address == 26) {
   }
 
   //ASIC_C1_S6
-  if(address == 14) {
+  if(address == 28) {
   }
 
   //ASIC_CUR_ADDRESS
-  if(address == 15) {
+  if(address == 30) {
   }
 
   //ASIC_ID_REG
-  if(address == 16) {
+  if(address == 32) {
   }
 
   //ASIC_TEST_REG
-  if(address == 17) {
+  if(address == 34) {
   }
 
   //ASIC_TEST_PIN_SEL
-  if(address == 18) {
+  if(address == 36) {
   }
 
   debugger.io(Write, address, data);
 }
+
+auto DD::readWord(u32 address) -> u32 {
+  n32 data;
+  data.bit(16,31) = readHalf(address + 0);
+  data.bit( 0,15) = readHalf(address + 2);
+  return (u32)data;
+}
+
+auto DD::writeWord(u32 address, u32 data) -> void {
+  writeHalf(address + 0, data >> 16);
+  writeHalf(address + 2, data & 0xffff);
+}
diff --git a/waterbox/ares64/ares/ares/n64/dd/rtc.cpp b/waterbox/ares64/ares/ares/n64/dd/rtc.cpp
index 8dae10a970..961a5a85ea 100644
--- a/waterbox/ares64/ares/ares/n64/dd/rtc.cpp
+++ b/waterbox/ares64/ares/ares/n64/dd/rtc.cpp
@@ -1,7 +1,9 @@
 auto DD::rtcLoad() -> void {
-  /*if(auto fp = system.pak->read("time.rtc")) {
+#if false
+  if(auto fp = system.pak->read("time.rtc")) {
     rtc.load(fp);
-  }*/
+  }
+#endif
 
   n64 check = 0;
   for(auto n : range(8)) check.byte(n) = rtc.read<Byte>(n);
@@ -11,17 +13,19 @@ auto DD::rtcLoad() -> void {
   for(auto n : range(8)) timestamp.byte(n) = rtc.read<Byte>(8 + n);
   if(!~timestamp) return;  //new save file
 
-  timestamp = rtcCallback() - timestamp;
+  timestamp = platform->time() - timestamp;
   while(timestamp--) rtcTickSecond();
 }
 
 auto DD::rtcSave() -> void {
-  n64 timestamp = rtcCallback();
+  n64 timestamp = platform->time();
   for(auto n : range(8)) rtc.write<Byte>(8 + n, timestamp.byte(n));
 
-  /*if(auto fp = system.pak->write("time.rtc")) {
+#if false
+  if(auto fp = system.pak->write("time.rtc")) {
     rtc.save(fp);
-  }*/
+  }
+#endif
 }
 
 auto DD::rtcTick(u32 offset) -> void {
diff --git a/waterbox/ares64/ares/ares/n64/dd/serialization.cpp b/waterbox/ares64/ares/ares/n64/dd/serialization.cpp
index a5c84e2439..f9047a15f6 100644
--- a/waterbox/ares64/ares/ares/n64/dd/serialization.cpp
+++ b/waterbox/ares64/ares/ares/n64/dd/serialization.cpp
@@ -1,66 +1,66 @@
 auto DD::serialize(serializer& s) -> void {
-	s(irq.bm.line);
-	s(irq.bm.mask);
-	s(irq.mecha.line);
-	s(irq.mecha.mask);
+  s(irq.bm.line);
+  s(irq.bm.mask);
+  s(irq.mecha.line);
+  s(irq.mecha.mask);
 
-	s(ctl.diskType);
-	s(ctl.error.selfDiagnostic);
-	s(ctl.error.servoNG);
-	s(ctl.error.indexGapNG);
-	s(ctl.error.timeout);
-	s(ctl.error.undefinedCommand);
-	s(ctl.error.invalidParam);
-	s(ctl.error.unknown);
-	s(ctl.standbyDelayDisable);
-	s(ctl.standbyDelay);
-	s(ctl.sleepDelayDisable);
-	s(ctl.sleepDelay);
-	s(ctl.ledOnTime);
-	s(ctl.ledOffTime);
+  s(ctl.diskType);
+  s(ctl.error.selfDiagnostic);
+  s(ctl.error.servoNG);
+  s(ctl.error.indexGapNG);
+  s(ctl.error.timeout);
+  s(ctl.error.undefinedCommand);
+  s(ctl.error.invalidParam);
+  s(ctl.error.unknown);
+  s(ctl.standbyDelayDisable);
+  s(ctl.standbyDelay);
+  s(ctl.sleepDelayDisable);
+  s(ctl.sleepDelay);
+  s(ctl.ledOnTime);
+  s(ctl.ledOffTime);
 
-	s(io.data);
+  s(io.data);
 
-	s(io.status.requestUserSector);
-	s(io.status.requestC2Sector);
-	s(io.status.busyState);
-	s(io.status.resetState);
-	s(io.status.spindleMotorStopped);
-	s(io.status.headRetracted);
-	s(io.status.writeProtect);
-	s(io.status.mechaError);
-	s(io.status.diskChanged);
+  s(io.status.requestUserSector);
+  s(io.status.requestC2Sector);
+  s(io.status.busyState);
+  s(io.status.resetState);
+  s(io.status.spindleMotorStopped);
+  s(io.status.headRetracted);
+  s(io.status.writeProtect);
+  s(io.status.mechaError);
+  s(io.status.diskChanged);
 
-	s(io.currentTrack);
-	s(io.currentSector);
+  s(io.currentTrack);
+  s(io.currentSector);
 
-	s(io.sectorSizeBuf);
-	s(io.sectorSize);
-	s(io.sectorBlock);
-	s(io.id);
+  s(io.sectorSizeBuf);
+  s(io.sectorSize);
+  s(io.sectorBlock);
+  s(io.id);
 
-	s(io.bm.start);
-	s(io.bm.reset);
-	s(io.bm.error);
-	s(io.bm.blockTransfer);
-	s(io.bm.c1Correct);
-	s(io.bm.c1Double);
-	s(io.bm.c1Single);
-	s(io.bm.c1Error);
-	s(io.bm.readMode);
-	s(io.bm.disableORcheck);
-	s(io.bm.disableC1Correction);
+  s(io.bm.start);
+  s(io.bm.reset);
+  s(io.bm.error);
+  s(io.bm.blockTransfer);
+  s(io.bm.c1Correct);
+  s(io.bm.c1Double);
+  s(io.bm.c1Single);
+  s(io.bm.c1Error);
+  s(io.bm.readMode);
+  s(io.bm.disableORcheck);
+  s(io.bm.disableC1Correction);
 
-	s(io.error.am);
-	s(io.error.spindle);
-	s(io.error.overrun);
-	s(io.error.offTrack);
-	s(io.error.clockUnlock);
-	s(io.error.selfStop);
-	s(io.error.sector);
+  s(io.error.am);
+  s(io.error.spindle);
+  s(io.error.overrun);
+  s(io.error.offTrack);
+  s(io.error.clockUnlock);
+  s(io.error.selfStop);
+  s(io.error.sector);
 
-	s(io.micro.enable);
-	s(io.micro.error);
+  s(io.micro.enable);
+  s(io.micro.error);
 
-	s(state.seek);
+  s(state.seek);
 }
diff --git a/waterbox/ares64/ares/ares/n64/memory/bus.hpp b/waterbox/ares64/ares/ares/n64/memory/bus.hpp
index 904487dcbe..b7dcec5952 100644
--- a/waterbox/ares64/ares/ares/n64/memory/bus.hpp
+++ b/waterbox/ares64/ares/ares/n64/memory/bus.hpp
@@ -1,30 +1,30 @@
 template<u32 Size>
-inline auto Bus::read(u32 address) -> u64 {
+inline auto Bus::read(u32 address, u32& cycles) -> u64 {
   static constexpr u64 unmapped = 0;
   address &= 0x1fff'ffff - (Size - 1);
 
   if(address <= 0x007f'ffff) return rdram.ram.read<Size>(address);
   if(address <= 0x03ef'ffff) return unmapped;
-  if(address <= 0x03ff'ffff) return rdram.read<Size>(address);
-  if(address <= 0x0407'ffff) return rsp.read<Size>(address);
-  if(address <= 0x040f'ffff) return rsp.status.read<Size>(address);
-  if(address <= 0x041f'ffff) return rdp.read<Size>(address);
-  if(address <= 0x042f'ffff) return rdp.io.read<Size>(address);
-  if(address <= 0x043f'ffff) return mi.read<Size>(address);
-  if(address <= 0x044f'ffff) return vi.read<Size>(address);
-  if(address <= 0x045f'ffff) return ai.read<Size>(address);
-  if(address <= 0x046f'ffff) return pi.read<Size>(address);
-  if(address <= 0x047f'ffff) return ri.read<Size>(address);
-  if(address <= 0x048f'ffff) return si.read<Size>(address);
+  if(address <= 0x03ff'ffff) return rdram.read<Size>(address, cycles);
+  if(address <= 0x0407'ffff) return rsp.read<Size>(address, cycles);
+  if(address <= 0x040f'ffff) return rsp.status.read<Size>(address, cycles);
+  if(address <= 0x041f'ffff) return rdp.read<Size>(address, cycles);
+  if(address <= 0x042f'ffff) return rdp.io.read<Size>(address, cycles);
+  if(address <= 0x043f'ffff) return mi.read<Size>(address, cycles);
+  if(address <= 0x044f'ffff) return vi.read<Size>(address, cycles);
+  if(address <= 0x045f'ffff) return ai.read<Size>(address, cycles);
+  if(address <= 0x046f'ffff) return pi.read<Size>(address, cycles);
+  if(address <= 0x047f'ffff) return ri.read<Size>(address, cycles);
+  if(address <= 0x048f'ffff) return si.read<Size>(address, cycles);
   if(address <= 0x04ff'ffff) return unmapped;
-  if(address <= 0x1fbf'ffff) return pi.read<Size>(address);
-  if(address <= 0x1fcf'ffff) return pif.read<Size>(address);
-  if(address <= 0x7fff'ffff) return pi.read<Size>(address);
+  if(address <= 0x1fbf'ffff) return pi.read<Size>(address, cycles);
+  if(address <= 0x1fcf'ffff) return si.read<Size>(address, cycles);
+  if(address <= 0x7fff'ffff) return pi.read<Size>(address, cycles);
   return unmapped;
 }
 
 template<u32 Size>
-inline auto Bus::write(u32 address, u64 data) -> void {
+inline auto Bus::write(u32 address, u64 data, u32& cycles) -> void {
   address &= 0x1fff'ffff - (Size - 1);
   if constexpr(Accuracy::CPU::Recompiler) {
     cpu.recompiler.invalidate(address + 0); if constexpr(Size == Dual)
@@ -33,20 +33,20 @@ inline auto Bus::write(u32 address, u64 data) -> void {
 
   if(address <= 0x007f'ffff) return rdram.ram.write<Size>(address, data);
   if(address <= 0x03ef'ffff) return;
-  if(address <= 0x03ff'ffff) return rdram.write<Size>(address, data);
-  if(address <= 0x0407'ffff) return rsp.write<Size>(address, data);
-  if(address <= 0x040f'ffff) return rsp.status.write<Size>(address, data);
-  if(address <= 0x041f'ffff) return rdp.write<Size>(address, data);
-  if(address <= 0x042f'ffff) return rdp.io.write<Size>(address, data);
-  if(address <= 0x043f'ffff) return mi.write<Size>(address, data);
-  if(address <= 0x044f'ffff) return vi.write<Size>(address, data);
-  if(address <= 0x045f'ffff) return ai.write<Size>(address, data);
-  if(address <= 0x046f'ffff) return pi.write<Size>(address, data);
-  if(address <= 0x047f'ffff) return ri.write<Size>(address, data);
-  if(address <= 0x048f'ffff) return si.write<Size>(address, data);
+  if(address <= 0x03ff'ffff) return rdram.write<Size>(address, data, cycles);
+  if(address <= 0x0407'ffff) return rsp.write<Size>(address, data, cycles);
+  if(address <= 0x040f'ffff) return rsp.status.write<Size>(address, data, cycles);
+  if(address <= 0x041f'ffff) return rdp.write<Size>(address, data, cycles);
+  if(address <= 0x042f'ffff) return rdp.io.write<Size>(address, data, cycles);
+  if(address <= 0x043f'ffff) return mi.write<Size>(address, data, cycles);
+  if(address <= 0x044f'ffff) return vi.write<Size>(address, data, cycles);
+  if(address <= 0x045f'ffff) return ai.write<Size>(address, data, cycles);
+  if(address <= 0x046f'ffff) return pi.write<Size>(address, data, cycles);
+  if(address <= 0x047f'ffff) return ri.write<Size>(address, data, cycles);
+  if(address <= 0x048f'ffff) return si.write<Size>(address, data, cycles);
   if(address <= 0x04ff'ffff) return;
-  if(address <= 0x1fbf'ffff) return pi.write<Size>(address, data);
-  if(address <= 0x1fcf'ffff) return pif.write<Size>(address, data);
-  if(address <= 0x7fff'ffff) return pi.write<Size>(address, data);
+  if(address <= 0x1fbf'ffff) return pi.write<Size>(address, data, cycles);
+  if(address <= 0x1fcf'ffff) return si.write<Size>(address, data, cycles);
+  if(address <= 0x7fff'ffff) return pi.write<Size>(address, data, cycles);
   return;
 }
diff --git a/waterbox/ares64/ares/ares/n64/memory/io.hpp b/waterbox/ares64/ares/ares/n64/memory/io.hpp
index 8df9708a5c..aa8fd4b95f 100644
--- a/waterbox/ares64/ares/ares/n64/memory/io.hpp
+++ b/waterbox/ares64/ares/ares/n64/memory/io.hpp
@@ -1,9 +1,13 @@
 template<typename T>
-struct IO {
+struct RCP {  //A device which is part of RCP
+  const u32 DefaultReadCycles = 20;
+  const u32 DefaultWriteCycles = 0;  //not implemented until we implement the CPU write queue
+
   template<u32 Size>
-  auto read(u32 address) -> u64 {
+  auto read(u32 address, u32& cycles) -> u64 {
+    cycles = DefaultReadCycles;
     if constexpr(Size == Byte) {
-      auto data = ((T*)this)->readWord(address);
+      auto data = ((T*)this)->readWord(address, cycles);
       switch(address & 3) {
       case 0: return data >> 24;
       case 1: return data >> 16;
@@ -12,43 +16,92 @@ struct IO {
       }
     }
     if constexpr(Size == Half) {
-      auto data = ((T*)this)->readWord(address);
+      auto data = ((T*)this)->readWord(address, cycles);
       switch(address & 2) {
       case 0: return data >> 16;
       case 2: return data >>  0;
       }
     }
     if constexpr(Size == Word) {
-      return ((T*)this)->readWord(address);
+      return ((T*)this)->readWord(address, cycles);
     }
     if constexpr(Size == Dual) {
-      u64 data = ((T*)this)->readWord(address);
-      return data << 32 | ((T*)this)->readWord(address + 4);
+      u64 data = ((T*)this)->readWord(address, cycles);
+      return data << 32 | ((T*)this)->readWord(address + 4, cycles);
     }
     unreachable;
   }
 
   template<u32 Size>
-  auto write(u32 address, u64 data) -> void {
+  auto write(u32 address, u64 data, u32& cycles) -> void {
+    cycles = DefaultWriteCycles;
     if constexpr(Size == Byte) {
       switch(address & 3) {
-      case 0: return ((T*)this)->writeWord(address, data << 24);
-      case 1: return ((T*)this)->writeWord(address, data << 16);
-      case 2: return ((T*)this)->writeWord(address, data <<  8);
-      case 3: return ((T*)this)->writeWord(address, data <<  0);
+      case 0: return ((T*)this)->writeWord(address, data << 24, cycles);
+      case 1: return ((T*)this)->writeWord(address, data << 16, cycles);
+      case 2: return ((T*)this)->writeWord(address, data <<  8, cycles);
+      case 3: return ((T*)this)->writeWord(address, data <<  0, cycles);
       }
     }
     if constexpr(Size == Half) {
       switch(address & 2) {
-      case 0: return ((T*)this)->writeWord(address, data << 16);
-      case 2: return ((T*)this)->writeWord(address, data <<  0);
+      case 0: return ((T*)this)->writeWord(address, data << 16, cycles);
+      case 2: return ((T*)this)->writeWord(address, data <<  0, cycles);
       }
     }
     if constexpr(Size == Word) {
-      ((T*)this)->writeWord(address, data);
+      ((T*)this)->writeWord(address, data, cycles);
     }
     if constexpr(Size == Dual) {
-      ((T*)this)->writeWord(address, data >> 32);
+      ((T*)this)->writeWord(address, data >> 32, cycles);
     }
   }
 };
+
+template<typename T>
+struct PI {  //A device which is reachable only behind PI
+  template<u32 Size>
+  auto read(u32 address) -> u64 {
+    static_assert(Size == Half || Size == Word);  //PI bus will do 32-bit (CPU) or 16-bit (DMA) only
+    if constexpr(Size == Half) {
+      return ((T*)this)->readHalf(address);
+    }
+    if constexpr(Size == Word) {
+      return ((T*)this)->readWord(address);
+    }
+    unreachable;
+  }
+  
+  template<u32 Size>
+  auto write(u32 address, u64 data) -> void {
+    static_assert(Size == Half || Size == Word);  //PI bus will do 32-bit (CPU) or 16-bit (DMA) only
+    if constexpr(Size == Half) {
+      return ((T*)this)->writeHalf(address, data);
+    }
+    if constexpr(Size == Word) {
+      return ((T*)this)->writeWord(address, data);
+    }
+    unreachable;
+  }
+};
+
+template<typename T>
+struct SI {  //A device which is reachable only behind SI
+  template<u32 Size>
+  auto read(u32 address) -> u64 {
+    static_assert(Size == Word);  //SI bus will do 32-bit (CPU/DMA)
+    if constexpr(Size == Word) {
+      return ((T*)this)->readWord(address);
+    }
+    unreachable;
+  }
+  
+  template<u32 Size>
+  auto write(u32 address, u64 data) -> void {
+    static_assert(Size == Word);  //SI bus will do 32-bit (CPU/DMA)
+    if constexpr(Size == Word) {
+      return ((T*)this)->writeWord(address, data);
+    }
+    unreachable;
+  }
+};
diff --git a/waterbox/ares64/ares/ares/n64/memory/memory.hpp b/waterbox/ares64/ares/ares/n64/memory/memory.hpp
index 20b24ebf66..b00614f4da 100644
--- a/waterbox/ares64/ares/ares/n64/memory/memory.hpp
+++ b/waterbox/ares64/ares/ares/n64/memory/memory.hpp
@@ -31,8 +31,8 @@ namespace Memory {
 
 struct Bus {
   //bus.hpp
-  template<u32 Size> auto read(u32 address) -> u64;
-  template<u32 Size> auto write(u32 address, u64 data) -> void;
+  template<u32 Size> auto read(u32 address, u32& cycles) -> u64;
+  template<u32 Size> auto write(u32 address, u64 data, u32& cycles) -> void;
 };
 
 extern Bus bus;
diff --git a/waterbox/ares64/ares/ares/n64/mi/io.cpp b/waterbox/ares64/ares/ares/n64/mi/io.cpp
index aac5e31753..caaf82bcb6 100644
--- a/waterbox/ares64/ares/ares/n64/mi/io.cpp
+++ b/waterbox/ares64/ares/ares/n64/mi/io.cpp
@@ -1,4 +1,4 @@
-auto MI::readWord(u32 address) -> u32 {
+auto MI::readWord(u32 address, u32& cycles) -> u32 {
   address = (address & 0xfffff) >> 2;
   n32 data;
 
@@ -42,7 +42,7 @@ auto MI::readWord(u32 address) -> u32 {
   return data;
 }
 
-auto MI::writeWord(u32 address, u32 data_) -> void {
+auto MI::writeWord(u32 address, u32 data_, u32& cycles) -> void {
   address = (address & 0xfffff) >> 2;
   n32 data = data_;
 
diff --git a/waterbox/ares64/ares/ares/n64/mi/mi.hpp b/waterbox/ares64/ares/ares/n64/mi/mi.hpp
index 7402933874..e0df7dcfa6 100644
--- a/waterbox/ares64/ares/ares/n64/mi/mi.hpp
+++ b/waterbox/ares64/ares/ares/n64/mi/mi.hpp
@@ -1,6 +1,6 @@
 //MIPS Interface
 
-struct MI : Memory::IO<MI> {
+struct MI : Memory::RCP<MI> {
   Node::Object node;
 
   struct Debugger {
@@ -27,8 +27,8 @@ struct MI : Memory::IO<MI> {
   auto power(bool reset) -> void;
 
   //io.cpp
-  auto readWord(u32 address) -> u32;
-  auto writeWord(u32 address, u32 data) -> void;
+  auto readWord(u32 address, u32& cycles) -> u32;
+  auto writeWord(u32 address, u32 data, u32& cycles) -> void;
 
   //serialization.cpp
   auto serialize(serializer&) -> void;
diff --git a/waterbox/ares64/ares/ares/n64/n64.hpp b/waterbox/ares64/ares/ares/n64/n64.hpp
index 52fc35eece..52b756e396 100644
--- a/waterbox/ares64/ares/ares/n64/n64.hpp
+++ b/waterbox/ares64/ares/ares/n64/n64.hpp
@@ -7,21 +7,18 @@
 #include <ares/ares.hpp>
 #include <nall/float-env.hpp>
 #include <nall/hashset.hpp>
+#include <nall/queue.hpp>
 #include <nall/recompiler/generic/generic.hpp>
 #include <component/processor/sm5k/sm5k.hpp>
 
 #if defined(ARCHITECTURE_AMD64)
 #include <nmmintrin.h>
 using v128 = __m128i;
-#elif defined(ARCHITECTURE_ARM64)
+#elif defined(ARCHITECTURE_ARM64) && !defined(COMPILER_MICROSOFT)
 #include <sse2neon.h>
 using v128 = __m128i;
 #endif
 
-#if defined(VULKAN)
-  #include <n64/vulkan/vulkan.hpp>
-#endif
-
 namespace ares::Nintendo64 {
   auto enumerate() -> vector<string>;
   auto load(Node::System& node, string name) -> bool;
@@ -57,6 +54,8 @@ namespace ares::Nintendo64 {
       PI_BUS_Write,
       SI_DMA_Read,
       SI_DMA_Write,
+      SI_BUS_Write,
+      RTC_Tick,
       DD_Clock_Tick,
       DD_MECHA_Response,
       DD_BM_Request,
@@ -65,10 +64,16 @@ namespace ares::Nintendo64 {
   };
   extern Queue queue;
 
+  struct BCD {
+    static auto encode(u8 value) -> u8 { return value / 10 << 4 | value % 10; }
+    static auto decode(u8 value) -> u8 { return (value >> 4) * 10 + (value & 15); }
+  };
+
   #include <n64/accuracy.hpp>
   #include <n64/memory/memory.hpp>
   #include <n64/system/system.hpp>
   #include <n64/cartridge/cartridge.hpp>
+  #include <n64/cic/cic.hpp>
   #include <n64/controller/controller.hpp>
   #include <n64/dd/dd.hpp>
   #include <n64/mi/mi.hpp>
diff --git a/waterbox/ares64/ares/ares/n64/pi/bus.hpp b/waterbox/ares64/ares/ares/n64/pi/bus.hpp
index 839c39f96b..1ccd057970 100644
--- a/waterbox/ares64/ares/ares/n64/pi/bus.hpp
+++ b/waterbox/ares64/ares/ares/n64/pi/bus.hpp
@@ -1,10 +1,11 @@
-inline auto PI::readWord(u32 address) -> u32 {
+inline auto PI::readWord(u32 address, u32& cycles) -> u32 {  //cycles: caller's counter; this function only ever adds to it
   if(address <= 0x046f'ffff) return ioRead(address);
 
   if (unlikely(io.ioBusy)) {
-    writeForceFinish(); //technically, we should wait until Queue::PI_BUS_Write
+    cycles += writeForceFinish();  //end the pending bus write now and charge the caller whatever writeForceFinish() returns
     return io.busLatch;
   }
+  cycles += 250;  //flat charge added for every read that reaches busRead()
   return busRead<Word>(address);
 }
 
@@ -50,7 +51,7 @@ inline auto PI::busRead(u32 address) -> u32 {
   return unmapped; //accesses here actually lock out the RCP
 }
 
-inline auto PI::writeWord(u32 address, u32 data) -> void {
+inline auto PI::writeWord(u32 address, u32 data, u32& cycles) -> void {
   if(address <= 0x046f'ffff) return ioWrite(address, data);
 
   if(io.ioBusy) return;
@@ -106,7 +107,7 @@ inline auto PI::writeFinished() -> void {
   io.ioBusy = 0;
 }
 
-inline auto PI::writeForceFinish() -> void {
+inline auto PI::writeForceFinish() -> u32 {  //clears ioBusy, drops the queued PI_BUS_Write event and returns queue.remove()'s result
   io.ioBusy = 0;
-  queue.remove(Queue::PI_BUS_Write);
+  return queue.remove(Queue::PI_BUS_Write);  //NOTE(review): presumably the time left on the removed event - confirm in Queue::remove
 }
diff --git a/waterbox/ares64/ares/ares/n64/pi/pi.hpp b/waterbox/ares64/ares/ares/n64/pi/pi.hpp
index 3a1c4fc2e0..6c466604f2 100644
--- a/waterbox/ares64/ares/ares/n64/pi/pi.hpp
+++ b/waterbox/ares64/ares/ares/n64/pi/pi.hpp
@@ -1,6 +1,6 @@
 //Peripheral Interface
 
-struct PI : Memory::IO<PI> {
+struct PI : Memory::RCP<PI> {
   Node::Object node;
 
   struct Debugger {
@@ -28,10 +28,10 @@ struct PI : Memory::IO<PI> {
   auto ioWrite(u32 address, u32 data) -> void;
 
   //bus.hpp
-  auto readWord(u32 address) -> u32;
-  auto writeWord(u32 address, u32 data) -> void;
+  auto readWord(u32 address, u32& cycles) -> u32;
+  auto writeWord(u32 address, u32 data, u32& cycles) -> void;
   auto writeFinished() -> void;
-  auto writeForceFinish() -> void;
+  auto writeForceFinish() -> u32;
   template <u32 Size>
   auto busRead(u32 address) -> u32;
   template <u32 Size>
diff --git a/waterbox/ares64/ares/ares/n64/pif/hle.cpp b/waterbox/ares64/ares/ares/n64/pif/hle.cpp
new file mode 100644
index 0000000000..a35acf3a20
--- /dev/null
+++ b/waterbox/ares64/ares/ares/n64/pif/hle.cpp
@@ -0,0 +1,373 @@
+
+//5-bit CRC of a 16-bit address: address bits are shifted in MSB first, feedback value 0x15
+auto PIF::addressCRC(u16 address) const -> n5 {
+  n5 crc = 0;
+  for(u32 i : range(16)) {
+    const bool carry = crc & 0x10;  //top CRC bit, sampled before the shift drops it
+    crc = crc << 1 | (address >> 15 & 1);
+    address <<= 1;
+    if(carry) crc ^= 0x15;
+  }
+  return crc;
+}
+
+//8-bit CRC (feedback value 0x85) over the first 32 bytes of data, each byte MSB first;
+//a 33rd pass clocks the register eight more times without feeding any data bits
+auto PIF::dataCRC(array_view<u8> data) const -> n8 {
+  n8 crc = 0;
+  for(u32 index : range(33)) {
+    for(u32 shift : reverse(range(8))) {
+      const bool carry = crc & 0x80;  //top CRC bit, sampled before the shift drops it
+      crc <<= 1;
+      if(index < 32 && (data[index] & 1 << shift)) crc |= 1;
+      if(carry) crc ^= 0x85;
+    }
+  }
+  return crc;
+}
+
+auto PIF::descramble(n4 *buf, int size) -> void {  //one in-place descrambling pass over size nibbles; buf[0] is left untouched
+  for(int index = size; --index > 0;) buf[index] -= buf[index - 1] + 1;  //tail first, so buf[index-1] is still unmodified
+}
+
+auto PIF::step(u32 clocks) -> void {  //advances the PIF clock and runs down the boot timeout
+  if(intram.bootTimeout > 0) intram.bootTimeout -= clocks;  //no further countdown once it is zero or negative
+  Thread::clock += clocks;
+}
+
+//command byte accessor (read side): PIF RAM offset 0x3f holds the flag bits
+//that intA() and mainHLE() test
+auto PIF::ramReadCommand() -> u8 { return ram.read<Byte>(0x3f); }
+
+//command byte accessor (write side): stores val at PIF RAM offset 0x3f,
+//the same byte ramReadCommand() returns
+auto PIF::ramWriteCommand(u8 val) -> void { ram.write<Byte>(0x3f, val); }
+
+auto PIF::memSwap(u32 address, n8 &val) -> void {  //exchanges one byte between PIF RAM at address and val
+  const u8 outgoing = (u8)val;         //value that moves into RAM
+  val = ram.read<Byte>(address);       //RAM byte moves out to the caller
+  ram.write<Byte>(address, outgoing);
+}
+
+auto PIF::memSwapSecrets() -> void {  //exchanges osInfo and cpuChecksum with their PIF RAM locations
+  for(u32 n = 0; n < 3; n++) memSwap(0x25 + n, intram.osInfo[n]);       //RAM 0x25-0x27 <-> osInfo
+  for(u32 n = 0; n < 6; n++) memSwap(0x32 + n, intram.cpuChecksum[n]);  //RAM 0x32-0x37 <-> cpuChecksum
+}
+
+//services an access to PIF RAM; io.cpp calls this before reads and after writes.
+//dir: Read or Write (IntADir); size: Size4 for a word access, Size64 for a 64-byte DMA (IntASize)
+auto PIF::intA(bool dir, bool size) -> void {
+  if(dir == Write) {
+    //a write that leaves command bit 0 set asks for the joybus packets to be re-parsed
+    const u8 command = ramReadCommand();
+    if(!(command & 0x01)) return;
+    ramWriteCommand(command & ~0x01);  //clear the request bit before parsing
+    joyInit();
+    joyParse();
+    return;
+  }
+
+  //reads: only 64-byte reads trigger any work
+  if(size != Size64) return;
+  if(ramReadCommand() & 0x02) {
+    challenge();  //command bit 1 selects the CIC challenge instead of the joybus
+    return;
+  }
+  joyRun();
+}
+
+auto PIF::joyInit() -> void {  //marks all five joybus channels as skipped, with no reset pending
+  for(auto& status : intram.joyStatus) {
+    status.reset = 0;
+    status.skip = 1;
+  }
+}
+
+auto PIF::joyParse() -> void {  //scans the packet list in PIF RAM and records, per channel, where its packet starts
+  static constexpr bool Debug = 0;  //set to 1 to print the 64 RAM bytes before parsing
+
+  if constexpr(Debug) {
+    print("joyParse:\n{\n");
+    for(u32 y : range(8)) {
+      print("  ");
+      for(u32 x : range(8)) {
+        print(hex(ram.read<Byte>(y * 8 + x), 2L), " ");
+      }
+      print("\n");
+    }
+    print("}\n");
+  }
+
+  u32 offset = 0;  //read position within PIF RAM
+  n3 channel = 0;  //0-5
+  while(channel < 5 && offset < 64) {
+    n8 send = ram.read<Byte>(offset++);
+    if(send == 0xfe) break;     //end of packets
+    if(send == 0xff) continue;  //alignment padding
+    if(send == 0x00) { channel++; continue; }  // channel skip
+    if(send == 0xfd) { // channel reset
+      intram.joyStatus[channel++].reset = 1;
+      continue;
+    }
+    u32 sendOffset = offset-1;  //RAM offset of this packet's send-length byte
+    n8 recv = ram.read<Byte>(offset++);
+    send &= 0x3f;  //keep the low six bits: the byte counts
+    recv &= 0x3f;
+    offset += send+recv;  //step over the send and receive payloads
+    if(offset < 64) {  //a packet is only registered when it ends before offset 64
+      intram.joyAddress[channel] = sendOffset;
+      intram.joyStatus[channel].skip = 0;
+      channel++;
+    }
+  }
+}
+
+auto PIF::joyRun() -> void {  //executes the packets registered by joyParse(), visiting channels from 4 down to 0
+  static constexpr bool Debug = 0;  //set to 1 to print the 64 RAM bytes after all channels ran
+
+  ControllerPort* controllers[4] = {
+    &controllerPort1,
+    &controllerPort2,
+    &controllerPort3,
+    &controllerPort4,
+  };
+
+  for (i32 channel=4; channel>=0; channel--) {
+    if (intram.joyStatus[channel].reset) {  //reset marker recorded by joyParse()
+      if (channel < 4 && controllers[channel]->device)
+        controllers[channel]->device->reset();
+      continue;
+    }
+    if (intram.joyStatus[channel].skip) continue;
+
+    u32 offset = intram.joyAddress[channel];  //start of the packet: its send-length byte
+    n8 send = ram.read<Byte>(offset++);
+    if(send & 0x80) continue; // skip (another way to do it)
+    if(send & 0x40) { //reset (another way to do it)
+      if (channel < 4 && controllers[channel]->device)
+        controllers[channel]->device->reset();
+      continue;
+    }
+    u32 recvOffset = offset;  //position of the receive-length byte; status flags are written back here
+    n8 recv = ram.read<Byte>(offset++);
+    send &= 0x3f;
+    recv &= 0x3f;
+
+    n8 input[64];  //bytes handed to the device
+    for(u32 index : range(send)) {
+      input[index] = ram.read<Byte>(offset++);
+    }
+    n8 output[64];  //bytes produced by the device
+    b1 valid = 0;  //stays 0 when nothing handles the channel
+    b1 over = 0;
+
+    //controller port communication
+    if (channel < 4 && controllers[channel]->device) {
+      n2 status = controllers[channel]->device->comm(send, recv, input, output);
+      valid = status.bit(0);
+      over = status.bit(1);
+    }
+    //cartridge joybus communication
+    if (channel == 4) {
+      n2 status = cartridge.joybusComm(send, recv, input, output);
+      valid = status.bit(0);
+      over = status.bit(1);
+    }
+
+    if(!valid) ram.write<Byte>(recvOffset, 0x80 | recv);  //no valid reply: set bit 7 of the receive-length byte
+    if(over)   ram.write<Byte>(recvOffset, 0x40 | recv);  //over flag from the device: set bit 6 instead (written last, so it wins)
+    if (valid) {  //the reply is stored right after the send payload
+      for(u32 index : range(recv)) {
+        ram.write<Byte>(offset++, output[index]);
+      }
+    }
+  }
+
+  if constexpr(Debug) {
+    print("joyRun:\n[\n");
+    for(u32 y : range(8)) {
+      print("  ");
+      for(u32 x : range(8)) {
+        print(hex(ram.read<Byte>(y * 8 + x), 2L), " ");
+      }
+      print("\n");
+    }
+    print("]\n");
+  }
+}
+
+//estimates the cost of the joybus packet list currently in PIF RAM by walking
+//the same layout joyParse() reads: a fixed base, a per-packet amount that
+//depends on the channel, and a small amount per single-byte command.
+//NOTE(review): the unit is presumably clock cycles (per the local's name) - confirm at the caller
+auto PIF::estimateTiming() -> u32 {
+  ControllerPort* controllers[4] = {
+    &controllerPort1,
+    &controllerPort2,
+    &controllerPort3,
+    &controllerPort4,
+  };
+
+  u32 cycles = 13600;  //fixed base cost
+  u32 short_cmds = 0;  //number of single-byte commands seen
+
+  u32 offset = 0;
+  u32 channel = 0;
+  while(offset < 64 && channel < 5) {
+    n8 send = ram.read<Byte>(offset++);
+    if(send == 0xfe) { short_cmds++; break; }     //end of packets
+    if(send == 0x00) { short_cmds++; channel++; continue; }  //channel skip
+    if(send == 0xfd) { short_cmds++; channel++; continue;  } //channel reset
+    if(send == 0xff) { short_cmds++; continue;  } //alignment padding
+
+    n8 recv = ram.read<Byte>(offset++);
+
+    //clear flags from lengths
+    send &= 0x3f;
+    recv &= 0x3f;
+
+    //only the packet length matters for the estimate, so step over both
+    //payloads instead of copying the send bytes into a scratch buffer
+    offset += send+recv;
+
+    if (channel < 4) {
+      //a port with a device attached costs more than an empty one
+      cycles += controllers[channel]->device ? 22000 : 18000;
+    } else {
+      //accessories(TBD)
+      cycles += 20000;
+    }
+
+    channel++;
+  }
+
+  cycles += 1420 * short_cmds;
+  return cycles;
+}
+
+auto PIF::challenge() -> void {  //sends the 15 bytes at PIF RAM 0x30 to the CIC and stores its 15-byte reply in their place
+  cic.writeBit(1); cic.writeBit(0); //challenge command
+  cic.readNibble(); //ignore timeout value returned by CIC (we simulate instant response)
+  cic.readNibble(); //timeout high nibble
+  for(u32 address : range(15)) {
+    auto data = ram.read<Byte>(0x30 + address);
+    cic.writeNibble(data >> 4 & 0xf);  //high nibble goes out first
+    cic.writeNibble(data >> 0 & 0xf);
+  }
+  cic.readBit(); //ignore start bit
+  for(u32 address : range(15)) {
+    u8 data = 0;
+    data |= cic.readNibble() << 4;  //reply nibbles are taken high first as well
+    data |= cic.readNibble() << 0;
+    ram.write<Byte>(0x30 + address, data);
+  }
+}
+
+auto PIF::mainHLE() -> void {  //one step of the high-level PIF state machine; each call handles at most one state transition
+  step(10240*8);
+
+  if(likely(state == Run)) {
+    //cicCompare()
+    return;
+  }
+
+  if(state == Init) {
+    n4 hello = cic.readNibble();
+    if (hello.bit(0,1) != 1) {
+      debug(unusual, "[PIF::main] invalid CIC hello message ", hex(hello, 4L));
+      state = Error;
+      return;
+    }
+    if constexpr(Accuracy::PIF::RegionLock) {
+      if(hello.bit(2) != (u32)system.region()) {
+        const char *region[2] = { "NTSC", "PAL" };  //indexed by the region bit; for the cartridge that is hello.bit(2) (hello is only 4 bits wide)
+        debug(unusual, "[PIF::main] CIC region mismatch: console is ", region[(u32)system.region()], " but cartridge is ", region[(int)hello.bit(2)]);
+        state = Error;
+        return;
+      }
+    }
+    n4 osinfo = 0;
+    osinfo.bit(2) = 1;              //"version" bit (unknown, always set)
+    osinfo.bit(3) = hello.bit(3);   //64dd
+
+    n4 buf[6];
+    for (auto i: range(6)) buf[i] = cic.readNibble();
+    for (auto i: range(2)) descramble(buf, 6);  //two descrambling passes over the six nibbles
+
+    intram.osInfo[0].bit(4,7) = buf[0];
+    intram.osInfo[0].bit(0,3) = buf[1];
+    intram.osInfo[1].bit(4,7) = buf[2];
+    intram.osInfo[1].bit(0,3) = buf[3];
+    intram.osInfo[2].bit(4,7) = buf[4];
+    intram.osInfo[2].bit(0,3) = buf[5];
+
+    intram.osInfo[0].bit(0,3) = osinfo;  //replaces the low nibble taken from buf[1] above
+    ramWriteCommand(0x00);
+    memSwapSecrets();  //show osinfo+seeds in external memory
+    state = WaitLockout;
+    return;
+  }
+
+  if(state == WaitLockout && (ramReadCommand() & 0x10)) {  //command bit 4: lock out the PIF ROM
+    io.romLockout = 1;
+    joyInit();
+    state = WaitGetChecksum;
+    return;
+  }
+
+  if(state == WaitGetChecksum && (ramReadCommand() & 0x20)) {  //command bit 5: checksum is ready in RAM
+    memSwapSecrets();  //hide osinfo+seeds, copy+hide checksum to internal memory
+    ramWriteCommand(ramReadCommand() | 0x80);  //set bit 7 of the command byte in response
+    state = WaitCheckChecksum;
+    return;
+  }
+
+  if(state == WaitCheckChecksum && (ramReadCommand() & 0x40)) {  //command bit 6: compare checksums
+    if (true) { // only on cold boot
+      n4 buf[16];
+      for (auto i: range(16)) buf[i] = cic.readNibble();
+      for (auto i: range(4))  descramble(buf, 16);  //four descrambling passes over the sixteen nibbles
+      for (auto i: range(6)) {
+        intram.cicChecksum[i].bit(4,7) = buf[i*2+4];  //checksum nibbles start at buf[4]
+        intram.cicChecksum[i].bit(0,3) = buf[i*2+5];
+      }
+      intram.osInfo[0].bit(1) = 1;  //warm boot (NMI) flag (ready in case a reset is made in the future)
+    }
+
+    for (auto i: range(6)) {
+      u8 data = intram.cpuChecksum[i];
+      if (intram.cicChecksum[i] != data) {
+        debug(unusual, "[PIF::main] invalid IPL2 checksum: ", cic.model, ":",
+          hex(intram.cicChecksum[0], 2L), hex(intram.cicChecksum[1], 2L), hex(intram.cicChecksum[2], 2L),
+          hex(intram.cicChecksum[3], 2L), hex(intram.cicChecksum[4], 2L), hex(intram.cicChecksum[5], 2L),
+          " != cpu:",
+          hex(intram.cpuChecksum[0], 2L), hex(intram.cpuChecksum[1], 2L), hex(intram.cpuChecksum[2], 2L),
+          hex(intram.cpuChecksum[3], 2L), hex(intram.cpuChecksum[4], 2L), hex(intram.cpuChecksum[5], 2L));
+        state = Error;
+        return;
+      }
+    }
+    for (auto i: range(6)) intram.cpuChecksum[i] = 0;
+    state = WaitTerminateBoot;
+    intram.bootTimeout = 6 * 187500000;  //6 seconds
+    return;
+  }
+
+  if(state == WaitTerminateBoot && (ramReadCommand() & 0x08)) {  //command bit 3: boot terminated in time
+    ramWriteCommand(0x00);
+    io.resetEnabled = 1;
+    state = Run;
+    return;
+  }
+
+  if(state == WaitTerminateBoot && intram.bootTimeout <= 0) {  //step() ran the countdown out
+    debug(unusual, "[PIF::main] boot timeout: CPU has not sent the boot termination command within 5 seconds. Halting the CPU");  //NOTE(review): the timer above is armed for 6 seconds
+    state = Error;
+    return;
+  }
+
+  if(state == Error) {
+    cpu.scc.nmiPending = 1;  //raised again on every step while in the Error state
+    return;
+  }
+}
diff --git a/waterbox/ares64/ares/ares/n64/pif/io.cpp b/waterbox/ares64/ares/ares/n64/pif/io.cpp
index 6e3f3b09a4..4d1ac5478f 100644
--- a/waterbox/ares64/ares/ares/n64/pif/io.cpp
+++ b/waterbox/ares64/ares/ares/n64/pif/io.cpp
@@ -1,4 +1,4 @@
-auto PIF::readWord(u32 address) -> u32 {
+auto PIF::readInt(u32 address) -> u32 {
   address &= 0x7ff;
   if(address <= 0x7bf) {
     if(io.romLockout) return 0;
@@ -7,7 +7,7 @@ auto PIF::readWord(u32 address) -> u32 {
   return ram.read<Word>(address);
 }
 
-auto PIF::writeWord(u32 address, u32 data) -> void {
+auto PIF::writeInt(u32 address, u32 data) -> void {
   address &= 0x7ff;
   if(address <= 0x7bf) {
     if(io.romLockout) return;
@@ -15,3 +15,29 @@ auto PIF::writeWord(u32 address, u32 data) -> void {
   }
   return ram.write<Word>(address, data);
 }
+
+auto PIF::readWord(u32 address) -> u32 {  //word read: intA() is notified first, then the data is fetched
+  intA(Read, Size4);  //with Size4, intA() does nothing for reads
+  return readInt(address);
+}
+
+auto PIF::writeWord(u32 address, u32 data) -> void {  //word write: the data is stored first, then intA() is notified
+  writeInt(address, data);  //store before notifying, so intA() sees the updated command byte
+  return intA(Write, Size4);
+}
+
+//copies 64 bytes from the PIF (via readInt) into RDRAM, one word at a time, after notifying intA()
+auto PIF::dmaRead(u32 address, u32 ramAddress) -> void {
+  intA(Read, Size64);
+  for(u32 word : range(16)) {
+    rdram.ram.write<Word>(ramAddress + word * 4, readInt(address + word * 4));
+  }
+}
+
+//copies 64 bytes from RDRAM into the PIF (via writeInt), one word at a time, then notifies intA()
+auto PIF::dmaWrite(u32 address, u32 ramAddress) -> void {
+  for(u32 word : range(16)) {
+    writeInt(address + word * 4, rdram.ram.read<Word>(ramAddress + word * 4));
+  }
+  intA(Write, Size64);
+}
diff --git a/waterbox/ares64/ares/ares/n64/pif/pif.cpp b/waterbox/ares64/ares/ares/n64/pif/pif.cpp
index d7fa83a886..841d82414a 100644
--- a/waterbox/ares64/ares/ares/n64/pif/pif.cpp
+++ b/waterbox/ares64/ares/ares/n64/pif/pif.cpp
@@ -3,6 +3,7 @@
 namespace ares::Nintendo64 {
 
 PIF pif;
+#include "hle.cpp"
 #include "io.cpp"
 #include "debugger.cpp"
 #include "serialization.cpp"
@@ -22,262 +23,13 @@ auto PIF::unload() -> void {
   node.reset();
 }
 
-auto PIF::addressCRC(u16 address) const -> n5 {
-  n5 crc = 0;
-  for(u32 i : range(16)) {
-    n5 xor = crc & 0x10 ? 0x15 : 0x00;
-    crc <<= 1;
-    if(address & 0x8000) crc |= 1;
-    address <<= 1;
-    crc ^= xor;
-  }
-  return crc;
-}
-
-auto PIF::dataCRC(array_view<u8> data) const -> n8 {
-  n8 crc = 0;
-  for(u32 i : range(33)) {
-    for(u32 j : reverse(range(8))) {
-      n8 xor = crc & 0x80 ? 0x85 : 0x00;
-      crc <<= 1;
-      if(i < 32) {
-        if(data[i] & 1 << j) crc |= 1;
-      }
-      crc ^= xor;
-    }
-  }
-  return crc;
-}
-
-auto PIF::run() -> void {
-  auto flags = ram.read<Byte>(0x3f);
-
-  //controller polling
-  if(flags & 0x01) {
-  //todo: this flag is supposed to be cleared, but doing so breaks inputs
-  //flags &= ~0x01;
-    scan();
-  }
-
-  //CIC-NUS-6105 challenge/response
-  if(flags & 0x02) {
-    flags &= ~0x02;
-    challenge();
-  }
-
-  //unknown purpose
-  if(flags & 0x04) {
-    flags &= ~0x04;
-    debug(unimplemented, "[SI::main] flags & 0x04");
-  }
-
-  //must be sent within 5s of the console booting, or SM5 will lock the N64
-  if(flags & 0x08) {
-    flags &= ~0x08;
-  }
-
-  //PIF ROM lockout
-  if(flags & 0x10) {
-    flags &= ~0x10;
-    io.romLockout = 1;
-  }
-
-  //initialization
-  if(flags & 0x20) {
-    flags &= ~0x20;
-    flags |=  0x80;  //set completion flag
-  }
-
-  //clear PIF RAM
-  if(flags & 0x40) {
-    flags &= ~0x40;
-    ram.fill();
-  }
-
-  ram.write<Byte>(0x3f, flags);
-}
-
-auto PIF::scan() -> void {
-  ControllerPort* controllers[4] = {
-    &controllerPort1,
-    &controllerPort2,
-    &controllerPort3,
-    &controllerPort4,
-  };
-
-  static constexpr bool Debug = 0;
-
-  if constexpr(Debug) {
-    print("{\n");
-    for(u32 y : range(8)) {
-      print("  ");
-      for(u32 x : range(8)) {
-        print(hex(ram.read<Byte>(y * 8 + x), 2L), " ");
-      }
-      print("\n");
-    }
-    print("}\n");
-  }
-
-  n3 channel = 0;  //0-5
-  for(u32 offset = 0; offset < 64;) {
-    n8 send = ram.read<Byte>(offset++);
-    if(send == 0x00) { channel++; continue; }
-    if(send == 0xfd) continue;  //channel reset
-    if(send == 0xfe) break;     //end of packets
-    if(send == 0xff) continue;  //alignment padding
-    n8 recvOffset = offset;
-    n8 recv = ram.read<Byte>(offset++);
-    if(recv == 0xfe) break;     //end of packets
-
-    //clear flags from lengths
-    send &= 0x3f;
-    recv &= 0x3f;
-
-    n8 input[64];
-    for(u32 index : range(send)) {
-      input[index] = ram.read<Byte>(offset++);
-    }
-    n8 output[64];
-    b1 valid = 0;
-    b1 over = 0;
-
-    //controller port communication
-    if (channel < 4 && controllers[channel]->device) {
-      n2 status = controllers[channel]->device->comm(send, recv, input, output);
-      valid = status.bit(0);
-      over = status.bit(1);
-    }
-    
-    if (channel >= 4) {
-      //status
-      if(input[0] == 0x00 || input[0] == 0xff) {
-        //cartridge EEPROM (4kbit)
-        if(cartridge.eeprom.size == 512) {
-          output[0] = 0x00;
-          output[1] = 0x80;
-          output[2] = 0x00;
-          valid = 1;
-        }
-
-        //cartridge EEPROM (16kbit)
-        if(cartridge.eeprom.size == 2048) {
-          output[0] = 0x00;
-          output[1] = 0xc0;
-          output[2] = 0x00;
-          valid = 1;
-        }
-      }
-
-      //read EEPROM
-      if(input[0] == 0x04 && send >= 2) {
-        u32 address = input[1] * 8;
-        for(u32 index : range(recv)) {
-          output[index] = cartridge.eeprom.read<Byte>(address++);
-        }
-        valid = 1;
-      }
-
-      //write EEPROM
-      if(input[0] == 0x05 && send >= 2 && recv >= 1) {
-        u32 address = input[1] * 8;
-        for(u32 index : range(send - 2)) {
-          cartridge.eeprom.write<Byte>(address++, input[2 + index]);
-        }
-        output[0] = 0x00;
-        valid = 1;
-      }
-
-      //RTC status
-      if(input[0] == 0x06) {
-        debug(unimplemented, "[SI::main] RTC status");
-      }
-
-      //RTC read
-      if(input[0] == 0x07) {
-        debug(unimplemented, "[SI::main] RTC read");
-      }
-
-      //RTC write
-      if(input[0] == 0x08) {
-        debug(unimplemented, "[SI::main] RTC write");
-      }
-    }
-
-    if(!valid) {
-      ram.write<Byte>(recvOffset, 0x80 | recv & 0x3f);
-    }
-    if(over) {
-      ram.write<Byte>(recvOffset, 0x40 | recv & 0x3f);
-    }
-
-    if (valid) {
-      for(u32 index : range(recv)) {
-        ram.write<Byte>(offset++, output[index]);
-      }
-    }
-    channel++;
-  }
-
-  if constexpr(Debug) {
-    print("[\n");
-    for(u32 y : range(8)) {
-      print("  ");
-      for(u32 x : range(8)) {
-        print(hex(ram.read<Byte>(y * 8 + x), 2L), " ");
-      }
-      print("\n");
-    }
-    print("]\n");
-  }
-}
-
-//CIC-NUS-6105 anti-piracy challenge/response
-auto PIF::challenge() -> void {
-  static n4 lut[32] = {
-    0x4, 0x7, 0xa, 0x7, 0xe, 0x5, 0xe, 0x1,
-    0xc, 0xf, 0x8, 0xf, 0x6, 0x3, 0x6, 0x9,
-    0x4, 0x1, 0xa, 0x7, 0xe, 0x5, 0xe, 0x1,
-    0xc, 0x9, 0x8, 0x5, 0x6, 0x3, 0xc, 0x9,
-  };
-
-  n4 challenge[30];
-  n4 response[30];
-
-  //15 bytes -> 30 nibbles
-  for(u32 address : range(15)) {
-    auto data = ram.read<Byte>(0x30 + address);
-    challenge[address << 1 | 0] = data >> 4;
-    challenge[address << 1 | 1] = data >> 0;
-  }
-
-  n4 key = 0xb;
-  n1 sel = 0;
-  for(u32 address : range(30)) {
-    n4 data = key + 5 * challenge[address];
-    response[address] = data;
-    key = lut[sel << 4 | data];
-    n1 mod = data >> 3;
-    n3 mag = data >> 0;
-    if(mod) mag = ~mag;
-    if(mag % 3 != 1) mod = !mod;
-    if(sel) {
-      if(data == 0x1 || data == 0x9) mod = 1;
-      if(data == 0xb || data == 0xe) mod = 0;
-    }
-    sel = mod;
-  }
-
-  //30 nibbles -> 15 bytes
-  for(u32 address : range(15)) {
-    n8 data = 0;
-    data |= response[address << 1 | 0] << 4;
-    data |= response[address << 1 | 1] << 0;
-    ram.write<Byte>(0x30 + address, data);
-  }
+auto PIF::main() -> void {  //forwards to the high-level PIF emulation in hle.cpp
+  mainHLE();
+}
 
 auto PIF::power(bool reset) -> void {
+  Thread::reset();
+
   string pifrom = Region::PAL() ? "pif.pal.rom" : "pif.ntsc.rom";
   if(auto fp = system.pak->read(pifrom)) {
     rom.load(fp);
@@ -285,27 +37,8 @@ auto PIF::power(bool reset) -> void {
 
   ram.fill();
   io = {};
-
-  //write CIC seeds into PIF RAM so that cartridge checksum function passes
-  string cic = cartridge.node ? cartridge.cic() : dd.cic();
-  n8 seed = 0x3f;
-  n1 version = 0;
-  n1 type = 0;
-  if(cic == "CIC-NUS-6101" || cic == "CIC-NUS-7102") seed = 0x3f, version = 1;
-  if(cic == "CIC-NUS-6102" || cic == "CIC-NUS-7101") seed = 0x3f;
-  if(cic == "CIC-NUS-6103" || cic == "CIC-NUS-7103") seed = 0x78;
-  if(cic == "CIC-NUS-6105" || cic == "CIC-NUS-7105") seed = 0x91;
-  if(cic == "CIC-NUS-6106" || cic == "CIC-NUS-7106") seed = 0x85;
-  if(cic == "CIC-NUS-8303" || cic == "CIC-NUS-8401") seed = 0xdd, type = 1;
-  if(cic == "CIC-NUS-DDUS") seed = 0xde, type = 1;
-
-  n32 data;
-  data.bit(0, 7) = 0x3f;     //CIC IPL2 seed
-  data.bit(8,15) = seed;     //CIC IPL3 seed
-  data.bit(17)   = reset;    //osResetType (0 = power; 1 = reset (NMI))
-  data.bit(18)   = version;  //osVersion
-  data.bit(19)   = type;     //osRomType (0 = Gamepak; 1 = 64DD)
-  ram.write<Word>(0x24, data);
+  intram = {};
+  state = Init;
 }
 
 }
diff --git a/waterbox/ares64/ares/ares/n64/pif/pif.hpp b/waterbox/ares64/ares/ares/n64/pif/pif.hpp
index 0c532aa4fd..a93a3e2424 100644
--- a/waterbox/ares64/ares/ares/n64/pif/pif.hpp
+++ b/waterbox/ares64/ares/ares/n64/pif/pif.hpp
@@ -1,9 +1,14 @@
 //PIF-NUS
 
-struct PIF : Memory::IO<PIF> {
+struct PIF : Thread, Memory::SI<PIF> {
+  enum State : u32 { Init, WaitLockout, WaitGetChecksum, WaitCheckChecksum, WaitTerminateBoot, Run, Error };
+  enum IntADir : bool { Read, Write };
+  enum IntASize : bool { Size4, Size64 };
+
   Node::Object node;
   Memory::Readable rom;
   Memory::Writable ram;
+  u32 state;
 
   struct Debugger {
     //debugger.cpp
@@ -13,28 +18,59 @@ struct PIF : Memory::IO<PIF> {
     struct Memory {
       Node::Debugger::Memory ram;
     } memory;
-
   } debugger;
 
+  struct Intram {  //PIF-internal state that is kept outside the 64-byte RAM block
+    n8 osInfo[3];       //built from CIC nibbles in mainHLE(); exchanged with RAM 0x25-0x27 by memSwapSecrets()
+    n8 cpuChecksum[6];  //exchanged with RAM 0x32-0x37 by memSwapSecrets(); compared against cicChecksum
+    n8 cicChecksum[6];  //read from the CIC in mainHLE()
+    s32 bootTimeout;    //armed in mainHLE(), counted down by step()
+    n8 joyAddress[5];   //per channel: RAM offset of its packet, recorded by joyParse()
+    struct {
+      n1 skip;   //1 = joyRun() ignores the channel
+      n1 reset;  //1 = joyRun() resets the channel's device instead of running a packet
+    } joyStatus[5];
+
+    auto serialize(serializer& s) -> void;
+  } intram;
+
   //pif.cpp
+  auto step(u32 clocks) -> void;
   auto load(Node::Object) -> void;
   auto unload() -> void;
+  auto main() -> void;
+  auto power(bool reset) -> void;
+  auto estimateTiming() -> u32;
+
+  //hle.cpp
+  auto mainHLE() -> void;
   auto addressCRC(u16 address) const -> n5;
   auto dataCRC(array_view<u8> data) const -> n8;
-  auto run() -> void;
-  auto scan() -> void;
+  auto descramble(n4 *buf, int size) -> void;
+  auto ramReadCommand() -> u8;
+  auto ramWriteCommand(u8 val) -> void;
+  auto memSwap(u32 address, n8 &val) -> void;
+  auto memSwapSecrets() -> void;
+  auto joyInit() -> void;
+  auto joyParse() -> void;
+  auto joyRun() -> void;
   auto challenge() -> void;
-  auto power(bool reset) -> void;
+  auto intA(bool dir, bool size) -> void;
 
   //io.cpp
+  auto readInt(u32 address) -> u32;
+  auto writeInt(u32 address, u32 data) -> void;
   auto readWord(u32 address) -> u32;
   auto writeWord(u32 address, u32 data) -> void;
+  auto dmaRead(u32 address, u32 ramAddress) -> void;
+  auto dmaWrite(u32 address, u32 ramAddress) -> void;
 
   //serialization.cpp
   auto serialize(serializer&) -> void;
 
   struct IO {
     n1  romLockout;
+    n1  resetEnabled;
   } io;
 };
 
diff --git a/waterbox/ares64/ares/ares/n64/pif/serialization.cpp b/waterbox/ares64/ares/ares/n64/pif/serialization.cpp
index d10a058372..51ec4ffb13 100644
--- a/waterbox/ares64/ares/ares/n64/pif/serialization.cpp
+++ b/waterbox/ares64/ares/ares/n64/pif/serialization.cpp
@@ -1,5 +1,16 @@
 auto PIF::serialize(serializer& s) -> void {
   s(ram);
-
+  s(state);  //current PIF::State value
+  s(intram);  //fields are handled by PIF::Intram::serialize
   s(io.romLockout);
+  s(io.resetEnabled);  //set by mainHLE() when the boot sequence finishes
+}
+
+auto PIF::Intram::serialize(serializer& s) -> void {  //field order defines the stored layout
+  s(osInfo);
+  s(cpuChecksum);
+  s(cicChecksum);
+  s(bootTimeout);
+  s(joyAddress);
+  for(auto& status : joyStatus) s(status.skip), s(status.reset);  //skip then reset, channel by channel
 }
diff --git a/waterbox/ares64/ares/ares/n64/rdp/io.cpp b/waterbox/ares64/ares/ares/n64/rdp/io.cpp
index 5e7dd7437d..44c241ed64 100644
--- a/waterbox/ares64/ares/ares/n64/rdp/io.cpp
+++ b/waterbox/ares64/ares/ares/n64/rdp/io.cpp
@@ -1,4 +1,4 @@
-auto RDP::readWord(u32 address) -> u32 {
+auto RDP::readWord(u32 address, u32& cycles) -> u32 {
   address = (address & 0xfffff) >> 2;
   n32 data;
 
@@ -56,7 +56,7 @@ auto RDP::readWord(u32 address) -> u32 {
   return data;
 }
 
-auto RDP::writeWord(u32 address, u32 data_) -> void {
+auto RDP::writeWord(u32 address, u32 data_, u32& cycles) -> void {
   address = (address & 0xfffff) >> 2;
   n32 data = data_;
 
@@ -113,7 +113,7 @@ auto RDP::writeWord(u32 address, u32 data_) -> void {
   debugger.ioDPC(Write, address, data);
 }
 
-auto RDP::IO::readWord(u32 address) -> u32 {
+auto RDP::IO::readWord(u32 address, u32& cycles) -> u32 {
   address = (address & 0xfffff) >> 2;
   n32 data;
 
@@ -144,7 +144,7 @@ auto RDP::IO::readWord(u32 address) -> u32 {
   return data;
 }
 
-auto RDP::IO::writeWord(u32 address, u32 data_) -> void {
+auto RDP::IO::writeWord(u32 address, u32 data_, u32& cycles) -> void {
   address = (address & 0xfffff) >> 2;
   n32 data = data_;
 
diff --git a/waterbox/ares64/ares/ares/n64/rdp/rdp.hpp b/waterbox/ares64/ares/ares/n64/rdp/rdp.hpp
index 085e58292c..350cc7f3ab 100644
--- a/waterbox/ares64/ares/ares/n64/rdp/rdp.hpp
+++ b/waterbox/ares64/ares/ares/n64/rdp/rdp.hpp
@@ -1,6 +1,6 @@
 //Reality Display Processor
 
-struct RDP : Thread, Memory::IO<RDP> {
+struct RDP : Thread, Memory::RCP<RDP> {
   Node::Object node;
 
   struct Debugger {
@@ -66,8 +66,8 @@ struct RDP : Thread, Memory::IO<RDP> {
   auto setColorImage() -> void;
 
   //io.cpp
-  auto readWord(u32 address) -> u32;
-  auto writeWord(u32 address, u32 data) -> void;
+  auto readWord(u32 address, u32& cycles) -> u32;
+  auto writeWord(u32 address, u32 data, u32& cycles) -> void;
   auto flushCommands() -> void;
 
   //serialization.cpp
@@ -333,13 +333,13 @@ struct RDP : Thread, Memory::IO<RDP> {
     } x, y;
   } fillRectangle_;
 
-  struct IO : Memory::IO<IO> {
+  struct IO : Memory::RCP<IO> {
     RDP& self;
     IO(RDP& self) : self(self) {}
 
     //io.cpp
-    auto readWord(u32 address) -> u32;
-    auto writeWord(u32 address, u32 data) -> void;
+    auto readWord(u32 address, u32& cycles) -> u32;
+    auto writeWord(u32 address, u32 data, u32& cycles) -> void;
 
     struct BIST {
       n1 check;
diff --git a/waterbox/ares64/ares/ares/n64/rdram/io.cpp b/waterbox/ares64/ares/ares/n64/rdram/io.cpp
index d5dd561629..3fd6f98e25 100644
--- a/waterbox/ares64/ares/ares/n64/rdram/io.cpp
+++ b/waterbox/ares64/ares/ares/n64/rdram/io.cpp
@@ -1,4 +1,4 @@
-auto RDRAM::readWord(u32 address) -> u32 {
+auto RDRAM::readWord(u32 address, u32& cycles) -> u32 {
   u32 chipID = address >> 13 & 3;
   auto& chip = chips[chipID];
   address = (address & 0x3ff) >> 2;
@@ -63,7 +63,7 @@ auto RDRAM::readWord(u32 address) -> u32 {
   return data;
 }
 
-auto RDRAM::writeWord(u32 address, u32 data) -> void {
+auto RDRAM::writeWord(u32 address, u32 data, u32& cycles) -> void {
   u32 chipID = address >> 13 & 3;
   auto& chip = chips[chipID];
   address = (address & 0x3ff) >> 2;
diff --git a/waterbox/ares64/ares/ares/n64/rdram/rdram.hpp b/waterbox/ares64/ares/ares/n64/rdram/rdram.hpp
index 57cb39ba09..40ee05ece6 100644
--- a/waterbox/ares64/ares/ares/n64/rdram/rdram.hpp
+++ b/waterbox/ares64/ares/ares/n64/rdram/rdram.hpp
@@ -1,6 +1,6 @@
 //RAMBUS RAM
 
-struct RDRAM : Memory::IO<RDRAM> {
+struct RDRAM : Memory::RCP<RDRAM> {
   Node::Object node;
   Memory::Writable ram;
 
@@ -24,8 +24,8 @@ struct RDRAM : Memory::IO<RDRAM> {
   auto power(bool reset) -> void;
 
   //io.cpp
-  auto readWord(u32 address) -> u32;
-  auto writeWord(u32 address, u32 data) -> void;
+  auto readWord(u32 address, u32& cycles) -> u32;
+  auto writeWord(u32 address, u32 data, u32& cycles) -> void;
 
   //serialization.cpp
   auto serialize(serializer&) -> void;
diff --git a/waterbox/ares64/ares/ares/n64/ri/io.cpp b/waterbox/ares64/ares/ares/n64/ri/io.cpp
index 74cb89f8fd..473415b0e6 100644
--- a/waterbox/ares64/ares/ares/n64/ri/io.cpp
+++ b/waterbox/ares64/ares/ares/n64/ri/io.cpp
@@ -1,4 +1,4 @@
-auto RI::readWord(u32 address) -> u32 {
+auto RI::readWord(u32 address, u32& cycles) -> u32 {
   address = (address & 0xfffff) >> 2;
   n32 data = 0;
 
@@ -58,7 +58,7 @@ auto RI::readWord(u32 address) -> u32 {
   return data;
 }
 
-auto RI::writeWord(u32 address, u32 data_) -> void {
+auto RI::writeWord(u32 address, u32 data_, u32& cycles) -> void {
   address = (address & 0xfffff) >> 2;
   n32 data = data_;
 
diff --git a/waterbox/ares64/ares/ares/n64/ri/ri.hpp b/waterbox/ares64/ares/ares/n64/ri/ri.hpp
index 9c13617d0d..e8bfbd4c4e 100644
--- a/waterbox/ares64/ares/ares/n64/ri/ri.hpp
+++ b/waterbox/ares64/ares/ares/n64/ri/ri.hpp
@@ -1,6 +1,6 @@
 //RDRAM Interface
 
-struct RI : Memory::IO<RI> {
+struct RI : Memory::RCP<RI> {
   Node::Object node;
 
   struct Debugger {
@@ -19,8 +19,8 @@ struct RI : Memory::IO<RI> {
   auto power(bool reset) -> void;
 
   //io.cpp
-  auto readWord(u32 address) -> u32;
-  auto writeWord(u32 address, u32 data) -> void;
+  auto readWord(u32 address, u32& cycles) -> u32;
+  auto writeWord(u32 address, u32 data, u32& cycles) -> void;
 
   //serialization.cpp
   auto serialize(serializer&) -> void;
diff --git a/waterbox/ares64/ares/ares/n64/rsp/disassembler.cpp b/waterbox/ares64/ares/ares/n64/rsp/disassembler.cpp
index 0695e34217..c6b45276c6 100644
--- a/waterbox/ares64/ares/ares/n64/rsp/disassembler.cpp
+++ b/waterbox/ares64/ares/ares/n64/rsp/disassembler.cpp
@@ -490,10 +490,10 @@ auto RSP::Disassembler::sccRegisterName(u32 index) const -> string {
 }
 
 auto RSP::Disassembler::sccRegisterValue(u32 index) const -> string {
-  u32 value = 0;
-  if(index <= 6) value = rsp.readWord((index & 7) << 2);
+  u32 value = 0; u32 cycles = 0;  //cycles only absorbs readWord()'s timing parameter; zeroed so callees never read an indeterminate value
+  if(index <= 6) value = rsp.readWord((index & 7) << 2, cycles);
   if(index == 7) value = self.status.semaphore;  //rsp.readSCC(7) has side-effects
-  if(index >= 8) value = rdp.readWord((index & 7) << 2);
+  if(index >= 8) value = rdp.readWord((index & 7) << 2, cycles);
   if(showValues) return {sccRegisterName(index), hint("{$", hex(value, 8L), "}")};
   return sccRegisterName(index);
 }
@@ -525,6 +525,6 @@ auto RSP::Disassembler::ccrRegisterValue(u32 index) const -> string {
 
 template<typename... P>
 auto RSP::Disassembler::hint(P&&... p) const -> string {
-  if(showColors) return {"\e[0m\e[37m", std::forward<P>(p)..., "\e[0m"};
+  if(showColors) return {terminal::csi, "0m", terminal::csi, "37m", std::forward<P>(p)..., terminal::csi, "0m"};
   return {std::forward<P>(p)...};
 }
diff --git a/waterbox/ares64/ares/ares/n64/rsp/interpreter-scc.cpp b/waterbox/ares64/ares/ares/n64/rsp/interpreter-scc.cpp
index 5cea79fdfd..8056d3e6fe 100644
--- a/waterbox/ares64/ares/ares/n64/rsp/interpreter-scc.cpp
+++ b/waterbox/ares64/ares/ares/n64/rsp/interpreter-scc.cpp
@@ -1,9 +1,13 @@
 auto RSP::MFC0(r32& rt, u8 rd) -> void {
+  u32 cycles = 0;  //handed to rdp.readWord() as its cycles parameter; stays 0 on the rsp.ioRead() path
   if((rd & 8) == 0) rt.u32 = Nintendo64::rsp.ioRead  ((rd & 7) << 2);
-  if((rd & 8) != 0) rt.u32 = Nintendo64::rdp.readWord((rd & 7) << 2);
+  if((rd & 8) != 0) rt.u32 = Nintendo64::rdp.readWord((rd & 7) << 2, cycles);
+  step(cycles);  //advance by whatever the register read added
 }
 
 auto RSP::MTC0(cr32& rt, u8 rd) -> void {
+  u32 cycles = 0;  //handed to rdp.writeWord() as its cycles parameter; stays 0 on the rsp.ioWrite() path
   if((rd & 8) == 0) Nintendo64::rsp.ioWrite  ((rd & 7) << 2, rt.u32);
-  if((rd & 8) != 0) Nintendo64::rdp.writeWord((rd & 7) << 2, rt.u32);
+  if((rd & 8) != 0) Nintendo64::rdp.writeWord((rd & 7) << 2, rt.u32, cycles);
+  step(cycles);  //advance by whatever the register write added
 }
diff --git a/waterbox/ares64/ares/ares/n64/rsp/interpreter-vpu.cpp b/waterbox/ares64/ares/ares/n64/rsp/interpreter-vpu.cpp
index 80a2cdb7cb..24cdf0b2ca 100644
--- a/waterbox/ares64/ares/ares/n64/rsp/interpreter-vpu.cpp
+++ b/waterbox/ares64/ares/ares/n64/rsp/interpreter-vpu.cpp
@@ -11,6 +11,23 @@
 #define DIVOUT vpu.divout
 #define DIVDP  vpu.divdp
 
+static auto countLeadingZeros(u32 value) -> u32 {  //returns how many zero bits sit above the highest set bit of a 32-bit value
+  assert(value);  //callers must pass a non-zero value
+#if defined(COMPILER_MICROSOFT)
+  unsigned long index;
+  _BitScanReverse(&index, value);  //index receives the bit position of the highest set bit
+  return index ^ 31;  //for index in 0..31 this equals 31 - index
+#elif __has_builtin(__builtin_clz)
+  return __builtin_clz(value);
+#else
+  s32 index;
+  for(index = 31; index >= 0; --index) {  //portable fallback: scan downwards from bit 31 until a set bit is found
+    if(value >> index & 1) break;
+  }
+  return 31 - index;  //distance of that bit from bit 31
+#endif
+}
+
 auto RSP::r128::operator()(u32 index) const -> r128 {
   if constexpr(Accuracy::RSP::SISD) {
     r128 v{*this};
@@ -60,7 +77,9 @@ auto RSP::r128::operator()(u32 index) const -> r128 {
       _mm_set_epi8( 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0),  //77777777
     };
     //todo: benchmark to see if testing for cases 0&1 to return value directly is faster
-    return {uint128_t(_mm_shuffle_epi8(v128, shuffle[index]))};
+    r128 v;
+    v = _mm_shuffle_epi8(v128, shuffle[index]);
+    return v;
     #endif
   }
 }
@@ -132,8 +151,8 @@ auto RSP::CTC2(cr32& rt, u8 rd) -> void {
   if constexpr(Accuracy::RSP::SIMD) {
     #if ARCHITECTURE_SUPPORTS_SSE4_1
     static const v128 mask = _mm_set_epi16(0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080);
-    lo->v128 = _mm_cmpeq_epi8(_mm_and_si128(_mm_shuffle_epi8(r128{~rt.u32 >> 0}, zero), mask), zero);
-    hi->v128 = _mm_cmpeq_epi8(_mm_and_si128(_mm_shuffle_epi8(r128{~rt.u32 >> 8}, zero), mask), zero);
+    lo->v128 = _mm_cmpeq_epi8(_mm_and_si128(_mm_set1_epi8(~rt.u32 >> 0), mask), zero);
+    hi->v128 = _mm_cmpeq_epi8(_mm_and_si128(_mm_set1_epi8(~rt.u32 >> 8), mask), zero);
     #endif
   }
 }
@@ -1321,7 +1340,7 @@ auto RSP::VRCP(r128& vd, u8 de, cr128& vt) -> void {
   } else if(input == -32768) {
     result = 0xffff'0000;
   } else {
-    u32 shift = __builtin_clz(data);
+    u32 shift = countLeadingZeros(data);
     u32 index = (u64(data) << shift & 0x7fc0'0000) >> 22;
     result = reciprocals[index];
     result = (0x10000 | result) << 14;
@@ -1373,7 +1392,7 @@ auto RSP::VRSQ(r128& vd, u8 de, cr128& vt) -> void {
   } else if(input == -32768) {
     result = 0xffff'0000;
   } else {
-    u32 shift = __builtin_clz(data);
+    u32 shift = countLeadingZeros(data);
     u32 index = (u64(data) << shift & 0x7fc0'0000) >> 22;
     result = inverseSquareRoots[index & 0x1fe | shift & 1];
     result = (0x10000 | result) << 14;
diff --git a/waterbox/ares64/ares/ares/n64/rsp/interpreter.cpp b/waterbox/ares64/ares/ares/n64/rsp/interpreter.cpp
index e766fb9e36..cb868882e0 100644
--- a/waterbox/ares64/ares/ares/n64/rsp/interpreter.cpp
+++ b/waterbox/ares64/ares/ares/n64/rsp/interpreter.cpp
@@ -27,7 +27,8 @@
   case 0xd: return name<0xd>(__VA_ARGS__); \
   case 0xe: return name<0xe>(__VA_ARGS__); \
   case 0xf: return name<0xf>(__VA_ARGS__); \
-  }
+  } \
+  unreachable;
 
 #define SA     (OP >>  6 & 31)
 #define RDn    (OP >> 11 & 31)
diff --git a/waterbox/ares64/ares/ares/n64/rsp/io.cpp b/waterbox/ares64/ares/ares/n64/rsp/io.cpp
index d4c1cd05d5..4b48f9bb78 100644
--- a/waterbox/ares64/ares/ares/n64/rsp/io.cpp
+++ b/waterbox/ares64/ares/ares/n64/rsp/io.cpp
@@ -1,4 +1,4 @@
-auto RSP::readWord(u32 address) -> u32 {
+auto RSP::readWord(u32 address, u32& cycles) -> u32 {
   if(address <= 0x0403'ffff) {
     if(address & 0x1000) return imem.read<Word>(address);
     else                 return dmem.read<Word>(address);
@@ -67,7 +67,7 @@ auto RSP::ioRead(u32 address) -> u32 {
   return data;
 }
 
-auto RSP::writeWord(u32 address, u32 data) -> void {
+auto RSP::writeWord(u32 address, u32 data, u32& cycles) -> void {
   if(address <= 0x0403'ffff) {
     if(address & 0x1000) return recompiler.invalidate(address & 0xfff), imem.write<Word>(address, data);
     else                 return dmem.write<Word>(address, data);
@@ -156,7 +156,7 @@ auto RSP::ioWrite(u32 address, u32 data_) -> void {
   debugger.ioSCC(Write, address, data);
 }
 
-auto RSP::Status::readWord(u32 address) -> u32 {
+auto RSP::Status::readWord(u32 address, u32& cycles) -> u32 {
   address = (address & 0x7ffff) >> 2;
   n32 data;
 
@@ -177,7 +177,7 @@ auto RSP::Status::readWord(u32 address) -> u32 {
   return data;
 }
 
-auto RSP::Status::writeWord(u32 address, u32 data_) -> void {
+auto RSP::Status::writeWord(u32 address, u32 data_, u32& cycles) -> void {
   address = (address & 0x7ffff) >> 2;
   n32 data = data_;
 
diff --git a/waterbox/ares64/ares/ares/n64/rsp/recompiler.cpp b/waterbox/ares64/ares/ares/n64/rsp/recompiler.cpp
index 7f867792f8..e10a886f8a 100644
--- a/waterbox/ares64/ares/ares/n64/rsp/recompiler.cpp
+++ b/waterbox/ares64/ares/ares/n64/rsp/recompiler.cpp
@@ -188,7 +188,7 @@ auto RSP::Recompiler::emitEXECUTE(u32 instruction) -> bool {
   }
 
   //ADDIU Rt,Rs,i16
-  case 0x08 ... 0x09: {
+  case range2(0x08, 0x09): {
     add32(mem(Rt), mem(Rs), imm(i16));
     return 0;
   }
@@ -247,7 +247,7 @@ auto RSP::Recompiler::emitEXECUTE(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x13 ... 0x1f: {
+  case range13(0x13, 0x1f): {
     return 0;
   }
 
@@ -348,7 +348,7 @@ auto RSP::Recompiler::emitEXECUTE(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x2c ... 0x31: {
+  case range6(0x2c, 0x31): {
     return 0;
   }
 
@@ -358,7 +358,7 @@ auto RSP::Recompiler::emitEXECUTE(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x33 ... 0x39: {
+  case range7(0x33, 0x39): {
     return 0;
   }
 
@@ -368,7 +368,7 @@ auto RSP::Recompiler::emitEXECUTE(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x3b ... 0x3f: {
+  case range5(0x3b, 0x3f): {
     return 0;
   }
 
@@ -405,8 +405,7 @@ auto RSP::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //SLLV Rd,Rt,Rs
   case 0x04: {
-    and32(reg(0), mem(Rs), imm(31));
-    shl32(mem(Rd), mem(Rt), reg(0));
+    mshl32(mem(Rd), mem(Rt), mem(Rs));
     return 0;
   }
 
@@ -417,15 +416,13 @@ auto RSP::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //SRLV Rd,Rt,Rs
   case 0x06: {
-    and32(reg(0), mem(Rs), imm(31));
-    lshr32(mem(Rd), mem(Rt), reg(0));
+    mlshr32(mem(Rd), mem(Rt), mem(Rs));
     return 0;
   }
 
   //SRAV Rd,Rt,Rs
   case 0x07: {
-    and32(reg(0), mem(Rs), imm(31));
-    ashr32(mem(Rd), mem(Rt), reg(0));
+    mashr32(mem(Rd), mem(Rt), mem(Rs));
     return 0;
   }
 
@@ -445,7 +442,7 @@ auto RSP::Recompiler::emitSPECIAL(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x0a ... 0x0c: {
+  case range3(0x0a, 0x0c): {
     return 0;
   }
 
@@ -456,18 +453,18 @@ auto RSP::Recompiler::emitSPECIAL(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x0e ... 0x1f: {
+  case range18(0x0e, 0x1f): {
     return 0;
   }
 
   //ADDU Rd,Rs,Rt
-  case 0x20 ... 0x21: {
+  case range2(0x20, 0x21): {
     add32(mem(Rd), mem(Rs), mem(Rt));
     return 0;
   }
 
   //SUBU Rd,Rs,Rt
-  case 0x22 ... 0x23: {
+  case range2(0x22, 0x23): {
     sub32(mem(Rd), mem(Rs), mem(Rt));
     return 0;
   }
@@ -493,13 +490,13 @@ auto RSP::Recompiler::emitSPECIAL(u32 instruction) -> bool {
   //NOR Rd,Rs,Rt
   case 0x27: {
     or32(reg(0), mem(Rs), mem(Rt));
-    not32(reg(0), reg(0));
+    xor32(reg(0), reg(0), imm(-1));
     mov32(mem(Rd), reg(0));
     return 0;
   }
 
   //INVALID
-  case 0x28 ... 0x29: {
+  case range2(0x28, 0x29): {
     return 0;
   }
 
@@ -518,7 +515,7 @@ auto RSP::Recompiler::emitSPECIAL(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x2c ... 0x3f: {
+  case range20(0x2c, 0x3f): {
     return 0;
   }
 
@@ -547,7 +544,7 @@ auto RSP::Recompiler::emitREGIMM(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x02 ... 0x0f: {
+  case range14(0x02, 0x0f): {
     return 0;
   }
 
@@ -568,7 +565,7 @@ auto RSP::Recompiler::emitREGIMM(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x12 ... 0x1f: {
+  case range14(0x12, 0x1f): {
     return 0;
   }
 
@@ -589,7 +586,7 @@ auto RSP::Recompiler::emitSCC(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x01 ... 0x03: {
+  case range3(0x01, 0x03): {
     return 0;
   }
 
@@ -602,7 +599,7 @@ auto RSP::Recompiler::emitSCC(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x05 ... 0x1f: {
+  case range27(0x05, 0x1f): {
     return 0;
   }
 
@@ -663,7 +660,7 @@ auto RSP::Recompiler::emitVU(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x07 ... 0x0f: {
+  case range9(0x07, 0x0f): {
     return 0;
   }
 
@@ -871,7 +868,7 @@ auto RSP::Recompiler::emitVU(u32 instruction) -> bool {
   }
 
   //Broken opcodes: VADDB, VSUBB, VACCB, VSUCB, VSAD, VSAC, VSUM
-  case 0x16 ... 0x1c: {
+  case range7(0x16, 0x1c): {
     lea(reg(1), Vd);
     lea(reg(2), Vs);
     lea(reg(3), Vt);
@@ -888,7 +885,7 @@ auto RSP::Recompiler::emitVU(u32 instruction) -> bool {
   }
 
   //Invalid opcodes
-  case 0x1e ... 0x1f: {
+  case range2(0x1e, 0x1f): {
     lea(reg(1), Vd);
     lea(reg(2), Vs);
     lea(reg(3), Vt);
@@ -1023,7 +1020,7 @@ auto RSP::Recompiler::emitVU(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x2e ... 0x2f: {
+  case range2(0x2e, 0x2f): {
     lea(reg(1), Vd);
     lea(reg(2), Vs);
     lea(reg(3), Vt);
@@ -1101,7 +1098,7 @@ auto RSP::Recompiler::emitVU(u32 instruction) -> bool {
   }
 
   //Broken opcodes: VEXTT, VEXTQ, VEXTN
-  case 0x38 ... 0x3a: {
+  case range3(0x38, 0x3a): {
     lea(reg(1), Vd);
     lea(reg(2), Vs);
     lea(reg(3), Vt);
@@ -1119,7 +1116,7 @@ auto RSP::Recompiler::emitVU(u32 instruction) -> bool {
   }
 
   //Broken opcodes: VINST, VINSQ, VINSN
-  case 0x3c ... 0x3e: {
+  case range3(0x3c, 0x3e): {
     lea(reg(1), Vd);
     lea(reg(2), Vs);
     lea(reg(3), Vt);
@@ -1250,7 +1247,7 @@ auto RSP::Recompiler::emitLWC2(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x0c ... 0x1f: {
+  case range20(0x0c, 0x1f): {
     return 0;
   }
 
@@ -1375,7 +1372,7 @@ auto RSP::Recompiler::emitSWC2(u32 instruction) -> bool {
   }
 
   //INVALID
-  case 0x0c ... 0x1f: {
+  case range20(0x0c, 0x1f): {
     return 0;
   }
 
diff --git a/waterbox/ares64/ares/ares/n64/rsp/rsp.cpp b/waterbox/ares64/ares/ares/n64/rsp/rsp.cpp
index 48753112c6..76598cf55d 100644
--- a/waterbox/ares64/ares/ares/n64/rsp/rsp.cpp
+++ b/waterbox/ares64/ares/ares/n64/rsp/rsp.cpp
@@ -86,15 +86,15 @@ auto RSP::power(bool reset) -> void {
   for(auto& r : ipu.r) r.u32 = 0;
   ipu.pc = 0;
   branch = {};
-  for(auto& r : vpu.r) r.u128 = 0;
-  vpu.acch.u128 = 0;
-  vpu.accm.u128 = 0;
-  vpu.accl.u128 = 0;
-  vpu.vcoh.u128 = 0;
-  vpu.vcol.u128 = 0;
-  vpu.vcch.u128 = 0;
-  vpu.vccl.u128 = 0;
-  vpu.vce.u128 = 0;
+  for(auto& r : vpu.r) r = zero;
+  vpu.acch = zero;
+  vpu.accm = zero;
+  vpu.accl = zero;
+  vpu.vcoh = zero;
+  vpu.vcol = zero;
+  vpu.vcch = zero;
+  vpu.vccl = zero;
+  vpu.vce = zero;
   vpu.divin = 0;
   vpu.divout = 0;
   vpu.divdp = 0;
@@ -115,8 +115,8 @@ auto RSP::power(bool reset) -> void {
   }
 
   if constexpr(Accuracy::RSP::Recompiler) {
-    auto buffer = ares::Memory::FixedAllocator::get().tryAcquire(4_MiB);
-    recompiler.allocator.resize(4_MiB, bump_allocator::executable | bump_allocator::zero_fill, buffer);
+    auto buffer = ares::Memory::FixedAllocator::get().tryAcquire(64_MiB);
+    recompiler.allocator.resize(64_MiB, bump_allocator::executable | bump_allocator::zero_fill, buffer);
     recompiler.reset();
   }
 
diff --git a/waterbox/ares64/ares/ares/n64/rsp/rsp.hpp b/waterbox/ares64/ares/ares/n64/rsp/rsp.hpp
index 0e21ee255f..8cda7862a9 100644
--- a/waterbox/ares64/ares/ares/n64/rsp/rsp.hpp
+++ b/waterbox/ares64/ares/ares/n64/rsp/rsp.hpp
@@ -1,6 +1,6 @@
 //Reality Signal Processor
 
-struct RSP : Thread, Memory::IO<RSP> {
+struct RSP : Thread, Memory::RCP<RSP> {
   Node::Object node;
   Memory::Writable dmem;
   Memory::Writable imem;
@@ -47,8 +47,8 @@ struct RSP : Thread, Memory::IO<RSP> {
   auto dmaTransferStep() -> void;
 
   //io.cpp
-  auto readWord(u32 address) -> u32;
-  auto writeWord(u32 address, u32 data) -> void;
+  auto readWord(u32 address, u32& cycles) -> u32;
+  auto writeWord(u32 address, u32 data, u32& cycles) -> void;
   auto ioRead(u32 address) -> u32;
   auto ioWrite(u32 address, u32 data) -> void;
 
@@ -75,13 +75,13 @@ struct RSP : Thread, Memory::IO<RSP> {
     } busy, full;
   } dma;
 
-  struct Status : Memory::IO<Status> {
+  struct Status : Memory::RCP<Status> {
     RSP& self;
     Status(RSP& self) : self(self) {}
 
     //io.cpp
-    auto readWord(u32 address) -> u32;
-    auto writeWord(u32 address, u32 data) -> void;
+    auto readWord(u32 address, u32& cycles) -> u32;
+    auto writeWord(u32 address, u32 data, u32& cycles) -> void;
 
     n1 semaphore;
     n1 halted = 1;
@@ -174,7 +174,7 @@ struct RSP : Thread, Memory::IO<RSP> {
 
   //vpu.cpp: Vector Processing Unit
   union r128 {
-    struct { uint128_t u128; };
+    struct { u64 order_msb2(hi, lo); } u128;
 #if ARCHITECTURE_SUPPORTS_SSE4_1
     struct {   __m128i v128; };
 
@@ -203,6 +203,9 @@ struct RSP : Thread, Memory::IO<RSP> {
 
     //vu-registers.cpp
     auto operator()(u32 index) const -> r128;
+
+    //serialization.cpp
+    auto serialize(serializer&) -> void;
   };
   using cr128 = const r128;
 
@@ -217,8 +220,8 @@ struct RSP : Thread, Memory::IO<RSP> {
     bool divdp;
   } vpu;
 
-  static constexpr r128 zero{0};
-  static constexpr r128 invert{u128(0) - 1};
+  static constexpr r128 zero{0ull, 0ull};
+  static constexpr r128 invert{~0ull, ~0ull};
 
   auto accumulatorGet(u32 index) const -> u64;
   auto accumulatorSet(u32 index, u64 value) -> void;
diff --git a/waterbox/ares64/ares/ares/n64/rsp/serialization.cpp b/waterbox/ares64/ares/ares/n64/rsp/serialization.cpp
index 8224a1513f..3da6b71fc7 100644
--- a/waterbox/ares64/ares/ares/n64/rsp/serialization.cpp
+++ b/waterbox/ares64/ares/ares/n64/rsp/serialization.cpp
@@ -27,15 +27,15 @@ auto RSP::serialize(serializer& s) -> void {
   s(branch.pc);
   s(branch.state);
 
-  for(auto& r : vpu.r) s(r.u128);
-  s(vpu.acch.u128);
-  s(vpu.accm.u128);
-  s(vpu.accl.u128);
-  s(vpu.vcoh.u128);
-  s(vpu.vcol.u128);
-  s(vpu.vcch.u128);
-  s(vpu.vccl.u128);
-  s(vpu.vce.u128);
+  for(auto& r : vpu.r) s(r);
+  s(vpu.acch);
+  s(vpu.accm);
+  s(vpu.accl);
+  s(vpu.vcoh);
+  s(vpu.vcol);
+  s(vpu.vcch);
+  s(vpu.vccl);
+  s(vpu.vce);
   s(vpu.divin);
   s(vpu.divout);
   s(vpu.divdp);
@@ -53,3 +53,8 @@ auto RSP::DMA::Regs::serialize(serializer& s) -> void {
   s(skip);
   s(count);
 }
+
+auto RSP::r128::serialize(serializer& s) -> void {  //serializes the 128-bit register as its two u64 halves
+  s(u128.lo);  //low half is written first, so the stream order is lo then hi
+  s(u128.hi);
+}
diff --git a/waterbox/ares64/ares/ares/n64/si/dma.cpp b/waterbox/ares64/ares/ares/n64/si/dma.cpp
index 142adf176c..35f13aeb2a 100644
--- a/waterbox/ares64/ares/ares/n64/si/dma.cpp
+++ b/waterbox/ares64/ares/ares/n64/si/dma.cpp
@@ -1,21 +1,13 @@
 auto SI::dmaRead() -> void {
-  pif.run();
-  for(u32 offset = 0; offset < 64; offset += 4) {
-    u32 data = pif.readWord(io.readAddress + offset);
-    rdram.ram.write<Word>(io.dramAddress + offset, data);
-  }
+  pif.dmaRead(io.readAddress, io.dramAddress);  //the transfer is delegated to the PIF, given the PIF-side read address and the RDRAM address
   io.dmaBusy = 0;
   io.interrupt = 1;
   mi.raise(MI::IRQ::SI);
 }
 
 auto SI::dmaWrite() -> void {
-  for(u32 offset = 0; offset < 64; offset += 4) {
-    u32 data = rdram.ram.read<Word>(io.dramAddress + offset);
-    pif.writeWord(io.writeAddress + offset, data);
-  }
+  pif.dmaWrite(io.writeAddress, io.dramAddress);  //the transfer is delegated to the PIF, given the PIF-side write address and the RDRAM address
   io.dmaBusy = 0;
   io.interrupt = 1;
   mi.raise(MI::IRQ::SI);
-  pif.run();
 }
diff --git a/waterbox/ares64/ares/ares/n64/si/io.cpp b/waterbox/ares64/ares/ares/n64/si/io.cpp
index 30174e8971..4079edad18 100644
--- a/waterbox/ares64/ares/ares/n64/si/io.cpp
+++ b/waterbox/ares64/ares/ares/n64/si/io.cpp
@@ -1,4 +1,14 @@
-auto SI::readWord(u32 address) -> u32 {
+auto SI::readWord(u32 address, u32& cycles) -> u32 {  //bus read entry point: low addresses go to the SI registers, the rest to the PIF; cycles is not modified here
+  if(address <= 0x048f'ffff) return ioRead(address);  //register range is handled by ioRead
+
+  if (unlikely(io.ioBusy)) {  //a bus write started by writeWord is still pending
+    writeForceFinish(); //technically, we should wait until Queue::SI_BUS_Write
+    return io.busLatch;  //answer with the value latched by that pending write instead of reading the PIF
+  }
+  return pif.read<Word>(address);
+}
+
+auto SI::ioRead(u32 address) -> u32 {
   address = (address & 0xfffff) >> 2;
   n32 data;
 
@@ -44,7 +54,17 @@ auto SI::readWord(u32 address) -> u32 {
   return data;
 }
 
-auto SI::writeWord(u32 address, u32 data_) -> void {
+auto SI::writeWord(u32 address, u32 data, u32& cycles) -> void {  //bus write entry point: low addresses go to the SI registers, the rest to the PIF; cycles is not modified here
+  if(address <= 0x048f'ffff) return ioWrite(address, data);  //register range is handled by ioWrite
+
+  if(io.ioBusy) return;  //a write issued while an earlier one is still pending is dropped
+  io.ioBusy = 1;
+  io.busLatch = data;  //readWord returns this value while the write is pending
+  queue.insert(Queue::SI_BUS_Write, 2150*3);  //NOTE(review): presumably the event that later clears ioBusy via writeFinished -- confirm in the queue dispatcher
+  return pif.write<Word>(address, data);
+}
+
+auto SI::ioWrite(u32 address, u32 data_) -> void {
   address = (address & 0xfffff) >> 2;
   n32 data = data_;
 
@@ -57,7 +77,8 @@ auto SI::writeWord(u32 address, u32 data_) -> void {
     //SI_PIF_ADDRESS_READ64B
     io.readAddress = data.bit(0,31) & ~1;
     io.dmaBusy = 1;
-    queue.insert(Queue::SI_DMA_Read, 2304);
+    int cycles = pif.estimateTiming();
+    queue.insert(Queue::SI_DMA_Read, cycles*3);
   }
 
   if(address == 2) {
@@ -72,7 +93,7 @@ auto SI::writeWord(u32 address, u32 data_) -> void {
     //SI_PIF_ADDRESS_WRITE64B
     io.writeAddress = data.bit(0,31) & ~1;
     io.dmaBusy = 1;
-    queue.insert(Queue::SI_DMA_Write, 2304);
+    queue.insert(Queue::SI_DMA_Write, 4065*3);
   }
 
   if(address == 5) {
@@ -87,3 +108,12 @@ auto SI::writeWord(u32 address, u32 data_) -> void {
 
   debugger.io(Write, address, data);
 }
+
+auto SI::writeFinished() -> void {  //clears the busy flag that writeWord sets for a pending bus write
+  io.ioBusy = 0;
+}
+
+auto SI::writeForceFinish() -> void {  //ends a pending bus write immediately instead of waiting for its queued event
+  io.ioBusy = 0;
+  queue.remove(Queue::SI_BUS_Write);  //drop the event writeWord queued, since the busy flag is already cleared
+}
diff --git a/waterbox/ares64/ares/ares/n64/si/serialization.cpp b/waterbox/ares64/ares/ares/n64/si/serialization.cpp
index e692fece3d..531c92eb31 100644
--- a/waterbox/ares64/ares/ares/n64/si/serialization.cpp
+++ b/waterbox/ares64/ares/ares/n64/si/serialization.cpp
@@ -2,6 +2,7 @@ auto SI::serialize(serializer& s) -> void {
   s(io.dramAddress);
   s(io.readAddress);
   s(io.writeAddress);
+  s(io.busLatch);
   s(io.dmaBusy);
   s(io.ioBusy);
   s(io.readPending);
diff --git a/waterbox/ares64/ares/ares/n64/si/si.cpp b/waterbox/ares64/ares/ares/n64/si/si.cpp
index 01a3722787..0b02d855e9 100644
--- a/waterbox/ares64/ares/ares/n64/si/si.cpp
+++ b/waterbox/ares64/ares/ares/n64/si/si.cpp
@@ -11,14 +11,6 @@ SI si;
 auto SI::load(Node::Object parent) -> void {
   node = parent->append<Node::Object>("SI");
   debugger.load(node);
-
-/*if(auto fp = system.pak->read("pif.sm5.rom")) {
-    //load 1KB ROM and mirror it to 4KB
-    fp->read({SM5K::ROM, 1024});
-    memory::copy(&SM5K::ROM[1024], &SM5K::ROM[0], 1024);
-    memory::copy(&SM5K::ROM[2048], &SM5K::ROM[0], 1024);
-    memory::copy(&SM5K::ROM[3072], &SM5K::ROM[0], 1024);
-  }*/
 }
 
 auto SI::unload() -> void {
diff --git a/waterbox/ares64/ares/ares/n64/si/si.hpp b/waterbox/ares64/ares/ares/n64/si/si.hpp
index af4c28c532..de88dc404f 100644
--- a/waterbox/ares64/ares/ares/n64/si/si.hpp
+++ b/waterbox/ares64/ares/ares/n64/si/si.hpp
@@ -1,6 +1,6 @@
 //Serial Interface
 
-struct SI : Memory::IO<SI> {
+struct SI : Memory::RCP<SI> {
   Node::Object node;
 
   struct Debugger {
@@ -23,8 +23,12 @@ struct SI : Memory::IO<SI> {
   auto dmaWrite() -> void;
 
   //io.cpp
-  auto readWord(u32 address) -> u32;
-  auto writeWord(u32 address, u32 data) -> void;
+  auto ioRead(u32 address) -> u32;
+  auto ioWrite(u32 address, u32 data) -> void;
+  auto readWord(u32 address, u32& cycles) -> u32;
+  auto writeWord(u32 address, u32 data, u32& cycles) -> void;
+  auto writeFinished() -> void;
+  auto writeForceFinish() -> void;
 
   //serialization.cpp
   auto serialize(serializer&) -> void;
@@ -33,6 +37,7 @@ struct SI : Memory::IO<SI> {
     n24 dramAddress;
     n32 readAddress;
     n32 writeAddress;
+    u32 busLatch;
     n1  dmaBusy;
     n1  ioBusy;
     n1  readPending;
diff --git a/waterbox/ares64/ares/ares/n64/system/serialization.cpp b/waterbox/ares64/ares/ares/n64/system/serialization.cpp
index 3965d18b5d..b199fca5a2 100644
--- a/waterbox/ares64/ares/ares/n64/system/serialization.cpp
+++ b/waterbox/ares64/ares/ares/n64/system/serialization.cpp
@@ -1,3 +1,5 @@
+static const string SerializerVersion = "v131";
+
 auto System::serialize(bool synchronize) -> serializer {
   serializer s;
 
@@ -47,6 +49,7 @@ auto System::serialize(serializer& s, bool synchronize) -> void {
   s(ai);
   s(pi);
   s(pif);
+  s(cic);
   s(ri);
   s(si);
   s(cpu);
diff --git a/waterbox/ares64/ares/ares/n64/system/system.cpp b/waterbox/ares64/ares/ares/n64/system/system.cpp
index 1b24a5930c..bf249cc733 100644
--- a/waterbox/ares64/ares/ares/n64/system/system.cpp
+++ b/waterbox/ares64/ares/ares/n64/system/system.cpp
@@ -40,7 +40,6 @@ auto System::game() -> string {
 auto System::run() -> void {
   while(!vi.refreshed) cpu.main();
   vi.refreshed = false;
-  if (!pif.io.romLockout) pif.run();
 }
 
 auto System::load(Node::System& root, string name) -> bool {
@@ -120,7 +119,7 @@ auto System::unload() -> void {
 }
 
 auto System::save() -> void {
-/*
+#if false
   if(!node) return;
   cartridge.save();
   controllerPort1.save();
@@ -128,7 +127,7 @@ auto System::save() -> void {
   controllerPort3.save();
   controllerPort4.save();
   if(_DD()) dd.save();
-*/
+#endif
 }
 
 auto System::power(bool reset) -> void {
@@ -146,6 +145,7 @@ auto System::power(bool reset) -> void {
   ai.power(reset);
   pi.power(reset);
   pif.power(reset);
+  cic.power(reset);
   ri.power(reset);
   si.power(reset);
   cpu.power(reset);
diff --git a/waterbox/ares64/ares/ares/n64/vi/io.cpp b/waterbox/ares64/ares/ares/n64/vi/io.cpp
index 3b84814051..3c875743d0 100644
--- a/waterbox/ares64/ares/ares/n64/vi/io.cpp
+++ b/waterbox/ares64/ares/ares/n64/vi/io.cpp
@@ -1,4 +1,4 @@
-auto VI::readWord(u32 address) -> u32 {
+auto VI::readWord(u32 address, u32& cycles) -> u32 {
   address = (address & 0xfffff) >> 2;
   n32 data;
 
@@ -96,7 +96,7 @@ auto VI::readWord(u32 address) -> u32 {
   return data;
 }
 
-auto VI::writeWord(u32 address, u32 data_) -> void {
+auto VI::writeWord(u32 address, u32 data_, u32& cycles) -> void {
   address = (address & 0xfffff) >> 2;
   n32 data = data_;
 
diff --git a/waterbox/ares64/ares/ares/n64/vi/vi.hpp b/waterbox/ares64/ares/ares/n64/vi/vi.hpp
index 2314f3cb6a..672e3967f9 100644
--- a/waterbox/ares64/ares/ares/n64/vi/vi.hpp
+++ b/waterbox/ares64/ares/ares/n64/vi/vi.hpp
@@ -1,6 +1,6 @@
 //Video Interface
 
-struct VI : Thread, Memory::IO<VI> {
+struct VI : Thread, Memory::RCP<VI> {
   Node::Object node;
   Node::Video::Screen screen;
 
@@ -24,8 +24,8 @@ struct VI : Thread, Memory::IO<VI> {
   auto power(bool reset) -> void;
 
   //io.cpp
-  auto readWord(u32 address) -> u32;
-  auto writeWord(u32 address, u32 data) -> void;
+  auto readWord(u32 address, u32& cycles) -> u32;
+  auto writeWord(u32 address, u32 data, u32& cycles) -> void;
 
   //serialization.cpp
   auto serialize(serializer&) -> void;
diff --git a/waterbox/ares64/ares/nall/arguments.hpp b/waterbox/ares64/ares/nall/arguments.hpp
index 7ecb536bff..1bf7043794 100644
--- a/waterbox/ares64/ares/nall/arguments.hpp
+++ b/waterbox/ares64/ares/nall/arguments.hpp
@@ -1,8 +1,10 @@
 #pragma once
 
+#include <nall/string.hpp>
+#include <nall/directory.hpp>
+#include <nall/file.hpp>
 #include <nall/location.hpp>
 #include <nall/path.hpp>
-#include <nall/string.hpp>
 #include <nall/vector.hpp>
 
 namespace nall {
diff --git a/waterbox/ares64/ares/nall/bcd.hpp b/waterbox/ares64/ares/nall/bcd.hpp
new file mode 100644
index 0000000000..5d9a92a724
--- /dev/null
+++ b/waterbox/ares64/ares/nall/bcd.hpp
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <typeinfo>
+
+namespace nall {
+
+struct BCD {  //packed binary-coded decimal helpers: one decimal digit per nibble of a byte
+    static auto encode(u8 value) -> u8 { return value / 10 << 4 | value % 10; }  //tens digit in the high nibble, ones digit in the low nibble; inputs above 99 do not yield a valid BCD byte
+    static auto decode(u8 value) -> u8 { return (value >> 4) * 10 + (value & 15); }  //inverse of encode: high nibble times ten plus low nibble
+};
+
+}
diff --git a/waterbox/ares64/ares/nall/case-range.hpp b/waterbox/ares64/ares/nall/case-range.hpp
new file mode 100644
index 0000000000..6f345614f4
--- /dev/null
+++ b/waterbox/ares64/ares/nall/case-range.hpp
@@ -0,0 +1,279 @@
+#pragma once
+
+//GNU case range extension simulated with preprocessor macros
+//
+//  usage                 expands to
+//  ------------------    -----------------------------------
+//  caseN(X):          => case X:
+//                        case X + 1:
+//                        ...
+//                        case X + N - 1:
+//
+//  case rangeN(X, Y): => case X:
+//                        static_assert(Y - X + 1 == N);
+//                        case X + 1:
+//                        ...
+//                        case Y:
+//
+//the range macro more closely resembles the GNU extension syntax at the cost of
+//redundancy. it embeds a static assert between the first two case labels to
+//ensure consistency between the chosen macro and the provided macro arguments.
+
+#define check_case_range(x, y, n) static_assert((y) - (x) + 1 == n, "range does not contain " #n " elements"); [[fallthrough]];  //emitted by the rangeN macros between their first two case labels; NOTE(review): the [[fallthrough]] presumably silences implicit-fallthrough warnings there -- confirm
+
+#define case1(x) case (x)
+#define case2(x) case1(x): case1((x) + 1)
+#define case3(x) case1(x): case2((x) + 1)
+#define case4(x) case2(x): case2((x) + 2)
+#define case5(x) case2(x): case3((x) + 2)
+#define case6(x) case3(x): case3((x) + 3)
+#define case7(x) case3(x): case4((x) + 3)
+#define case8(x) case4(x): case4((x) + 4)
+#define case9(x) case4(x): case5((x) + 4)
+#define case10(x) case5(x): case5((x) + 5)
+#define case11(x) case5(x): case6((x) + 5)
+#define case12(x) case6(x): case6((x) + 6)
+#define case13(x) case6(x): case7((x) + 6)
+#define case14(x) case7(x): case7((x) + 7)
+#define case15(x) case7(x): case8((x) + 7)
+#define case16(x) case8(x): case8((x) + 8)
+#define case17(x) case8(x): case9((x) + 8)
+#define case18(x) case9(x): case9((x) + 9)
+#define case19(x) case9(x): case10((x) + 9)
+#define case20(x) case10(x): case10((x) + 10)
+#define case21(x) case10(x): case11((x) + 10)
+#define case22(x) case11(x): case11((x) + 11)
+#define case23(x) case11(x): case12((x) + 11)
+#define case24(x) case12(x): case12((x) + 12)
+#define case25(x) case12(x): case13((x) + 12)
+#define case26(x) case13(x): case13((x) + 13)
+#define case27(x) case13(x): case14((x) + 13)
+#define case28(x) case14(x): case14((x) + 14)
+#define case29(x) case14(x): case15((x) + 14)
+#define case30(x) case15(x): case15((x) + 15)
+#define case31(x) case15(x): case16((x) + 15)
+#define case32(x) case16(x): case16((x) + 16)
+#define case33(x) case16(x): case17((x) + 16)
+#define case34(x) case17(x): case17((x) + 17)
+#define case35(x) case17(x): case18((x) + 17)
+#define case36(x) case18(x): case18((x) + 18)
+#define case37(x) case18(x): case19((x) + 18)
+#define case38(x) case19(x): case19((x) + 19)
+#define case39(x) case19(x): case20((x) + 19)
+#define case40(x) case20(x): case20((x) + 20)
+#define case41(x) case20(x): case21((x) + 20)
+#define case42(x) case21(x): case21((x) + 21)
+#define case43(x) case21(x): case22((x) + 21)
+#define case44(x) case22(x): case22((x) + 22)
+#define case45(x) case22(x): case23((x) + 22)
+#define case46(x) case23(x): case23((x) + 23)
+#define case47(x) case23(x): case24((x) + 23)
+#define case48(x) case24(x): case24((x) + 24)
+#define case49(x) case24(x): case25((x) + 24)
+#define case50(x) case25(x): case25((x) + 25)
+#define case51(x) case25(x): case26((x) + 25)
+#define case52(x) case26(x): case26((x) + 26)
+#define case53(x) case26(x): case27((x) + 26)
+#define case54(x) case27(x): case27((x) + 27)
+#define case55(x) case27(x): case28((x) + 27)
+#define case56(x) case28(x): case28((x) + 28)
+#define case57(x) case28(x): case29((x) + 28)
+#define case58(x) case29(x): case29((x) + 29)
+#define case59(x) case29(x): case30((x) + 29)
+#define case60(x) case30(x): case30((x) + 30)
+#define case61(x) case30(x): case31((x) + 30)
+#define case62(x) case31(x): case31((x) + 31)
+#define case63(x) case31(x): case32((x) + 31)
+#define case64(x) case32(x): case32((x) + 32)
+#define case65(x) case32(x): case33((x) + 32)
+#define case66(x) case33(x): case33((x) + 33)
+#define case67(x) case33(x): case34((x) + 33)
+#define case68(x) case34(x): case34((x) + 34)
+#define case69(x) case34(x): case35((x) + 34)
+#define case70(x) case35(x): case35((x) + 35)
+#define case71(x) case35(x): case36((x) + 35)
+#define case72(x) case36(x): case36((x) + 36)
+#define case73(x) case36(x): case37((x) + 36)
+#define case74(x) case37(x): case37((x) + 37)
+#define case75(x) case37(x): case38((x) + 37)
+#define case76(x) case38(x): case38((x) + 38)
+#define case77(x) case38(x): case39((x) + 38)
+#define case78(x) case39(x): case39((x) + 39)
+#define case79(x) case39(x): case40((x) + 39)
+#define case80(x) case40(x): case40((x) + 40)
+#define case81(x) case40(x): case41((x) + 40)
+#define case82(x) case41(x): case41((x) + 41)
+#define case83(x) case41(x): case42((x) + 41)
+#define case84(x) case42(x): case42((x) + 42)
+#define case85(x) case42(x): case43((x) + 42)
+#define case86(x) case43(x): case43((x) + 43)
+#define case87(x) case43(x): case44((x) + 43)
+#define case88(x) case44(x): case44((x) + 44)
+#define case89(x) case44(x): case45((x) + 44)
+#define case90(x) case45(x): case45((x) + 45)
+#define case91(x) case45(x): case46((x) + 45)
+#define case92(x) case46(x): case46((x) + 46)
+#define case93(x) case46(x): case47((x) + 46)
+#define case94(x) case47(x): case47((x) + 47)
+#define case95(x) case47(x): case48((x) + 47)
+#define case96(x) case48(x): case48((x) + 48)
+#define case97(x) case48(x): case49((x) + 48)
+#define case98(x) case49(x): case49((x) + 49)
+#define case99(x) case49(x): case50((x) + 49)
+#define case100(x) case50(x): case50((x) + 50)
+#define case101(x) case50(x): case51((x) + 50)
+#define case102(x) case51(x): case51((x) + 51)
+#define case103(x) case51(x): case52((x) + 51)
+#define case104(x) case52(x): case52((x) + 52)
+#define case105(x) case52(x): case53((x) + 52)
+#define case106(x) case53(x): case53((x) + 53)
+#define case107(x) case53(x): case54((x) + 53)
+#define case108(x) case54(x): case54((x) + 54)
+#define case109(x) case54(x): case55((x) + 54)
+#define case110(x) case55(x): case55((x) + 55)
+#define case111(x) case55(x): case56((x) + 55)
+#define case112(x) case56(x): case56((x) + 56)
+#define case113(x) case56(x): case57((x) + 56)
+#define case114(x) case57(x): case57((x) + 57)
+#define case115(x) case57(x): case58((x) + 57)
+#define case116(x) case58(x): case58((x) + 58)
+#define case117(x) case58(x): case59((x) + 58)
+#define case118(x) case59(x): case59((x) + 59)
+#define case119(x) case59(x): case60((x) + 59)
+#define case120(x) case60(x): case60((x) + 60)
+#define case121(x) case60(x): case61((x) + 60)
+#define case122(x) case61(x): case61((x) + 61)
+#define case123(x) case61(x): case62((x) + 61)
+#define case124(x) case62(x): case62((x) + 62)
+#define case125(x) case62(x): case63((x) + 62)
+#define case126(x) case63(x): case63((x) + 63)
+#define case127(x) case63(x): case64((x) + 63)
+#define case128(x) case64(x): case64((x) + 64)
+
+#define range2(x, y) (x): check_case_range(x, y, 2) case1((x) + 1)
+#define range3(x, y) (x): check_case_range(x, y, 3) case2((x) + 1)
+#define range4(x, y) (x): check_case_range(x, y, 4) case3((x) + 1)
+#define range5(x, y) (x): check_case_range(x, y, 5) case4((x) + 1)
+#define range6(x, y) (x): check_case_range(x, y, 6) case5((x) + 1)
+#define range7(x, y) (x): check_case_range(x, y, 7) case6((x) + 1)
+#define range8(x, y) (x): check_case_range(x, y, 8) case7((x) + 1)
+#define range9(x, y) (x): check_case_range(x, y, 9) case8((x) + 1)
+#define range10(x, y) (x): check_case_range(x, y, 10) case9((x) + 1)
+#define range11(x, y) (x): check_case_range(x, y, 11) case10((x) + 1)
+#define range12(x, y) (x): check_case_range(x, y, 12) case11((x) + 1)
+#define range13(x, y) (x): check_case_range(x, y, 13) case12((x) + 1)
+#define range14(x, y) (x): check_case_range(x, y, 14) case13((x) + 1)
+#define range15(x, y) (x): check_case_range(x, y, 15) case14((x) + 1)
+#define range16(x, y) (x): check_case_range(x, y, 16) case15((x) + 1)
+#define range17(x, y) (x): check_case_range(x, y, 17) case16((x) + 1)
+#define range18(x, y) (x): check_case_range(x, y, 18) case17((x) + 1)
+#define range19(x, y) (x): check_case_range(x, y, 19) case18((x) + 1)
+#define range20(x, y) (x): check_case_range(x, y, 20) case19((x) + 1)
+#define range21(x, y) (x): check_case_range(x, y, 21) case20((x) + 1)
+#define range22(x, y) (x): check_case_range(x, y, 22) case21((x) + 1)
+#define range23(x, y) (x): check_case_range(x, y, 23) case22((x) + 1)
+#define range24(x, y) (x): check_case_range(x, y, 24) case23((x) + 1)
+#define range25(x, y) (x): check_case_range(x, y, 25) case24((x) + 1)
+#define range26(x, y) (x): check_case_range(x, y, 26) case25((x) + 1)
+#define range27(x, y) (x): check_case_range(x, y, 27) case26((x) + 1)
+#define range28(x, y) (x): check_case_range(x, y, 28) case27((x) + 1)
+#define range29(x, y) (x): check_case_range(x, y, 29) case28((x) + 1)
+#define range30(x, y) (x): check_case_range(x, y, 30) case29((x) + 1)
+#define range31(x, y) (x): check_case_range(x, y, 31) case30((x) + 1)
+#define range32(x, y) (x): check_case_range(x, y, 32) case31((x) + 1)
+#define range33(x, y) (x): check_case_range(x, y, 33) case32((x) + 1)
+#define range34(x, y) (x): check_case_range(x, y, 34) case33((x) + 1)
+#define range35(x, y) (x): check_case_range(x, y, 35) case34((x) + 1)
+#define range36(x, y) (x): check_case_range(x, y, 36) case35((x) + 1)
+#define range37(x, y) (x): check_case_range(x, y, 37) case36((x) + 1)
+#define range38(x, y) (x): check_case_range(x, y, 38) case37((x) + 1)
+#define range39(x, y) (x): check_case_range(x, y, 39) case38((x) + 1)
+#define range40(x, y) (x): check_case_range(x, y, 40) case39((x) + 1)
+#define range41(x, y) (x): check_case_range(x, y, 41) case40((x) + 1)
+#define range42(x, y) (x): check_case_range(x, y, 42) case41((x) + 1)
+#define range43(x, y) (x): check_case_range(x, y, 43) case42((x) + 1)
+#define range44(x, y) (x): check_case_range(x, y, 44) case43((x) + 1)
+#define range45(x, y) (x): check_case_range(x, y, 45) case44((x) + 1)
+#define range46(x, y) (x): check_case_range(x, y, 46) case45((x) + 1)
+#define range47(x, y) (x): check_case_range(x, y, 47) case46((x) + 1)
+#define range48(x, y) (x): check_case_range(x, y, 48) case47((x) + 1)
+#define range49(x, y) (x): check_case_range(x, y, 49) case48((x) + 1)
+#define range50(x, y) (x): check_case_range(x, y, 50) case49((x) + 1)
+#define range51(x, y) (x): check_case_range(x, y, 51) case50((x) + 1)
+#define range52(x, y) (x): check_case_range(x, y, 52) case51((x) + 1)
+#define range53(x, y) (x): check_case_range(x, y, 53) case52((x) + 1)
+#define range54(x, y) (x): check_case_range(x, y, 54) case53((x) + 1)
+#define range55(x, y) (x): check_case_range(x, y, 55) case54((x) + 1)
+#define range56(x, y) (x): check_case_range(x, y, 56) case55((x) + 1)
+#define range57(x, y) (x): check_case_range(x, y, 57) case56((x) + 1)
+#define range58(x, y) (x): check_case_range(x, y, 58) case57((x) + 1)
+#define range59(x, y) (x): check_case_range(x, y, 59) case58((x) + 1)
+#define range60(x, y) (x): check_case_range(x, y, 60) case59((x) + 1)
+#define range61(x, y) (x): check_case_range(x, y, 61) case60((x) + 1)
+#define range62(x, y) (x): check_case_range(x, y, 62) case61((x) + 1)
+#define range63(x, y) (x): check_case_range(x, y, 63) case62((x) + 1)
+#define range64(x, y) (x): check_case_range(x, y, 64) case63((x) + 1)
+#define range65(x, y) (x): check_case_range(x, y, 65) case64((x) + 1)
+#define range66(x, y) (x): check_case_range(x, y, 66) case65((x) + 1)
+#define range67(x, y) (x): check_case_range(x, y, 67) case66((x) + 1)
+#define range68(x, y) (x): check_case_range(x, y, 68) case67((x) + 1)
+#define range69(x, y) (x): check_case_range(x, y, 69) case68((x) + 1)
+#define range70(x, y) (x): check_case_range(x, y, 70) case69((x) + 1)
+#define range71(x, y) (x): check_case_range(x, y, 71) case70((x) + 1)
+#define range72(x, y) (x): check_case_range(x, y, 72) case71((x) + 1)
+#define range73(x, y) (x): check_case_range(x, y, 73) case72((x) + 1)
+#define range74(x, y) (x): check_case_range(x, y, 74) case73((x) + 1)
+#define range75(x, y) (x): check_case_range(x, y, 75) case74((x) + 1)
+#define range76(x, y) (x): check_case_range(x, y, 76) case75((x) + 1)
+#define range77(x, y) (x): check_case_range(x, y, 77) case76((x) + 1)
+#define range78(x, y) (x): check_case_range(x, y, 78) case77((x) + 1)
+#define range79(x, y) (x): check_case_range(x, y, 79) case78((x) + 1)
+#define range80(x, y) (x): check_case_range(x, y, 80) case79((x) + 1)
+#define range81(x, y) (x): check_case_range(x, y, 81) case80((x) + 1)
+#define range82(x, y) (x): check_case_range(x, y, 82) case81((x) + 1)
+#define range83(x, y) (x): check_case_range(x, y, 83) case82((x) + 1)
+#define range84(x, y) (x): check_case_range(x, y, 84) case83((x) + 1)
+#define range85(x, y) (x): check_case_range(x, y, 85) case84((x) + 1)
+#define range86(x, y) (x): check_case_range(x, y, 86) case85((x) + 1)
+#define range87(x, y) (x): check_case_range(x, y, 87) case86((x) + 1)
+#define range88(x, y) (x): check_case_range(x, y, 88) case87((x) + 1)
+#define range89(x, y) (x): check_case_range(x, y, 89) case88((x) + 1)
+#define range90(x, y) (x): check_case_range(x, y, 90) case89((x) + 1)
+#define range91(x, y) (x): check_case_range(x, y, 91) case90((x) + 1)
+#define range92(x, y) (x): check_case_range(x, y, 92) case91((x) + 1)
+#define range93(x, y) (x): check_case_range(x, y, 93) case92((x) + 1)
+#define range94(x, y) (x): check_case_range(x, y, 94) case93((x) + 1)
+#define range95(x, y) (x): check_case_range(x, y, 95) case94((x) + 1)
+#define range96(x, y) (x): check_case_range(x, y, 96) case95((x) + 1)
+#define range97(x, y) (x): check_case_range(x, y, 97) case96((x) + 1)
+#define range98(x, y) (x): check_case_range(x, y, 98) case97((x) + 1)
+#define range99(x, y) (x): check_case_range(x, y, 99) case98((x) + 1)
+#define range100(x, y) (x): check_case_range(x, y, 100) case99((x) + 1)
+#define range101(x, y) (x): check_case_range(x, y, 101) case100((x) + 1)
+#define range102(x, y) (x): check_case_range(x, y, 102) case101((x) + 1)
+#define range103(x, y) (x): check_case_range(x, y, 103) case102((x) + 1)
+#define range104(x, y) (x): check_case_range(x, y, 104) case103((x) + 1)
+#define range105(x, y) (x): check_case_range(x, y, 105) case104((x) + 1)
+#define range106(x, y) (x): check_case_range(x, y, 106) case105((x) + 1)
+#define range107(x, y) (x): check_case_range(x, y, 107) case106((x) + 1)
+#define range108(x, y) (x): check_case_range(x, y, 108) case107((x) + 1)
+#define range109(x, y) (x): check_case_range(x, y, 109) case108((x) + 1)
+#define range110(x, y) (x): check_case_range(x, y, 110) case109((x) + 1)
+#define range111(x, y) (x): check_case_range(x, y, 111) case110((x) + 1)
+#define range112(x, y) (x): check_case_range(x, y, 112) case111((x) + 1)
+#define range113(x, y) (x): check_case_range(x, y, 113) case112((x) + 1)
+#define range114(x, y) (x): check_case_range(x, y, 114) case113((x) + 1)
+#define range115(x, y) (x): check_case_range(x, y, 115) case114((x) + 1)
+#define range116(x, y) (x): check_case_range(x, y, 116) case115((x) + 1)
+#define range117(x, y) (x): check_case_range(x, y, 117) case116((x) + 1)
+#define range118(x, y) (x): check_case_range(x, y, 118) case117((x) + 1)
+#define range119(x, y) (x): check_case_range(x, y, 119) case118((x) + 1)
+#define range120(x, y) (x): check_case_range(x, y, 120) case119((x) + 1)
+#define range121(x, y) (x): check_case_range(x, y, 121) case120((x) + 1)
+#define range122(x, y) (x): check_case_range(x, y, 122) case121((x) + 1)
+#define range123(x, y) (x): check_case_range(x, y, 123) case122((x) + 1)
+#define range124(x, y) (x): check_case_range(x, y, 124) case123((x) + 1)
+#define range125(x, y) (x): check_case_range(x, y, 125) case124((x) + 1)
+#define range126(x, y) (x): check_case_range(x, y, 126) case125((x) + 1)
+#define range127(x, y) (x): check_case_range(x, y, 127) case126((x) + 1)
+#define range128(x, y) (x): check_case_range(x, y, 128) case127((x) + 1)
diff --git a/waterbox/ares64/ares/nall/cd.hpp b/waterbox/ares64/ares/nall/cd.hpp
index 9fca9abab7..55acbeec61 100644
--- a/waterbox/ares64/ares/nall/cd.hpp
+++ b/waterbox/ares64/ares/nall/cd.hpp
@@ -22,6 +22,7 @@
 #include <nall/matrix.hpp>
 #include <nall/reed-solomon.hpp>
 
+#include <nall/bcd.hpp>
 #include <nall/cd/crc16.hpp>
 #include <nall/cd/efm.hpp>
 #include <nall/cd/sync.hpp>
diff --git a/waterbox/ares64/ares/nall/cd/session.hpp b/waterbox/ares64/ares/nall/cd/session.hpp
index 26d359591c..f8b826c5e1 100644
--- a/waterbox/ares64/ares/nall/cd/session.hpp
+++ b/waterbox/ares64/ares/nall/cd/session.hpp
@@ -7,11 +7,6 @@ namespace nall::CD {
 
 enum : s32 { InvalidLBA = 100 * 60 * 75 };
 
-struct BCD {
-  static auto encode(u8 value) -> u8 { return value / 10 << 4 | value % 10; }
-  static auto decode(u8 value) -> u8 { return (value >> 4) * 10 + (value & 15); }
-};
-
 struct MSF {
   u8 minute;        //00-99
   u8 second;        //00-59
diff --git a/waterbox/ares64/ares/nall/chrono.hpp b/waterbox/ares64/ares/nall/chrono.hpp
index 08b9e3e86a..dec25cbd8d 100644
--- a/waterbox/ares64/ares/nall/chrono.hpp
+++ b/waterbox/ares64/ares/nall/chrono.hpp
@@ -3,14 +3,15 @@
 #include <nall/function.hpp>
 #include <nall/string.hpp>
 
+#include <chrono>
+
 namespace nall::chrono {
 
 //passage of time functions (from unknown epoch)
 
 inline auto nanosecond() -> u64 {
-  timespec tv;
-  clock_gettime(CLOCK_MONOTONIC, &tv);
-  return tv.tv_sec * 1'000'000'000 + tv.tv_nsec;
+  auto now = std::chrono::steady_clock::now().time_since_epoch();  //monotonic clock; its epoch is unspecified
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(now).count();  //tick count in nanoseconds, converted to u64
 }
 
 inline auto microsecond() -> u64 { return nanosecond() / 1'000; }
diff --git a/waterbox/ares64/ares/nall/cipher/chacha20.hpp b/waterbox/ares64/ares/nall/cipher/chacha20.hpp
index 4d0088056a..eec1042d59 100644
--- a/waterbox/ares64/ares/nall/cipher/chacha20.hpp
+++ b/waterbox/ares64/ares/nall/cipher/chacha20.hpp
@@ -102,7 +102,7 @@ struct HChaCha20 : protected ChaCha20 {
 //192-bit nonce; 64-bit x 64-byte (256GB) counter
 struct XChaCha20 : ChaCha20 {
   XChaCha20(u256 key, u192 nonce, u64 counter = 0):
-  ChaCha20(HChaCha20(key, nonce).key(), nonce >> 128, counter) {
+  ChaCha20(HChaCha20(key, u128(nonce)).key(), nonce >> 128, counter) {
   }
 };
 
diff --git a/waterbox/ares64/ares/nall/database/sqlite3.hpp b/waterbox/ares64/ares/nall/database/sqlite3.hpp
index 8a8322c487..abc2af6468 100644
--- a/waterbox/ares64/ares/nall/database/sqlite3.hpp
+++ b/waterbox/ares64/ares/nall/database/sqlite3.hpp
@@ -116,9 +116,9 @@ struct SQLite3 {
     auto& bind(u32 column, u64 value) { sqlite3_bind_int64(_statement, 1 + column, value); return *this; }
     auto& bind(u32 column, intmax value) { sqlite3_bind_int64(_statement, 1 + column, value); return *this; }
     auto& bind(u32 column, uintmax value) { sqlite3_bind_int64(_statement, 1 + column, value); return *this; }
-    auto& bind(u32 column, nall::boolean value) { sqlite3_bind_int64(_statement, 1 + column, value); return *this; }
-    auto& bind(u32 column, nall::integer value) { sqlite3_bind_int64(_statement, 1 + column, value); return *this; }
-    auto& bind(u32 column, nall::natural value) { sqlite3_bind_int64(_statement, 1 + column, value); return *this; }
+    auto& bind(u32 column, nall::Boolean value) { sqlite3_bind_int64(_statement, 1 + column, value); return *this; }
+    auto& bind(u32 column, nall::Integer value) { sqlite3_bind_int64(_statement, 1 + column, value); return *this; }
+    auto& bind(u32 column, nall::Natural value) { sqlite3_bind_int64(_statement, 1 + column, value); return *this; }
     auto& bind(u32 column, f64 value) { sqlite3_bind_double(_statement, 1 + column, value); return *this; }
     auto& bind(u32 column, const nall::string& value) { sqlite3_bind_text(_statement, 1 + column, value.data(), value.size(), SQLITE_TRANSIENT); return *this; }
     auto& bind(u32 column, const vector<u8>& value) { sqlite3_bind_blob(_statement, 1 + column, value.data(), value.size(), SQLITE_TRANSIENT); return *this; }
@@ -131,9 +131,9 @@ struct SQLite3 {
     auto& bind(u64 value) { return bind(_input++, value); }
     auto& bind(intmax value) { return bind(_input++, value); }
     auto& bind(uintmax value) { return bind(_input++, value); }
-    auto& bind(nall::boolean value) { return bind(_input++, value); }
-    auto& bind(nall::integer value) { return bind(_input++, value); }
-    auto& bind(nall::natural value) { return bind(_input++, value); }
+    auto& bind(nall::Boolean value) { return bind(_input++, value); }
+    auto& bind(nall::Integer value) { return bind(_input++, value); }
+    auto& bind(nall::Natural value) { return bind(_input++, value); }
     auto& bind(f64 value) { return bind(_input++, value); }
     auto& bind(const nall::string& value) { return bind(_input++, value); }
     auto& bind(const vector<u8>& value) { return bind(_input++, value); }
diff --git a/waterbox/ares64/ares/nall/decode/chd.hpp b/waterbox/ares64/ares/nall/decode/chd.hpp
index 8d48228a81..b150787f68 100644
--- a/waterbox/ares64/ares/nall/decode/chd.hpp
+++ b/waterbox/ares64/ares/nall/decode/chd.hpp
@@ -3,7 +3,9 @@
 #include <nall/file.hpp>
 #include <nall/maybe.hpp>
 #include <nall/string.hpp>
-//#include <libchdr/chd.h>
+#if false
+#include <libchdr/chd.h>
+#endif
 
 namespace nall::Decode {
 
@@ -35,7 +37,9 @@ struct CHD {
   vector<Track> tracks;
 private:
   file_buffer fp;
-  //chd_file* chd = nullptr;
+#if false
+  chd_file* chd = nullptr;
+#endif
   const int chd_sector_size = 2352 + 96;
   size_t chd_hunk_size;
   vector<u8> chd_hunk_buffer;
@@ -43,9 +47,11 @@ private:
 };
 
 inline CHD::~CHD() {
-  /*if (chd != nullptr) {
+#if false
+  if (chd != nullptr) {
      chd_close(chd);
-  }*/
+  }
+#endif
 }
 
 inline auto CHD::load(const string& location) -> bool {
@@ -56,8 +62,8 @@ inline auto CHD::load(const string& location) -> bool {
   }
 
   return false;
-
-  /*chd_error err = chd_open_file(fp.handle(), CHD_OPEN_READ, nullptr, &chd);
+#if false
+  chd_error err = chd_open_file(fp.handle(), CHD_OPEN_READ, nullptr, &chd);
   if (err != CHDERR_NONE) {
     print("CHD: Failed to open ", location, ": ", chd_error_string(err), "\n");
     return false;
@@ -112,7 +118,7 @@ inline auto CHD::load(const string& location) -> bool {
 
     // We currently only support RAW and audio tracks; log an error and exit if we see anything different
     auto typeStr = string{type};
-    if (!(typeStr.find("_RAW") || typeStr.find("AUDIO"))) {
+    if (!(typeStr.find("_RAW") || typeStr.find("AUDIO") || typeStr.find("MODE1"))) {
       print("CHD: Unsupported track type: ", type, "\n");
       return false;
     }
@@ -180,18 +186,20 @@ inline auto CHD::load(const string& location) -> bool {
     tracks.append(track);
   }
 
-  return true;*/
+  return true;
+#endif
 }
 
 inline auto CHD::read(u32 sector) -> vector<u8> {
   // Convert LBA in CD-ROM to LBA in CHD
-  /*for(auto& track : tracks) {
+#if false
+  for(auto& track : tracks) {
     for(auto& index : track.indices) {
       if (sector >= index.lba && sector <= index.end) {
         auto chd_lba = (sector - index.lba) + index.chd_lba;
 
         vector<u8> output;
-        output.resize(2352);
+        output.resize(track.type == "MODE1" ? 2048 : 2352);
 
         int hunk = (chd_lba * chd_sector_size) / chd_hunk_size;
         int offset = (chd_lba * chd_sector_size) % chd_hunk_size;
@@ -215,7 +223,7 @@ inline auto CHD::read(u32 sector) -> vector<u8> {
             dst_ptr += sizeof(value);
           }
         } else {
-          std::copy(chd_hunk_buffer.data() + offset, chd_hunk_buffer.data() + offset + 2352, output.data());
+          std::copy(chd_hunk_buffer.data() + offset, chd_hunk_buffer.data() + offset + output.size(), output.data());
         }
 
         return output;
@@ -223,7 +231,8 @@ inline auto CHD::read(u32 sector) -> vector<u8> {
     }
   }
 
-  print("CHD: Attempting to read from unmapped sector ", sector, "\n");*/
+  print("CHD: Attempting to read from unmapped sector ", sector, "\n");
+#endif
   return {};
 }
 
diff --git a/waterbox/ares64/ares/nall/directory.cpp b/waterbox/ares64/ares/nall/directory.cpp
new file mode 100644
index 0000000000..31eb783f85
--- /dev/null
+++ b/waterbox/ares64/ares/nall/directory.cpp
@@ -0,0 +1,79 @@
+#include <nall/directory.hpp>
+
+namespace nall {
+
+#if defined(PLATFORM_WINDOWS)
+
+//true when pathname (surrounding double quotes trimmed) names an existing directory
+NALL_HEADER_INLINE auto directory::exists(const string& pathname) -> bool {
+  if(!pathname) return false;
+  string unquoted = pathname;
+  unquoted.trim("\"", "\"");
+  DWORD attributes = GetFileAttributes(utf16_t(unquoted));
+  return attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY);
+}
+
+//lists the sub-folders of pathname whose names match pattern, skipping "." and ".."
+//an empty pathname is a root pseudo-folder: the logical drives are listed instead
+NALL_HEADER_INLINE auto directory::ufolders(const string& pathname, const string& pattern) -> vector<string> {
+  if(!pathname) {
+    //the drive strings arrive NUL-separated with a double NUL at the end;
+    //rewrite each separator as ';' so the buffer can be split as one string
+    wchar_t drives[PATH_MAX] = {0};
+    GetLogicalDriveStrings(PATH_MAX, drives);
+    for(wchar_t* cursor = drives; *cursor || *(cursor + 1); cursor++) {
+      if(!*cursor) *cursor = ';';
+    }
+    return string{(const char*)utf8_t(drives)}.replace("\\", "/").split(";");
+  }
+
+  //build the wildcard query "<pathname>\*" using backslash separators
+  string query = pathname;
+  query.transform("/", "\\");
+  if(!query.endsWith("\\")) query.append("\\");
+  query.append("*");
+
+  vector<string> folders;
+  WIN32_FIND_DATA entry;
+  HANDLE search = FindFirstFile(utf16_t(query), &entry);
+  if(search == INVALID_HANDLE_VALUE) return folders;
+
+  do {
+    if(!(entry.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) continue;
+    if(!wcscmp(entry.cFileName, L".") || !wcscmp(entry.cFileName, L"..")) continue;
+    string name = (const char*)utf8_t(entry.cFileName);
+    if(name.match(pattern)) folders.append(name);
+  } while(FindNextFile(search, &entry) != false);
+
+  FindClose(search);
+  return folders;
+}
+
+//lists the non-directory entries of pathname whose names match pattern
+NALL_HEADER_INLINE auto directory::ufiles(const string& pathname, const string& pattern) -> vector<string> {
+  if(!pathname) return {};
+
+  //build the wildcard query "<pathname>\*" using backslash separators
+  string query = pathname;
+  query.transform("/", "\\");
+  if(!query.endsWith("\\")) query.append("\\");
+  query.append("*");
+
+  vector<string> files;
+  WIN32_FIND_DATA entry;
+  HANDLE search = FindFirstFile(utf16_t(query), &entry);
+  if(search == INVALID_HANDLE_VALUE) return files;
+
+  do {
+    if(entry.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) continue;
+    string name = (const char*)utf8_t(entry.cFileName);
+    if(name.match(pattern)) files.append(name);
+  } while(FindNextFile(search, &entry) != false);
+
+  FindClose(search);
+  return files;
+}
+
+#endif
+
+}
diff --git a/waterbox/ares64/ares/nall/directory.hpp b/waterbox/ares64/ares/nall/directory.hpp
index c3a2d3aa6f..86fd1ca480 100644
--- a/waterbox/ares64/ares/nall/directory.hpp
+++ b/waterbox/ares64/ares/nall/directory.hpp
@@ -192,83 +192,6 @@ inline auto directory::copy(const string& source, const string& target) -> bool
     }
     return _wrmdir(utf16_t(pathname)) == 0;
   }
-
-  inline auto directory::exists(const string& pathname) -> bool {
-    if(!pathname) return false;
-    string name = pathname;
-    name.trim("\"", "\"");
-    DWORD result = GetFileAttributes(utf16_t(name));
-    if(result == INVALID_FILE_ATTRIBUTES) return false;
-    return (result & FILE_ATTRIBUTE_DIRECTORY);
-  }
-
-  inline auto directory::ufolders(const string& pathname, const string& pattern) -> vector<string> {
-    if(!pathname) {
-      //special root pseudo-folder (return list of drives)
-      wchar_t drives[PATH_MAX] = {0};
-      GetLogicalDriveStrings(PATH_MAX, drives);
-      wchar_t* p = drives;
-      while(*p || *(p + 1)) {
-        if(!*p) *p = ';';
-        p++;
-      }
-      return string{(const char*)utf8_t(drives)}.replace("\\", "/").split(";");
-    }
-
-    vector<string> list;
-    string path = pathname;
-    path.transform("/", "\\");
-    if(!path.endsWith("\\")) path.append("\\");
-    path.append("*");
-    HANDLE handle;
-    WIN32_FIND_DATA data;
-    handle = FindFirstFile(utf16_t(path), &data);
-    if(handle != INVALID_HANDLE_VALUE) {
-      if(wcscmp(data.cFileName, L".") && wcscmp(data.cFileName, L"..")) {
-        if(data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
-          string name = (const char*)utf8_t(data.cFileName);
-          if(name.match(pattern)) list.append(name);
-        }
-      }
-      while(FindNextFile(handle, &data) != false) {
-        if(wcscmp(data.cFileName, L".") && wcscmp(data.cFileName, L"..")) {
-          if(data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
-            string name = (const char*)utf8_t(data.cFileName);
-            if(name.match(pattern)) list.append(name);
-          }
-        }
-      }
-      FindClose(handle);
-    }
-    return list;
-  }
-
-  inline auto directory::ufiles(const string& pathname, const string& pattern) -> vector<string> {
-    if(!pathname) return {};
-
-    vector<string> list;
-    string path = pathname;
-    path.transform("/", "\\");
-    if(!path.endsWith("\\")) path.append("\\");
-    path.append("*");
-    HANDLE handle;
-    WIN32_FIND_DATA data;
-    handle = FindFirstFile(utf16_t(path), &data);
-    if(handle != INVALID_HANDLE_VALUE) {
-      if((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) == 0) {
-        string name = (const char*)utf8_t(data.cFileName);
-        if(name.match(pattern)) list.append(name);
-      }
-      while(FindNextFile(handle, &data) != false) {
-        if((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) == 0) {
-          string name = (const char*)utf8_t(data.cFileName);
-          if(name.match(pattern)) list.append(name);
-        }
-      }
-      FindClose(handle);
-    }
-    return list;
-  }
 #else
   inline auto directoryIsFolder(DIR* dp, struct dirent* ep) -> bool {
     if(ep->d_type == DT_DIR) return true;
@@ -353,3 +276,7 @@ inline auto directory::copy(const string& source, const string& target) -> bool
 #endif
 
 }
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/directory.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/dl.cpp b/waterbox/ares64/ares/nall/dl.cpp
new file mode 100644
index 0000000000..71c3ed50be
--- /dev/null
+++ b/waterbox/ares64/ares/nall/dl.cpp
@@ -0,0 +1,45 @@
+#include <nall/dl.hpp>
+
+namespace nall {
+
+#if defined(PLATFORM_WINDOWS)
+
+//loads "<path><name>.dll" when path is given; if that fails (or path is empty),
+//retries with the bare "<name>.dll". any previously loaded library is closed first
+NALL_HEADER_INLINE auto library::open(const string& name, const string& path) -> bool {
+  if(handle) close();
+
+  if(path) {
+    string qualified = {path, name, ".dll"};
+    handle = (uintptr)LoadLibraryW(utf16_t(qualified));
+    if(handle) return true;
+  }
+
+  string bare = {name, ".dll"};
+  handle = (uintptr)LoadLibraryW(utf16_t(bare));
+  return handle;
+}
+
+//loads the library at name exactly as given (no ".dll" suffix is appended)
+NALL_HEADER_INLINE auto library::openAbsolute(const string& name) -> bool {
+  if(handle) close();
+  handle = (uintptr)LoadLibraryW(utf16_t(name));
+  return handle != 0;
+}
+
+//looks up an exported symbol by name; returns nullptr when no library is loaded
+NALL_HEADER_INLINE auto library::sym(const string& name) -> void* {
+  return handle ? (void*)GetProcAddress((HMODULE)handle, name) : nullptr;
+}
+
+//unloads the current library, if any, and clears the handle
+NALL_HEADER_INLINE auto library::close() -> void {
+  if(handle) {
+    FreeLibrary((HMODULE)handle);
+    handle = 0;
+  }
+}
+
+#endif
+
+}
diff --git a/waterbox/ares64/ares/nall/dl.hpp b/waterbox/ares64/ares/nall/dl.hpp
index 8116d34921..942511b208 100644
--- a/waterbox/ares64/ares/nall/dl.hpp
+++ b/waterbox/ares64/ares/nall/dl.hpp
@@ -87,35 +87,7 @@ inline auto library::close() -> void {
   handle = 0;
 }
 #elif defined(PLATFORM_WINDOWS)
-inline auto library::open(const string& name, const string& path) -> bool {
-  if(handle) close();
-  if(path) {
-    string filepath = {path, name, ".dll"};
-    handle = (uintptr)LoadLibraryW(utf16_t(filepath));
-  }
-  if(!handle) {
-    string filepath = {name, ".dll"};
-    handle = (uintptr)LoadLibraryW(utf16_t(filepath));
-  }
-  return handle;
-}
-
-inline auto library::openAbsolute(const string& name) -> bool {
-  if(handle) close();
-  handle = (uintptr)LoadLibraryW(utf16_t(name));
-  return handle;
-}
-
-inline auto library::sym(const string& name) -> void* {
-  if(!handle) return nullptr;
-  return (void*)GetProcAddress((HMODULE)handle, name);
-}
-
-inline auto library::close() -> void {
-  if(!handle) return;
-  FreeLibrary((HMODULE)handle);
-  handle = 0;
-}
+//defined in dl.cpp
 #else
 inline auto library::open(const string&, const string&) -> bool { return false; }
 inline auto library::openAbsolute(const string&) -> bool { return false; }
@@ -124,3 +96,7 @@ inline auto library::close() -> void {}
 #endif
 
 }
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/dl.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/file-map.cpp b/waterbox/ares64/ares/nall/file-map.cpp
new file mode 100644
index 0000000000..c579f0c607
--- /dev/null
+++ b/waterbox/ares64/ares/nall/file-map.cpp
@@ -0,0 +1,93 @@
+#include <nall/file-map.hpp>
+
+namespace nall {
+
+#if defined(API_WINDOWS)
+
+//opens filename with the access implied by mode_ and maps its contents into memory
+//returns true on success; a zero-length file counts as open but has no mapped data
+//returns false for an unrecognized mode or when the file cannot be opened or mapped
+NALL_HEADER_INLINE auto file_map::open(const string& filename, u32 mode_) -> bool {
+  close();
+  if(file::exists(filename) && file::size(filename) == 0) return _open = true;
+
+  s32 desiredAccess, creationDisposition, protection, mapAccess;
+
+  switch(mode_) {
+  default: return false;
+  case mode::read:
+    desiredAccess = GENERIC_READ;
+    creationDisposition = OPEN_EXISTING;
+    protection = PAGE_READONLY;
+    mapAccess = FILE_MAP_READ;
+    break;
+  case mode::write:
+    //write access requires read access
+    desiredAccess = GENERIC_WRITE;
+    creationDisposition = CREATE_ALWAYS;
+    protection = PAGE_READWRITE;
+    mapAccess = FILE_MAP_ALL_ACCESS;
+    break;
+  case mode::modify:
+    desiredAccess = GENERIC_READ | GENERIC_WRITE;
+    creationDisposition = OPEN_EXISTING;
+    protection = PAGE_READWRITE;
+    mapAccess = FILE_MAP_ALL_ACCESS;
+    break;
+  case mode::append:
+    desiredAccess = GENERIC_READ | GENERIC_WRITE;
+    creationDisposition = CREATE_NEW;
+    protection = PAGE_READWRITE;
+    mapAccess = FILE_MAP_ALL_ACCESS;
+    break;
+  }
+
+  _file = CreateFileW(utf16_t(filename), desiredAccess, FILE_SHARE_READ, nullptr,
+    creationDisposition, FILE_ATTRIBUTE_NORMAL, nullptr);
+  if(_file == INVALID_HANDLE_VALUE) return false;
+
+  _size = GetFileSize(_file, nullptr);
+  //CreateFileMapping rejects zero-length files (e.g. one that was just created);
+  //treat that like the empty-file case above: open, but with nothing mapped
+  if(_size == 0) return _open = true;
+
+  //CreateFileMapping reports failure with nullptr, not INVALID_HANDLE_VALUE
+  _map = CreateFileMapping(_file, nullptr, protection, 0, _size, nullptr);
+  if(!_map) {
+    _map = INVALID_HANDLE_VALUE;  //restore the sentinel that close() tests against
+    close();
+    return false;
+  }
+
+  _data = (u8*)MapViewOfFile(_map, mapAccess, 0, 0, _size);
+  if(!_data) {
+    //the view could not be created: release the mapping and file handles
+    close();
+    return false;
+  }
+  return _open = true;
+}
+
+//unmaps the view and closes the mapping and file handles; safe to call repeatedly
+NALL_HEADER_INLINE auto file_map::close() -> void {
+  if(_data) {
+    UnmapViewOfFile(_data);
+    _data = nullptr;
+  }
+
+  if(_map != INVALID_HANDLE_VALUE) {
+    CloseHandle(_map);
+    _map = INVALID_HANDLE_VALUE;
+  }
+
+  if(_file != INVALID_HANDLE_VALUE) {
+    CloseHandle(_file);
+    _file = INVALID_HANDLE_VALUE;
+  }
+
+  _open = false;
+}
+
+#endif
+
+}
diff --git a/waterbox/ares64/ares/nall/file-map.hpp b/waterbox/ares64/ares/nall/file-map.hpp
index 2f7e506e55..0a7ec70094 100644
--- a/waterbox/ares64/ares/nall/file-map.hpp
+++ b/waterbox/ares64/ares/nall/file-map.hpp
@@ -73,76 +73,9 @@ public:
     return *this;
   }
 
-  auto open(const string& filename, u32 mode_) -> bool {
-    close();
-    if(file::exists(filename) && file::size(filename) == 0) return _open = true;
+  auto open(const string& filename, u32 mode_) -> bool;
 
-    s32 desiredAccess, creationDisposition, protection, mapAccess;
-
-    switch(mode_) {
-    default: return false;
-    case mode::read:
-      desiredAccess = GENERIC_READ;
-      creationDisposition = OPEN_EXISTING;
-      protection = PAGE_READONLY;
-      mapAccess = FILE_MAP_READ;
-      break;
-    case mode::write:
-      //write access requires read access
-      desiredAccess = GENERIC_WRITE;
-      creationDisposition = CREATE_ALWAYS;
-      protection = PAGE_READWRITE;
-      mapAccess = FILE_MAP_ALL_ACCESS;
-      break;
-    case mode::modify:
-      desiredAccess = GENERIC_READ | GENERIC_WRITE;
-      creationDisposition = OPEN_EXISTING;
-      protection = PAGE_READWRITE;
-      mapAccess = FILE_MAP_ALL_ACCESS;
-      break;
-    case mode::append:
-      desiredAccess = GENERIC_READ | GENERIC_WRITE;
-      creationDisposition = CREATE_NEW;
-      protection = PAGE_READWRITE;
-      mapAccess = FILE_MAP_ALL_ACCESS;
-      break;
-    }
-
-    _file = CreateFileW(utf16_t(filename), desiredAccess, FILE_SHARE_READ, nullptr,
-      creationDisposition, FILE_ATTRIBUTE_NORMAL, nullptr);
-    if(_file == INVALID_HANDLE_VALUE) return false;
-
-    _size = GetFileSize(_file, nullptr);
-
-    _map = CreateFileMapping(_file, nullptr, protection, 0, _size, nullptr);
-    if(_map == INVALID_HANDLE_VALUE) {
-      CloseHandle(_file);
-      _file = INVALID_HANDLE_VALUE;
-      return false;
-    }
-
-    _data = (u8*)MapViewOfFile(_map, mapAccess, 0, 0, _size);
-    return _open = true;
-  }
-
-  auto close() -> void {
-    if(_data) {
-      UnmapViewOfFile(_data);
-      _data = nullptr;
-    }
-
-    if(_map != INVALID_HANDLE_VALUE) {
-      CloseHandle(_map);
-      _map = INVALID_HANDLE_VALUE;
-    }
-
-    if(_file != INVALID_HANDLE_VALUE) {
-      CloseHandle(_file);
-      _file = INVALID_HANDLE_VALUE;
-    }
-
-    _open = false;
-  }
+  auto close() -> void;
 
   #else
 
@@ -229,3 +162,7 @@ public:
 };
 
 }
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/file-map.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/file.hpp b/waterbox/ares64/ares/nall/file.hpp
index a4bec188ba..91be160ef7 100644
--- a/waterbox/ares64/ares/nall/file.hpp
+++ b/waterbox/ares64/ares/nall/file.hpp
@@ -68,7 +68,7 @@ struct file : inode {
     struct __stat64 data;
     _wstat64(utf16_t(filename), &data);
     #endif
-    return S_ISREG(data.st_mode) ? data.st_size : 0u;
+    return (data.st_mode & S_IFMT) == S_IFREG ? data.st_size : 0u;
   }
 
   static auto read(const string& filename) -> vector<u8> {
diff --git a/waterbox/ares64/ares/nall/function.hpp b/waterbox/ares64/ares/nall/function.hpp
index d28b809f1d..617330c4ad 100644
--- a/waterbox/ares64/ares/nall/function.hpp
+++ b/waterbox/ares64/ares/nall/function.hpp
@@ -22,7 +22,6 @@ template<typename R, typename... P> struct function<auto (P...) -> R> {
   template<typename C> function(auto (C::*function)(P...) -> R, C* object) { callback = new member<C>(function, object); }
   template<typename C> function(auto (C::*function)(P...) const -> R, C* object) { callback = new member<C>((auto (C::*)(P...) -> R)function, object); }
   template<typename L, typename = enable_if_t<is_compatible<L>::value>> function(const L& object) { callback = new lambda<L>(object); }
-  explicit function(void* function) { if(function) callback = new global((auto (*)(P...) -> R)function); }
   ~function() { if(callback) delete callback; }
 
   explicit operator bool() const { return callback; }
@@ -37,12 +36,6 @@ template<typename R, typename... P> struct function<auto (P...) -> R> {
     return *this;
   }
 
-  auto operator=(void* source) -> function& {
-    if(callback) { delete callback; callback = nullptr; }
-    callback = new global((auto (*)(P...) -> R)source);
-    return *this;
-  }
-
 private:
   struct container {
     virtual auto operator()(P... p) const -> R = 0;
diff --git a/waterbox/ares64/ares/nall/http/client.cpp b/waterbox/ares64/ares/nall/http/client.cpp
new file mode 100644
index 0000000000..9d5c98ff7f
--- /dev/null
+++ b/waterbox/ares64/ares/nall/http/client.cpp
@@ -0,0 +1,36 @@
+#include <nall/http/client.hpp>
+
+#if defined(PLATFORM_WINDOWS)
+  #include <ws2tcpip.h>
+#endif
+
+namespace nall::HTTP {
+
+NALL_HEADER_INLINE auto Client::open(const string& hostname, u16 port) -> bool {
+  //resolve the host as a stream endpoint; every failure path goes through close()
+  addrinfo hint{};
+  hint.ai_flags = AI_ADDRCONFIG;
+  hint.ai_socktype = SOCK_STREAM;
+  hint.ai_family = AF_UNSPEC;
+  if(getaddrinfo(hostname, string{port}, &hint, &info)) { close(); return false; }
+
+  //only the first resolved address is tried
+  fd = socket(info->ai_family, info->ai_socktype, info->ai_protocol);
+  if(fd < 0) { close(); return false; }
+  if(connect(fd, info->ai_addr, info->ai_addrlen) < 0) { close(); return false; }
+  return true;
+}
+
+NALL_HEADER_INLINE auto Client::close() -> void {
+  if(fd >= 0) {  //-1 marks "no socket"; 0 is a valid descriptor, so test the sign, not truthiness
+    ::close(fd);
+    fd = -1;
+  }
+
+  if(info) {  //release the address list filled in by getaddrinfo() in open()
+    freeaddrinfo(info);
+    info = nullptr;
+  }
+}
+
+}
diff --git a/waterbox/ares64/ares/nall/http/client.hpp b/waterbox/ares64/ares/nall/http/client.hpp
index 1fbf54a04b..8e96417920 100644
--- a/waterbox/ares64/ares/nall/http/client.hpp
+++ b/waterbox/ares64/ares/nall/http/client.hpp
@@ -2,6 +2,8 @@
 
 #include <nall/http/role.hpp>
 
+struct addrinfo;
+
 namespace nall::HTTP {
 
 struct Client : Role {
@@ -16,21 +18,6 @@ private:
   addrinfo* info = nullptr;
 };
 
-inline auto Client::open(const string& hostname, u16 port) -> bool {
-  addrinfo hint = {};
-  hint.ai_family = AF_UNSPEC;
-  hint.ai_socktype = SOCK_STREAM;
-  hint.ai_flags = AI_ADDRCONFIG;
-
-  if(getaddrinfo(hostname, string{port}, &hint, &info) != 0) return close(), false;
-
-  fd = socket(info->ai_family, info->ai_socktype, info->ai_protocol);
-  if(fd < 0) return close(), false;
-
-  if(connect(fd, info->ai_addr, info->ai_addrlen) < 0) return close(), false;
-  return true;
-}
-
 inline auto Client::upload(const Request& request) -> bool {
   return Role::upload(fd, request);
 }
@@ -41,16 +28,8 @@ inline auto Client::download(const Request& request) -> Response {
   return response;
 }
 
-inline auto Client::close() -> void {
-  if(fd) {
-    ::close(fd);
-    fd = -1;
-  }
-
-  if(info) {
-    freeaddrinfo(info);
-    info = nullptr;
-  }
 }
 
-}
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/http/client.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/http/response.hpp b/waterbox/ares64/ares/nall/http/response.hpp
index 93a3b7a7b5..bc5aa12061 100644
--- a/waterbox/ares64/ares/nall/http/response.hpp
+++ b/waterbox/ares64/ares/nall/http/response.hpp
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <nall/chrono.hpp>
+#include <nall/file-map.hpp>
 #include <nall/http/message.hpp>
 
 namespace nall::HTTP {
diff --git a/waterbox/ares64/ares/nall/http/server.cpp b/waterbox/ares64/ares/nall/http/server.cpp
new file mode 100644
index 0000000000..98f225b729
--- /dev/null
+++ b/waterbox/ares64/ares/nall/http/server.cpp
@@ -0,0 +1,185 @@
+#include <nall/http/server.hpp>
+#include <nall/thread.hpp>
+
+#if defined(PLATFORM_WINDOWS)
+  #include <ws2tcpip.h>
+#endif
+
+namespace nall::HTTP {
+
+NALL_HEADER_INLINE auto Server::open(u16 port, const string& serviceName, const string& command) -> bool {  //true when at least one listener is usable
+  if(serviceName) {
+    if(!service::command(serviceName, command)) return false;
+  }
+
+  fd4 = socket(AF_INET, SOCK_STREAM, 0);
+  fd6 = socket(AF_INET6, SOCK_STREAM, 0);
+  if(!ipv4() && !ipv6()) return false;  //neither socket could be created
+
+  {
+  #if defined(SO_RCVTIMEO)
+  if(settings.timeoutReceive) {
+    struct timeval rcvtimeo;
+    rcvtimeo.tv_sec  = settings.timeoutReceive / 1000;  //the setting is split as milliseconds
+    rcvtimeo.tv_usec = settings.timeoutReceive % 1000 * 1000;
+    if(ipv4()) setsockopt(fd4, SOL_SOCKET, SO_RCVTIMEO, &rcvtimeo, sizeof(struct timeval));
+    if(ipv6()) setsockopt(fd6, SOL_SOCKET, SO_RCVTIMEO, &rcvtimeo, sizeof(struct timeval));
+  }
+  #endif
+
+  #if defined(SO_SNDTIMEO)
+  if(settings.timeoutSend) {
+    struct timeval sndtimeo;
+    sndtimeo.tv_sec  = settings.timeoutSend / 1000;  //the setting is split as milliseconds
+    sndtimeo.tv_usec = settings.timeoutSend % 1000 * 1000;
+    if(ipv4()) setsockopt(fd4, SOL_SOCKET, SO_SNDTIMEO, &sndtimeo, sizeof(struct timeval));
+    if(ipv6()) setsockopt(fd6, SOL_SOCKET, SO_SNDTIMEO, &sndtimeo, sizeof(struct timeval));
+  }
+  #endif
+
+  #if defined(SO_NOSIGPIPE)  //BSD, OSX
+  s32 nosigpipe = 1;
+  if(ipv4()) setsockopt(fd4, SOL_SOCKET, SO_NOSIGPIPE, &nosigpipe, sizeof(s32));
+  if(ipv6()) setsockopt(fd6, SOL_SOCKET, SO_NOSIGPIPE, &nosigpipe, sizeof(s32));
+  #endif
+
+  #if defined(SO_REUSEADDR)  //BSD, Linux, OSX
+  s32 reuseaddr = 1;
+  if(ipv4()) setsockopt(fd4, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, sizeof(s32));
+  if(ipv6()) setsockopt(fd6, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, sizeof(s32));
+  #endif
+
+  #if defined(SO_REUSEPORT)  //BSD, OSX
+  s32 reuseport = 1;
+  if(ipv4()) setsockopt(fd4, SOL_SOCKET, SO_REUSEPORT, &reuseport, sizeof(s32));
+  if(ipv6()) setsockopt(fd6, SOL_SOCKET, SO_REUSEPORT, &reuseport, sizeof(s32));
+  #endif
+  }
+
+  addrin4.sin_family = AF_INET;
+  addrin4.sin_addr.s_addr = htonl(INADDR_ANY);  //listen on every local IPv4 address
+  addrin4.sin_port = htons(port);
+
+  addrin6.sin6_family = AF_INET6;
+  addrin6.sin6_addr = in6addr_any;  //listen on every local IPv6 address
+  addrin6.sin6_port = htons(port);
+
+  if(bind(fd4, (struct sockaddr*)&addrin4, sizeof(addrin4)) < 0 || listen(fd4, SOMAXCONN) < 0) ipv4_close();  //a failed listener is handed to ipv4_close()
+  if(bind(fd6, (struct sockaddr*)&addrin6, sizeof(addrin6)) < 0 || listen(fd6, SOMAXCONN) < 0) ipv6_close();  //likewise for IPv6; the other family may still serve
+  return ipv4() || ipv6();
+}
+
+NALL_HEADER_INLINE auto Server::ipv4_scan() -> bool {  //true when a pending connection was handed to a worker thread
+  struct pollfd query = {0};
+  query.fd = fd4;
+  query.events = POLLIN;
+  poll(&query, 1, 0);  //zero timeout: returns immediately
+
+  if(query.fd == fd4 && query.revents & POLLIN) {
+    ++connections;  //undone on every exit path of the worker below
+
+    thread::create([&](uintptr) {
+      thread::detach();
+
+      s32 clientfd = -1;
+      struct sockaddr_in settings = {0};  //note: shadows the Server::settings member inside the worker
+      socklen_t socklen = sizeof(sockaddr_in);
+
+      clientfd = accept(fd4, (struct sockaddr*)&settings, &socklen);
+      if(clientfd < 0) { --connections; return; }  //nothing was accepted: give the slot back instead of leaking the count
+
+      u32 ip = ntohl(settings.sin_addr.s_addr);
+
+      Request request;
+      request._ipv6 = false;
+      request._ip = {  //dotted-quad text, most significant octet first
+        (u8)(ip >> 24), ".",
+        (u8)(ip >> 16), ".",
+        (u8)(ip >>  8), ".",
+        (u8)(ip >>  0)
+      };
+
+      if(download(clientfd, request) && callback) {
+        auto response = callback(request);
+        upload(clientfd, response);
+      } else {
+        upload(clientfd, Response());  //"501 Not Implemented"
+      }
+
+      ::close(clientfd);
+      --connections;
+    }, 0, settings.threadStackSize);
+
+    return true;
+  }
+
+  return false;
+}
+
+NALL_HEADER_INLINE auto Server::ipv6_scan() -> bool {  //true when a pending connection was handed to a worker thread
+  struct pollfd query = {0};
+  query.fd = fd6;
+  query.events = POLLIN;
+  poll(&query, 1, 0);  //zero timeout: returns immediately
+
+  if(query.fd == fd6 && query.revents & POLLIN) {
+    ++connections;  //undone on every exit path of the worker below
+
+    thread::create([&](uintptr) {
+      thread::detach();
+
+      s32 clientfd = -1;
+      struct sockaddr_in6 settings = {0};  //note: shadows the Server::settings member inside the worker
+      socklen_t socklen = sizeof(sockaddr_in6);
+
+      clientfd = accept(fd6, (struct sockaddr*)&settings, &socklen);
+      if(clientfd < 0) { --connections; return; }  //nothing was accepted: give the slot back instead of leaking the count
+
+      u8* ip = settings.sin6_addr.s6_addr;
+      u16 ipSegment[8];
+      for(auto n : range(8)) ipSegment[n] = ip[n * 2 + 0] * 256 + ip[n * 2 + 1];  //eight big-endian 16-bit groups
+
+      Request request;
+      request._ipv6 = true;
+      //RFC5952 IPv6 encoding: the first longest 2+ consecutive zero-sequence is compressed to "::"
+      s32 zeroOffset  = -1;
+      s32 zeroLength  =  0;
+      s32 zeroCounter =  0;
+      for(auto n : range(8)) {
+        u16 value = ipSegment[n];
+        if(value == 0) zeroCounter++;
+        if(zeroCounter > zeroLength) {  //strictly greater: an equally long later run does not replace the first
+          zeroLength = zeroCounter;
+          zeroOffset = 1 + n - zeroLength;
+        }
+        if(value != 0) zeroCounter = 0;
+      }
+      if(zeroLength == 1) zeroOffset = -1;  //a lone zero group is never compressed
+      for(u32 n = 0; n < 8;) {
+        if(n == zeroOffset) {
+          request._ip.append(n == 0 ? "::" : ":");  //the previous group already emitted one ":"
+          n += zeroLength;
+        } else {
+          u16 value = ipSegment[n];
+          request._ip.append(hex(value), n++ != 7 ? ":" : "");
+        }
+      }
+
+      if(download(clientfd, request) && callback) {
+        auto response = callback(request);
+        upload(clientfd, response);
+      } else {
+        upload(clientfd, Response());  //"501 Not Implemented"
+      }
+
+      ::close(clientfd);
+      --connections;
+    }, 0, settings.threadStackSize);
+
+    return true;
+  }
+
+  return false;
+}
+
+}
diff --git a/waterbox/ares64/ares/nall/http/server.hpp b/waterbox/ares64/ares/nall/http/server.hpp
index 5560fec27e..94469ffba8 100644
--- a/waterbox/ares64/ares/nall/http/server.hpp
+++ b/waterbox/ares64/ares/nall/http/server.hpp
@@ -3,6 +3,9 @@
 #include <nall/service.hpp>
 #include <nall/http/role.hpp>
 
+struct sockaddr_in;
+struct sockaddr_in6;
+
 namespace nall::HTTP {
 
 struct Server : Role, service {
@@ -18,8 +21,10 @@ private:
 
   s32 fd4 = -1;
   s32 fd6 = -1;
-  struct sockaddr_in addrin4 = {0};
-  struct sockaddr_in6 addrin6 = {0};
+  u64 addrin4_storage[16] = {0};  //sizeof(sockaddr_storage) = 128
+  u64 addrin6_storage[16] = {0};
+  sockaddr_in& addrin4 = (sockaddr_in&)addrin4_storage;
+  sockaddr_in6& addrin6 = (sockaddr_in6&)addrin6_storage;
 
   auto ipv4() const -> bool { return fd4 >= 0; }
   auto ipv6() const -> bool { return fd6 >= 0; }
@@ -31,68 +36,6 @@ private:
   auto ipv6_scan() -> bool;
 };
 
-inline auto Server::open(u16 port, const string& serviceName, const string& command) -> bool {
-  if(serviceName) {
-    if(!service::command(serviceName, command)) return false;
-  }
-
-  fd4 = socket(AF_INET, SOCK_STREAM, 0);
-  fd6 = socket(AF_INET6, SOCK_STREAM, 0);
-  if(!ipv4() && !ipv6()) return false;
-
-  {
-  #if defined(SO_RCVTIMEO)
-  if(settings.timeoutReceive) {
-    struct timeval rcvtimeo;
-    rcvtimeo.tv_sec  = settings.timeoutReceive / 1000;
-    rcvtimeo.tv_usec = settings.timeoutReceive % 1000 * 1000;
-    if(ipv4()) setsockopt(fd4, SOL_SOCKET, SO_RCVTIMEO, &rcvtimeo, sizeof(struct timeval));
-    if(ipv6()) setsockopt(fd6, SOL_SOCKET, SO_RCVTIMEO, &rcvtimeo, sizeof(struct timeval));
-  }
-  #endif
-
-  #if defined(SO_SNDTIMEO)
-  if(settings.timeoutSend) {
-    struct timeval sndtimeo;
-    sndtimeo.tv_sec  = settings.timeoutSend / 1000;
-    sndtimeo.tv_usec = settings.timeoutSend % 1000 * 1000;
-    if(ipv4()) setsockopt(fd4, SOL_SOCKET, SO_SNDTIMEO, &sndtimeo, sizeof(struct timeval));
-    if(ipv6()) setsockopt(fd6, SOL_SOCKET, SO_SNDTIMEO, &sndtimeo, sizeof(struct timeval));
-  }
-  #endif
-
-  #if defined(SO_NOSIGPIPE)  //BSD, OSX
-  s32 nosigpipe = 1;
-  if(ipv4()) setsockopt(fd4, SOL_SOCKET, SO_NOSIGPIPE, &nosigpipe, sizeof(s32));
-  if(ipv6()) setsockopt(fd6, SOL_SOCKET, SO_NOSIGPIPE, &nosigpipe, sizeof(s32));
-  #endif
-
-  #if defined(SO_REUSEADDR)  //BSD, Linux, OSX
-  s32 reuseaddr = 1;
-  if(ipv4()) setsockopt(fd4, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, sizeof(s32));
-  if(ipv6()) setsockopt(fd6, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, sizeof(s32));
-  #endif
-
-  #if defined(SO_REUSEPORT)  //BSD, OSX
-  s32 reuseport = 1;
-  if(ipv4()) setsockopt(fd4, SOL_SOCKET, SO_REUSEPORT, &reuseport, sizeof(s32));
-  if(ipv6()) setsockopt(fd6, SOL_SOCKET, SO_REUSEPORT, &reuseport, sizeof(s32));
-  #endif
-  }
-
-  addrin4.sin_family = AF_INET;
-  addrin4.sin_addr.s_addr = htonl(INADDR_ANY);
-  addrin4.sin_port = htons(port);
-
-  addrin6.sin6_family = AF_INET6;
-  addrin6.sin6_addr = in6addr_any;
-  addrin6.sin6_port = htons(port);
-
-  if(bind(fd4, (struct sockaddr*)&addrin4, sizeof(addrin4)) < 0 || listen(fd4, SOMAXCONN) < 0) ipv4_close();
-  if(bind(fd6, (struct sockaddr*)&addrin6, sizeof(addrin6)) < 0 || listen(fd6, SOMAXCONN) < 0) ipv6_close();
-  return ipv4() || ipv6();
-}
-
 inline auto Server::main(const function<Response (Request&)>& function) -> void {
   callback = function;
 }
@@ -105,122 +48,13 @@ inline auto Server::scan() -> string {
   return "idle";
 }
 
-inline auto Server::ipv4_scan() -> bool {
-  struct pollfd query = {0};
-  query.fd = fd4;
-  query.events = POLLIN;
-  poll(&query, 1, 0);
-
-  if(query.fd == fd4 && query.revents & POLLIN) {
-    ++connections;
-
-    thread::create([&](uintptr) {
-      thread::detach();
-
-      s32 clientfd = -1;
-      struct sockaddr_in settings = {0};
-      socklen_t socklen = sizeof(sockaddr_in);
-
-      clientfd = accept(fd4, (struct sockaddr*)&settings, &socklen);
-      if(clientfd < 0) return;
-
-      u32 ip = ntohl(settings.sin_addr.s_addr);
-
-      Request request;
-      request._ipv6 = false;
-      request._ip = {
-        (u8)(ip >> 24), ".",
-        (u8)(ip >> 16), ".",
-        (u8)(ip >>  8), ".",
-        (u8)(ip >>  0)
-      };
-
-      if(download(clientfd, request) && callback) {
-        auto response = callback(request);
-        upload(clientfd, response);
-      } else {
-        upload(clientfd, Response());  //"501 Not Implemented"
-      }
-
-      ::close(clientfd);
-      --connections;
-    }, 0, settings.threadStackSize);
-
-    return true;
-  }
-
-  return false;
-}
-
-inline auto Server::ipv6_scan() -> bool {
-  struct pollfd query = {0};
-  query.fd = fd6;
-  query.events = POLLIN;
-  poll(&query, 1, 0);
-
-  if(query.fd == fd6 && query.revents & POLLIN) {
-    ++connections;
-
-    thread::create([&](uintptr) {
-      thread::detach();
-
-      s32 clientfd = -1;
-      struct sockaddr_in6 settings = {0};
-      socklen_t socklen = sizeof(sockaddr_in6);
-
-      clientfd = accept(fd6, (struct sockaddr*)&settings, &socklen);
-      if(clientfd < 0) return;
-
-      u8* ip = settings.sin6_addr.s6_addr;
-      u16 ipSegment[8];
-      for(auto n : range(8)) ipSegment[n] = ip[n * 2 + 0] * 256 + ip[n * 2 + 1];
-
-      Request request;
-      request._ipv6 = true;
-      //RFC5952 IPv6 encoding: the first longest 2+ consecutive zero-sequence is compressed to "::"
-      s32 zeroOffset  = -1;
-      s32 zeroLength  =  0;
-      s32 zeroCounter =  0;
-      for(auto n : range(8)) {
-        u16 value = ipSegment[n];
-        if(value == 0) zeroCounter++;
-        if(zeroCounter > zeroLength) {
-          zeroLength = zeroCounter;
-          zeroOffset = 1 + n - zeroLength;
-        }
-        if(value != 0) zeroCounter = 0;
-      }
-      if(zeroLength == 1) zeroOffset = -1;
-      for(u32 n = 0; n < 8;) {
-        if(n == zeroOffset) {
-          request._ip.append(n == 0 ? "::" : ":");
-          n += zeroLength;
-        } else {
-          u16 value = ipSegment[n];
-          request._ip.append(hex(value), n++ != 7 ? ":" : "");
-        }
-      }
-
-      if(download(clientfd, request) && callback) {
-        auto response = callback(request);
-        upload(clientfd, response);
-      } else {
-        upload(clientfd, Response());  //"501 Not Implemented"
-      }
-
-      ::close(clientfd);
-      --connections;
-    }, 0, settings.threadStackSize);
-
-    return true;
-  }
-
-  return false;
-}
-
 inline auto Server::close() -> void {
   ipv4_close();
   ipv6_close();
 }
 
 }
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/http/server.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/inode.cpp b/waterbox/ares64/ares/nall/inode.cpp
new file mode 100644
index 0000000000..48bef1876a
--- /dev/null
+++ b/waterbox/ares64/ares/nall/inode.cpp
@@ -0,0 +1,15 @@
+#include <nall/inode.hpp>
+
+namespace nall {
+
+NALL_HEADER_INLINE auto inode::hidden(const string& name) -> bool {
+  #if defined(PLATFORM_WINDOWS)
+  auto attributes = GetFileAttributes(utf16_t(name));
+  return attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_HIDDEN);  //a failed lookup has every bit set and must not read as hidden
+  #else
+  //todo: is this really the best way to do this? stat doesn't have S_ISHIDDEN ...
+  return name.split("/").last().beginsWith(".");
+  #endif
+}
+
+}
diff --git a/waterbox/ares64/ares/nall/inode.hpp b/waterbox/ares64/ares/nall/inode.hpp
index 491fdd1612..11d3fa2c89 100644
--- a/waterbox/ares64/ares/nall/inode.hpp
+++ b/waterbox/ares64/ares/nall/inode.hpp
@@ -6,6 +6,22 @@
 #include <nall/platform.hpp>
 #include <nall/string.hpp>
 
+#if !defined(F_OK)
+  #define F_OK 0
+#endif
+
+#if !defined(X_OK)
+  #define X_OK 1
+#endif
+
+#if !defined(W_OK)
+  #define W_OK 2
+#endif
+
+#if !defined(R_OK)
+  #define R_OK 4
+#endif
+
 namespace nall {
 
 struct inode {
@@ -31,15 +47,7 @@ struct inode {
     return access(name, X_OK) == 0;
   }
 
-  static auto hidden(const string& name) -> bool {
-    #if defined(PLATFORM_WINDOWS)
-    auto attributes = GetFileAttributes(utf16_t(name));
-    return attributes & FILE_ATTRIBUTE_HIDDEN;
-    #else
-    //todo: is this really the best way to do this? stat doesn't have S_ISHIDDEN ...
-    return name.split("/").last().beginsWith(".");
-    #endif
-  }
+  static auto hidden(const string& name) -> bool;
 
   static auto mode(const string& name) -> u32 {
     struct stat data{};
@@ -161,3 +169,7 @@ struct inode {
 };
 
 }
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/inode.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/intrinsics.hpp b/waterbox/ares64/ares/nall/intrinsics.hpp
index 29c65da4ec..c3aa3d4871 100644
--- a/waterbox/ares64/ares/nall/intrinsics.hpp
+++ b/waterbox/ares64/ares/nall/intrinsics.hpp
@@ -1,5 +1,11 @@
 #pragma once
 
+#if defined(NALL_HEADER_ONLY)
+  #define NALL_HEADER_INLINE inline
+#else
+  #define NALL_HEADER_INLINE
+#endif
+
 #if defined(__APPLE__)
   #include <machine/endian.h>
 #elif defined(linux) || defined(__linux__)
@@ -19,6 +25,11 @@ namespace nall {
     static constexpr bool GCC       = 0;
     static constexpr bool Microsoft = 0;
   };
+  #pragma clang diagnostic error   "-Wc++20-extensions"
+  #pragma clang diagnostic error   "-Wgnu-case-range"
+  #pragma clang diagnostic error   "-Wgnu-statement-expression"
+  #pragma clang diagnostic error   "-Wvla"
+  #pragma clang diagnostic warning "-Wimplicit-fallthrough"
   #pragma clang diagnostic warning "-Wreturn-type"
   #pragma clang diagnostic ignored "-Wunused-result"
   #pragma clang diagnostic ignored "-Wunknown-pragmas"
@@ -28,7 +39,6 @@ namespace nall {
   #pragma clang diagnostic ignored "-Wswitch-bool"
   #pragma clang diagnostic ignored "-Wabsolute-value"
   #pragma clang diagnostic ignored "-Wtrigraphs"
-  #pragma clang diagnostic ignored "-Wnarrowing"
   #pragma clang diagnostic ignored "-Wattributes"
 #elif defined(__GNUC__)
   #define COMPILER_GCC
@@ -37,13 +47,14 @@ namespace nall {
     static constexpr bool GCC       = 1;
     static constexpr bool Microsoft = 0;
   };
+  #pragma GCC diagnostic error   "-Wvla"
+  #pragma GCC diagnostic warning "-Wimplicit-fallthrough"
   #pragma GCC diagnostic warning "-Wreturn-type"
   #pragma GCC diagnostic ignored "-Wunused-result"
   #pragma GCC diagnostic ignored "-Wunknown-pragmas"
   #pragma GCC diagnostic ignored "-Wpragmas"
   #pragma GCC diagnostic ignored "-Wswitch-bool"
   #pragma GCC diagnostic ignored "-Wtrigraphs"
-  #pragma GCC diagnostic ignored "-Wnarrowing"
   #pragma GCC diagnostic ignored "-Wattributes"
   #pragma GCC diagnostic ignored "-Wstringop-overflow"  //GCC 10.2 warning heuristic is buggy
 #elif defined(_MSC_VER)
@@ -180,7 +191,7 @@ namespace nall {
   };
 #elif defined(__amd64__) || defined(_M_AMD64)
   #define ARCHITECTURE_AMD64
-  #if defined(__SSE4_1__)
+  #if defined(__SSE4_1__) || defined(COMPILER_MICROSOFT)
     #define ARCHITECTURE_SUPPORTS_SSE4_1 1
   #endif
   struct Architecture {
@@ -191,9 +202,11 @@ namespace nall {
     static constexpr bool ppc64 = 0;
     static constexpr bool ppc32 = 0;
   };
-#elif defined(__aarch64__)
+#elif defined(__aarch64__) || defined(_M_ARM64)
   #define ARCHITECTURE_ARM64
-  #define ARCHITECTURE_SUPPORTS_SSE4_1 1 // simulated via sse2neon.h
+  #if !defined(COMPILER_MICROSOFT)
+    #define ARCHITECTURE_SUPPORTS_SSE4_1 1 // simulated via sse2neon.h
+  #endif
   struct Architecture {
     static constexpr bool x86   = 0;
     static constexpr bool amd64 = 0;
@@ -242,7 +255,7 @@ namespace nall {
 
 /* Endian detection */
 
-#if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || defined(__LITTLE_ENDIAN__) || defined(__i386__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_AMD64)
+#if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || defined(__LITTLE_ENDIAN__) || defined(__i386__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_AMD64) || defined(_M_ARM64)
   #define ENDIAN_LITTLE
   struct Endian {
     static constexpr bool Little = 1;
diff --git a/waterbox/ares64/ares/nall/main.cpp b/waterbox/ares64/ares/nall/main.cpp
new file mode 100644
index 0000000000..37b7a5ee30
--- /dev/null
+++ b/waterbox/ares64/ares/nall/main.cpp
@@ -0,0 +1,55 @@
+#include <nall/main.hpp>
+
+#if defined(PLATFORM_WINDOWS)
+  #include <objbase.h>
+  #include <shellapi.h>
+  #include <winsock2.h>
+#endif
+
+namespace nall {
+
+auto main(int argc, char** argv) -> int {
+  #if defined(PLATFORM_WINDOWS)
+  //bring up COM and WinSock 2.2, and put the three standard streams into binary mode
+  CoInitialize(0);
+  WSAData wsaData{0};
+  WSAStartup(MAKEWORD(2, 2), &wsaData);
+  _setmode(_fileno(stdin ), O_BINARY);
+  _setmode(_fileno(stdout), O_BINARY);
+  _setmode(_fileno(stderr), O_BINARY);
+  #endif
+  main(Arguments{argc, argv});  //hands off to nall::main(Arguments), declared in <nall/main.hpp>
+
+  #if !defined(PLATFORM_WINDOWS)
+  //anything typed while the program ran is still queued in stdin; if left there, the shell
+  //would try to execute it once we exit. drain the queue with non-blocking reads so nothing
+  //up to the last line feed survives. (even with setvbuf(_IONBF), the final line may still
+  //echo to the terminal.)
+  const auto input = fileno(stdin);
+  const auto original = fcntl(input, F_GETFL, 0);
+  fcntl(input, F_SETFL, original | O_NONBLOCK);  //read() must return instead of blocking when empty
+  char scratch[4096];
+  bool drained = false;
+  while(read(input, scratch, sizeof(scratch)) > 0) drained = true;
+  fcntl(input, F_SETFL, original);  //hand the terminal back with its original flags
+  if(drained) putchar('\r');  //so that PS1 is printed at the start of the line
+  #endif
+  return EXIT_SUCCESS;
+}
+
+}
+
+#if defined(PLATFORM_WINDOWS) && defined(SUBSYTEM_WINDOWS)  //NOTE(review): "SUBSYTEM" looks like a misspelling of "SUBSYSTEM"; confirm what the build defines before renaming
+
+auto WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, PSTR pCmdLine, int nCmdShow) -> int {
+  //arguments are retrieved later via GetCommandLineW()
+  return nall::main(0, nullptr);
+}
+
+#else
+
+auto main(int argc, char** argv) -> int {  //process entry point: forwards straight to nall::main
+  return nall::main(argc, argv);
+}
+
+#endif
diff --git a/waterbox/ares64/ares/nall/main.hpp b/waterbox/ares64/ares/nall/main.hpp
index 273e43f605..f50cc3753e 100644
--- a/waterbox/ares64/ares/nall/main.hpp
+++ b/waterbox/ares64/ares/nall/main.hpp
@@ -7,36 +7,9 @@
 namespace nall {
   auto main(Arguments arguments) -> void;
 
-  auto main(int argc, char** argv) -> int {
-    #if defined(PLATFORM_WINDOWS)
-    CoInitialize(0);
-    WSAData wsaData{0};
-    WSAStartup(MAKEWORD(2, 2), &wsaData);
-    _setmode(_fileno(stdin ), O_BINARY);
-    _setmode(_fileno(stdout), O_BINARY);
-    _setmode(_fileno(stderr), O_BINARY);
-    #endif
-
-    main(Arguments{argc, argv});
-
-    #if !defined(PLATFORM_WINDOWS)
-    //when a program is running, input on the terminal queues in stdin
-    //when terminating the program, the shell proceeds to try and execute all stdin data
-    //this is annoying behavior: this code tries to minimize the impact as much as it can
-    //we can flush all of stdin up to the last line feed, preventing spurious commands from executing
-    //however, even with setvbuf(_IONBF), we can't stop the last line from echoing to the terminal
-    auto flags = fcntl(fileno(stdin), F_GETFL, 0);
-    fcntl(fileno(stdin), F_SETFL, flags | O_NONBLOCK);  //don't allow read() to block when empty
-    char buffer[4096], data = false;
-    while(read(fileno(stdin), buffer, sizeof(buffer)) > 0) data = true;
-    fcntl(fileno(stdin), F_SETFL, flags);  //restore original flags for the terminal
-    if(data) putchar('\r');  //ensures PS1 is printed at the start of the line
-    #endif
-
-    return EXIT_SUCCESS;
-  }
+  auto main(int argc, char** argv) -> int;
 }
 
-auto main(int argc, char** argv) -> int {
-  return nall::main(argc, argv);
-}
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/main.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/memory.cpp b/waterbox/ares64/ares/nall/memory.cpp
new file mode 100644
index 0000000000..8479cff96b
--- /dev/null
+++ b/waterbox/ares64/ares/nall/memory.cpp
@@ -0,0 +1,47 @@
+#include <nall/memory.hpp>
+
+namespace nall::memory {
+
+NALL_HEADER_INLINE auto map(u32 size, bool executable) -> void* {  //read/write (optionally executable) region; nullptr on failure
+  #if defined(API_WINDOWS)
+  DWORD protect = executable ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE;
+  return VirtualAlloc(nullptr, size, MEM_RESERVE | MEM_COMMIT, protect);
+  #elif defined(API_POSIX)
+  int prot = PROT_READ | PROT_WRITE | (executable ? PROT_EXEC : 0);
+  int flags = MAP_ANON | MAP_PRIVATE;
+  #if defined(PLATFORM_MACOS)
+  if(executable) flags |= MAP_JIT;
+  #endif
+  void* address = mmap(nullptr, size, prot, flags, -1, 0);
+  //mmap reports failure as MAP_FAILED rather than nullptr: normalize so every branch fails the same way
+  if(address == MAP_FAILED) return nullptr;
+  return address;
+  #else
+  return nullptr;
+  #endif
+}
+
+NALL_HEADER_INLINE auto unmap(void* target, u32 size) -> void {  //releases a region; does nothing when neither API macro is defined
+  #if defined(API_WINDOWS)
+  VirtualFree(target, 0, MEM_RELEASE);  //size is not used on this path
+  #elif defined(API_POSIX)
+  munmap(target, size);
+  #endif
+}
+
+NALL_HEADER_INLINE auto protect(void* target, u32 size, bool executable) -> void {
+  //the region always stays readable and writable; executable only toggles the execute permission
+  #if defined(API_WINDOWS)
+  DWORD previous;  //VirtualProtect needs somewhere to store the old protection
+  DWORD desired = PAGE_READWRITE;
+  if(executable) desired = PAGE_EXECUTE_READWRITE;
+  VirtualProtect(target, size, desired, &previous);
+  #elif defined(API_POSIX)
+  int permissions = PROT_READ | PROT_WRITE;
+  if(executable) permissions |= PROT_EXEC;
+  int ret = mprotect(target, size, permissions);
+  assert(ret == 0);  //compiled out under NDEBUG; the mprotect call itself always runs
+  #endif
+}
+
+}
diff --git a/waterbox/ares64/ares/nall/memory.hpp b/waterbox/ares64/ares/nall/memory.hpp
index 01161f5389..c932702bae 100644
--- a/waterbox/ares64/ares/nall/memory.hpp
+++ b/waterbox/ares64/ares/nall/memory.hpp
@@ -195,47 +195,11 @@ template<u32 size, typename T> auto writem(void* target, T data) -> void {
   for(s32 n = size - 1; n >= 0; n--) *p++ = data >> n * 8;
 }
 
-inline auto map(u32 size, bool executable) -> void* {
-  #if defined(API_WINDOWS)
-  DWORD protect = executable ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE;
-  return VirtualAlloc(nullptr, size, MEM_RESERVE | MEM_COMMIT, protect);
-  #elif defined(API_POSIX)
-  int prot = PROT_READ | PROT_WRITE;
-  int flags = MAP_ANON | MAP_PRIVATE;
-  if(executable) {
-    prot |= PROT_EXEC;
-    #if defined(PLATFORM_MACOS)
-    flags |= MAP_JIT;
-    #endif
-  }
-  return mmap(nullptr, size, prot, flags, -1, 0);
-  #else
-  return nullptr;
-  #endif
-}
+auto map(u32 size, bool executable) -> void*;
 
-inline auto unmap(void* target, u32 size) -> void {
-  #if defined(API_WINDOWS)
-  VirtualFree(target, 0, MEM_RELEASE);
-  #elif defined(API_POSIX)
-  munmap(target, size);
-  #endif
-}
+auto unmap(void* target, u32 size) -> void;
 
-inline auto protect(void* target, u32 size, bool executable) -> void {
-  #if defined(API_WINDOWS)
-  DWORD protect = executable ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE;
-  DWORD oldProtect;
-  VirtualProtect(target, size, protect, &oldProtect);
-  #elif defined(API_POSIX)
-  int prot = PROT_READ | PROT_WRITE;
-  if(executable) {
-    prot |= PROT_EXEC;
-  }
-  int ret = mprotect(target, size, prot);
-  assert(ret == 0);
-  #endif
-}
+auto protect(void* target, u32 size, bool executable) -> void;
 
 inline auto jitprotect(bool executable) -> void {
   #if defined(PLATFORM_MACOS)
@@ -246,3 +210,7 @@ inline auto jitprotect(bool executable) -> void {
 }
 
 }
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/memory.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/nall.cpp b/waterbox/ares64/ares/nall/nall.cpp
new file mode 100644
index 0000000000..223dde853c
--- /dev/null
+++ b/waterbox/ares64/ares/nall/nall.cpp
@@ -0,0 +1,33 @@
+#if !defined(NALL_HEADER_ONLY)
+
+#include <nall/intrinsics.hpp>
+
+#if defined(PLATFORM_WINDOWS)
+  #include <nall/windows/windows.hpp>
+#endif
+
+#include <nall/directory.cpp>
+#include <nall/dl.cpp>
+#include <nall/file-map.cpp>
+#include <nall/inode.cpp>
+//#include <nall/main.cpp>
+#include <nall/memory.cpp>
+#include <nall/path.cpp>
+#include <nall/platform.cpp>
+#include <nall/random.cpp>
+#include <nall/run.cpp>
+#include <nall/terminal.cpp>
+#include <nall/thread.cpp>
+//currently unused by ares
+//#include <nall/smtp.cpp>
+//#include <nall/http/client.cpp>
+//#include <nall/http/server.cpp>
+#if defined(PLATFORM_WINDOWS)
+  //#include <nall/windows/detour.cpp>
+  //#include <nall/windows/guid.cpp>
+  //#include <nall/windows/launcher.cpp>
+  #include <nall/windows/registry.cpp>
+  #include <nall/windows/utf8.cpp>
+#endif
+
+#endif
diff --git a/waterbox/ares64/ares/nall/path.cpp b/waterbox/ares64/ares/nall/path.cpp
new file mode 100644
index 0000000000..02a12b9f30
--- /dev/null
+++ b/waterbox/ares64/ares/nall/path.cpp
@@ -0,0 +1,144 @@
+#include <nall/path.hpp>
+
+#if defined(PLATFORM_WINDOWS)
+  #include <shlobj.h>
+#endif
+
+namespace nall::Path {
+
+NALL_HEADER_INLINE auto program() -> string {
+  #if defined(PLATFORM_WINDOWS)
+  wchar_t path[PATH_MAX] = L"";
+  GetModuleFileNameW(nullptr, path, PATH_MAX);  //explicit wide variant: the buffer is wchar_t, like the other lookups in this file
+  string result = (const char*)utf8_t(path);
+  result.transform("\\", "/");
+  return Path::real(result);
+  #else
+  Dl_info info{};
+  if(!dladdr((void*)&program, &info) || !info.dli_fname) return string{};  //lookup failed: there is no file name to resolve
+  return Path::real(info.dli_fname);
+  #endif
+}
+
+NALL_HEADER_INLINE auto root() -> string {
+  #if !defined(PLATFORM_WINDOWS)
+  return "/";
+  #else
+  wchar_t folder[PATH_MAX] = L"";
+  SHGetFolderPathW(nullptr, CSIDL_WINDOWS | CSIDL_FLAG_CREATE, nullptr, 0, folder);
+  string location = (const char*)utf8_t(folder);
+  location.transform("\\", "/");
+  return slice(location, 0, 3);  //keep only the leading drive part of the Windows directory, e.g. "c:/"
+  #endif
+}
+
+NALL_HEADER_INLINE auto user() -> string {
+  #if defined(PLATFORM_WINDOWS)
+  wchar_t path[PATH_MAX] = L"";
+  SHGetFolderPathW(nullptr, CSIDL_PROFILE | CSIDL_FLAG_CREATE, nullptr, 0, path);
+  string result = (const char*)utf8_t(path);
+  result.transform("\\", "/");
+  #else
+  struct passwd* userinfo = getpwuid(getuid());  //null when the uid has no passwd entry
+  string result = userinfo && userinfo->pw_dir ? userinfo->pw_dir : "";  //never dereference a failed lookup
+  #endif
+  if(!result) result = ".";  //no home directory known: fall back to the current directory
+  if(!result.endsWith("/")) result.append("/");
+  return result;
+}
+
+NALL_HEADER_INLINE auto desktop(string_view name) -> string {
+  //step 1: locate the desktop directory for this platform
+  #if defined(PLATFORM_WINDOWS)
+  wchar_t folder[PATH_MAX] = L"";
+  SHGetFolderPathW(nullptr, CSIDL_DESKTOP | CSIDL_FLAG_CREATE, nullptr, 0, folder);
+  string result = (const char*)utf8_t(folder);
+  result.transform("\\", "/");
+  #elif defined(PLATFORM_MACOS)
+  string result = {user(), "Desktop/"};
+  #else
+  string result;
+  const char* configured = getenv("XDG_DESKTOP_DIR");  //takes precedence over Desktop/ under user()
+  if(!configured) result = {user(), "Desktop/"};
+  else result = string(configured);
+  #endif
+  //step 2: make sure the directory is non-empty and slash-terminated, then attach the requested name
+  if(!result) result = ".";
+  if(!result.endsWith("/")) result.append("/");
+  return result.append(name);
+}
+
+NALL_HEADER_INLINE auto userSettings() -> string {
+  //pick the per-user configuration directory, then normalize it to end in a slash
+  #if defined(PLATFORM_WINDOWS)
+  wchar_t folder[PATH_MAX] = L"";
+  SHGetFolderPathW(nullptr, CSIDL_APPDATA | CSIDL_FLAG_CREATE, nullptr, 0, folder);
+  string settings = (const char*)utf8_t(folder);
+  settings.transform("\\", "/");
+  #elif defined(PLATFORM_MACOS)
+  string settings = {Path::user(), "Library/Application Support/"};
+  #else
+  string settings;
+  const char* configured = getenv("XDG_CONFIG_HOME");  //takes precedence over .config/ under Path::user()
+  if(!configured) settings = {Path::user(), ".config/"};
+  else settings = string(configured);
+  #endif
+  //an empty result falls back to the current directory
+  if(!settings) settings = ".";
+  if(!settings.endsWith("/")) settings.append("/");
+  return settings;
+}
+
+NALL_HEADER_INLINE auto userData() -> string {
+  //pick the per-user data directory, then normalize it to end in a slash
+  #if defined(PLATFORM_WINDOWS)
+  wchar_t folder[PATH_MAX] = L"";
+  SHGetFolderPathW(nullptr, CSIDL_LOCAL_APPDATA | CSIDL_FLAG_CREATE, nullptr, 0, folder);
+  string data = (const char*)utf8_t(folder);
+  data.transform("\\", "/");
+  #elif defined(PLATFORM_MACOS)
+  string data = {Path::user(), "Library/Application Support/"};
+  #else
+  string data;
+  const char* configured = getenv("XDG_DATA_HOME");  //takes precedence over .local/share/ under Path::user()
+  if(!configured) data = {Path::user(), ".local/share/"};
+  else data = string(configured);
+  #endif
+  //an empty result falls back to the current directory
+  if(!data) data = ".";
+  if(!data.endsWith("/")) data.append("/");
+  return data;
+}
+
+NALL_HEADER_INLINE auto sharedData() -> string {
+  #if defined(PLATFORM_WINDOWS)
+  wchar_t folder[PATH_MAX] = L"";
+  SHGetFolderPathW(nullptr, CSIDL_COMMON_APPDATA | CSIDL_FLAG_CREATE, nullptr, 0, folder);
+  string location = (const char*)utf8_t(folder);
+  location.transform("\\", "/");
+  #elif defined(PLATFORM_MACOS)
+  string location = "/Library/Application Support/";
+  #else
+  string location = "/usr/share/";
+  #endif
+  if(!location) location = ".";  //the other branches start from non-empty literals
+  if(!location.endsWith("/")) location.append("/");
+  return location;
+}
+
+NALL_HEADER_INLINE auto temporary() -> string {
+  #if defined(PLATFORM_WINDOWS)
+  wchar_t folder[PATH_MAX] = L"";
+  GetTempPathW(PATH_MAX, folder);
+  string scratch = (const char*)utf8_t(folder);
+  scratch.transform("\\", "/");
+  #elif defined(P_tmpdir)
+  string scratch = P_tmpdir;  //use the C library's temporary directory macro when it exists
+  #else
+  string scratch = "/tmp/";
+  #endif
+  if(!scratch.endsWith("/")) scratch.append("/");  //callers always receive a slash-terminated directory
+  return scratch;
+}
+
+}
diff --git a/waterbox/ares64/ares/nall/path.hpp b/waterbox/ares64/ares/nall/path.hpp
index f63eed2dd2..b9b602a68d 100644
--- a/waterbox/ares64/ares/nall/path.hpp
+++ b/waterbox/ares64/ares/nall/path.hpp
@@ -24,73 +24,19 @@ inline auto real(string_view name) -> string {
   return result;
 }
 
-inline auto program() -> string {
-  #if defined(PLATFORM_WINDOWS)
-  wchar_t path[PATH_MAX] = L"";
-  GetModuleFileName(nullptr, path, PATH_MAX);
-  string result = (const char*)utf8_t(path);
-  result.transform("\\", "/");
-  return Path::real(result);
-  #else
-  Dl_info info;
-  dladdr((void*)&program, &info);
-  return Path::real(info.dli_fname);
-  #endif
-}
+auto program() -> string;
 
 // /
 // c:/
-inline auto root() -> string {
-  #if defined(PLATFORM_WINDOWS)
-  wchar_t path[PATH_MAX] = L"";
-  SHGetFolderPathW(nullptr, CSIDL_WINDOWS | CSIDL_FLAG_CREATE, nullptr, 0, path);
-  string result = (const char*)utf8_t(path);
-  result.transform("\\", "/");
-  return slice(result, 0, 3);
-  #else
-  return "/";
-  #endif
-}
+auto root() -> string;
 
 // /home/username/
 // c:/users/username/
-inline auto user() -> string {
-  #if defined(PLATFORM_WINDOWS)
-  wchar_t path[PATH_MAX] = L"";
-  SHGetFolderPathW(nullptr, CSIDL_PROFILE | CSIDL_FLAG_CREATE, nullptr, 0, path);
-  string result = (const char*)utf8_t(path);
-  result.transform("\\", "/");
-  #else
-  struct passwd* userinfo = getpwuid(getuid());
-  string result = userinfo->pw_dir;
-  #endif
-  if(!result) result = ".";
-  if(!result.endsWith("/")) result.append("/");
-  return result;
-}
+auto user() -> string;
 
 // /home/username/Desktop/
 // c:/users/username/Desktop/
-inline auto desktop(string_view name = {}) -> string {
-  #if defined(PLATFORM_WINDOWS)
-  wchar_t path[PATH_MAX] = L"";
-  SHGetFolderPathW(nullptr, CSIDL_DESKTOP | CSIDL_FLAG_CREATE, nullptr, 0, path);
-  string result = (const char*)utf8_t(path);
-  result.transform("\\", "/");
-  #elif defined(PLATFORM_MACOS)
-  string result = {user(), "Desktop/"};
-  #else
-  string result;
-  if(const char *env = getenv("XDG_DESKTOP_DIR")) {
-    result = string(env);
-  } else {
-    result = {user(), "Desktop/"};
-  }
-  #endif
-  if(!result) result = ".";
-  if(!result.endsWith("/")) result.append("/");
-  return result.append(name);
-}
+auto desktop(string_view name = {}) -> string;
 
 //todo: MacOS uses the same location for userData() and userSettings()
 //... is there a better option here?
@@ -98,85 +44,24 @@ inline auto desktop(string_view name = {}) -> string {
 // /home/username/.config/
 // ~/Library/Application Support/
 // c:/users/username/appdata/roaming/
-inline auto userSettings() -> string {
-  #if defined(PLATFORM_WINDOWS)
-  wchar_t path[PATH_MAX] = L"";
-  SHGetFolderPathW(nullptr, CSIDL_APPDATA | CSIDL_FLAG_CREATE, nullptr, 0, path);
-  string result = (const char*)utf8_t(path);
-  result.transform("\\", "/");
-  #elif defined(PLATFORM_MACOS)
-  string result = {Path::user(), "Library/Application Support/"};
-  #else
-  string result;
-  if(const char *env = getenv("XDG_CONFIG_HOME")) {
-    result = string(env);
-  } else {
-    result = {Path::user(), ".config/"};
-  }
-  #endif
-  if(!result) result = ".";
-  if(!result.endsWith("/")) result.append("/");
-  return result;
-}
+auto userSettings() -> string;
 
 // /home/username/.local/share/
 // ~/Library/Application Support/
 // c:/users/username/appdata/local/
-inline auto userData() -> string {
-  #if defined(PLATFORM_WINDOWS)
-  wchar_t path[PATH_MAX] = L"";
-  SHGetFolderPathW(nullptr, CSIDL_LOCAL_APPDATA | CSIDL_FLAG_CREATE, nullptr, 0, path);
-  string result = (const char*)utf8_t(path);
-  result.transform("\\", "/");
-  #elif defined(PLATFORM_MACOS)
-  string result = {Path::user(), "Library/Application Support/"};
-  #else
-  string result;
-  if(const char* env = getenv("XDG_DATA_HOME")) {
-    result = string(env);
-  } else {
-    result = {Path::user(), ".local/share/"};
-  }
-  #endif
-  if(!result) result = ".";
-  if(!result.endsWith("/")) result.append("/");
-  return result;
-}
+auto userData() -> string;
 
 // /usr/share
 // /Library/Application Support/
 // c:/ProgramData/
-inline auto sharedData() -> string {
-  #if defined(PLATFORM_WINDOWS)
-  wchar_t path[PATH_MAX] = L"";
-  SHGetFolderPathW(nullptr, CSIDL_COMMON_APPDATA | CSIDL_FLAG_CREATE, nullptr, 0, path);
-  string result = (const char*)utf8_t(path);
-  result.transform("\\", "/");
-  #elif defined(PLATFORM_MACOS)
-  string result = "/Library/Application Support/";
-  #else
-  string result = "/usr/share/";
-  #endif
-  if(!result) result = ".";
-  if(!result.endsWith("/")) result.append("/");
-  return result;
-}
+auto sharedData() -> string;
 
 // /tmp
 // c:/users/username/AppData/Local/Temp/
-inline auto temporary() -> string {
-  #if defined(PLATFORM_WINDOWS)
-  wchar_t path[PATH_MAX] = L"";
-  GetTempPathW(PATH_MAX, path);
-  string result = (const char*)utf8_t(path);
-  result.transform("\\", "/");
-  #elif defined(P_tmpdir)
-  string result = P_tmpdir;
-  #else
-  string result = "/tmp/";
-  #endif
-  if(!result.endsWith("/")) result.append("/");
-  return result;
-}
+auto temporary() -> string;
 
 }
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/path.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/platform.cpp b/waterbox/ares64/ares/nall/platform.cpp
new file mode 100644
index 0000000000..0c21aa283f
--- /dev/null
+++ b/waterbox/ares64/ares/nall/platform.cpp
@@ -0,0 +1,33 @@
+#include <nall/platform.hpp>
+
+#if defined(PLATFORM_WINDOWS)
+
+#include <winsock2.h>
+
+NALL_HEADER_INLINE auto poll(struct pollfd fds[], unsigned long nfds, int timeout) -> int { return WSAPoll(fds, nfds, timeout); }  //global-scope shim: forwards poll() to Winsock's WSAPoll with the arguments unchanged
+
+namespace nall {  //Windows-only wrappers; this whole section is inside #if defined(PLATFORM_WINDOWS)
+
+NALL_HEADER_INLINE auto recv(int socket, void* buffer, size_t length, int flags) -> ssize_t {
+  return ::recv(socket, (char*)buffer, length, flags);  //forwards to the global recv, casting the void* buffer to char*
+}
+
+NALL_HEADER_INLINE auto send(int socket, const void* buffer, size_t length, int flags) -> ssize_t {
+  return ::send(socket, (const char*)buffer, length, flags);  //forwards to the global send, casting the buffer to const char*
+}
+
+NALL_HEADER_INLINE auto setsockopt(int socket, int level, int option_name, const void* option_value, int option_len) -> int {
+  return ::setsockopt(socket, level, option_name, (const char*)option_value, option_len);  //forwards to the global setsockopt, casting the value to const char*
+}
+
+NALL_HEADER_INLINE auto usleep(unsigned int us) -> int {  //takes microseconds, implemented with Win32 Sleep
+  if(us != 0) {
+    Sleep(us / 1000);  //integer division to whole milliseconds: requests below 1000us become Sleep(0)
+  }
+
+  return 0;  //always reports success
+}
+
+}
+
+#endif
diff --git a/waterbox/ares64/ares/nall/platform.hpp b/waterbox/ares64/ares/nall/platform.hpp
index 9b7b0b035a..5fabcb2aed 100644
--- a/waterbox/ares64/ares/nall/platform.hpp
+++ b/waterbox/ares64/ares/nall/platform.hpp
@@ -9,18 +9,13 @@ namespace Math {
 }
 
 #if defined(PLATFORM_WINDOWS)
-  #include <nall/windows/guard.hpp>
-  #include <initguid.h>
-  #include <cguid.h>
-  #include <winsock2.h>
-  #include <ws2tcpip.h>
-  #include <windows.h>
+  #if defined(NALL_HEADER_ONLY)
+    #include <nall/windows/windows.hpp>
+  #endif
   #include <direct.h>
   #include <io.h>
   #include <wchar.h>
-  #include <shlobj.h>
-  #include <shellapi.h>
-  #include <nall/windows/guard.hpp>
+  #include <sys/utime.h>
   #include <nall/windows/utf8.hpp>
 #endif
 
@@ -39,9 +34,7 @@ namespace Math {
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
-#include <utime.h>
 #include <fcntl.h>
-#include <unistd.h>
 
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -49,6 +42,7 @@ namespace Math {
 #if !defined(PLATFORM_WINDOWS)
   #include <dlfcn.h>
   #include <unistd.h>
+  #include <utime.h>
   #include <pwd.h>
   #include <grp.h>
   #include <sys/mman.h>
@@ -68,25 +62,21 @@ namespace Math {
   #define __has_builtin(x) 0
 #endif
 
-#if defined(COMPILER_MICROSOFT)
-  #define va_copy(dest, src) ((dest) = (src))
-#endif
-
 #if defined(PLATFORM_WINDOWS)
-  #undef  IN
-  #undef  OUT
-  #undef  interface
   #define dllexport __declspec(dllexport)
   #define MSG_NOSIGNAL 0
+  #define PATH_MAX 260
 
-  extern "C" {
-    using pollfd = WSAPOLLFD;
-  }
+  #if !defined(INVALID_HANDLE_VALUE)
+    #define INVALID_HANDLE_VALUE ((HANDLE)-1)
+  #endif
+
+  typedef void* HANDLE;
 
   inline auto access(const char* path, int amode) -> int { return _waccess(nall::utf16_t(path), amode); }
   inline auto getcwd(char* buf, size_t size) -> char* { wchar_t wpath[PATH_MAX] = L""; if(!_wgetcwd(wpath, size)) return nullptr; strcpy(buf, nall::utf8_t(wpath)); return buf; }
   inline auto mkdir(const char* path, int mode) -> int { return _wmkdir(nall::utf16_t(path)); }
-  inline auto poll(struct pollfd fds[], unsigned long nfds, int timeout) -> int { return WSAPoll(fds, nfds, timeout); }
+  auto poll(struct pollfd fds[], unsigned long nfds, int timeout) -> int;  //defined in nall/platform.cpp; declared without inline like the other out-of-line wrappers below
   inline auto putenv(const char* value) -> int { return _wputenv(nall::utf16_t(value)); }
   inline auto realpath(const char* file_name, char* resolved_name) -> char* { wchar_t wfile_name[PATH_MAX] = L""; if(!_wfullpath(wfile_name, nall::utf16_t(file_name), PATH_MAX)) return nullptr; strcpy(resolved_name, nall::utf8_t(wfile_name)); return resolved_name; }
   inline auto rename(const char* oldname, const char* newname) -> int { return _wrename(nall::utf16_t(oldname), nall::utf16_t(newname)); }
@@ -94,17 +84,11 @@ namespace Math {
   namespace nall {
     //network functions take void*, not char*. this allows them to be used without casting
 
-    inline auto recv(int socket, void* buffer, size_t length, int flags) -> ssize_t {
-      return ::recv(socket, (char*)buffer, length, flags);
-    }
+    auto recv(int socket, void* buffer, size_t length, int flags) -> ssize_t;
+    auto send(int socket, const void* buffer, size_t length, int flags) -> ssize_t;
+    auto setsockopt(int socket, int level, int option_name, const void* option_value, int option_len) -> int;
 
-    inline auto send(int socket, const void* buffer, size_t length, int flags) -> ssize_t {
-      return ::send(socket, (const char*)buffer, length, flags);
-    }
-
-    inline auto setsockopt(int socket, int level, int option_name, const void* option_value, socklen_t option_len) -> int {
-      return ::setsockopt(socket, level, option_name, (const char*)option_value, option_len);
-    }
+    auto usleep(unsigned int us) -> int;
   }
 #else
   #define dllexport
@@ -132,6 +116,12 @@ namespace Math {
   #endif
   #define likely(expression) __builtin_expect(bool(expression), true)
   #define unlikely(expression) __builtin_expect(bool(expression), false)
+#elif defined(COMPILER_MICROSOFT)
+  #define bswap16(value) _byteswap_ushort(value)
+  #define bswap32(value) _byteswap_ulong(value)
+  #define bswap64(value) _byteswap_uint64(value)
+  #define likely(expression) expression
+  #define unlikely(expression) expression
 #else
   inline auto bswap16(u16 value) -> u16 {
     return value << 8 | value >> 8;
@@ -151,16 +141,21 @@ namespace Math {
   #define unlikely(expression) expression
 #endif
 
-//notify the processor/operating system that this thread is currently awaiting an event (eg a spinloop)
-//calling this function aims to avoid consuming 100% CPU resources on the active thread during spinloops
-inline auto spinloop() -> void {
-  #if defined(COMPILER_CLANG) || defined(COMPILER_GCC)
+namespace nall {
+  //notify the processor/operating system that this thread is currently awaiting an event (eg a spinloop)
+  //calling this function aims to avoid consuming 100% CPU resources on the active thread during spinloops
+  inline auto spinloop() -> void {
     #if defined(ARCHITECTURE_X86) || defined(ARCHITECTURE_AMD64)
-      __builtin_ia32_pause();
-      return;
+      #if defined(COMPILER_CLANG) || defined(COMPILER_GCC)
+        __builtin_ia32_pause();  //compiler builtin for the x86 pause hint
+        return;  //skip the usleep fallback below
+      #elif defined(COMPILER_MICROSOFT)
+        _mm_pause();  //MSVC intrinsic used in place of the GCC/Clang builtin
+        return;  //skip the usleep fallback below
+      #endif
     #endif
-  #endif
-  usleep(1);
+    usleep(1);  //fallback for other architectures/compilers; on Windows this resolves to nall::usleep declared earlier in this header
+  }
 }
 
 #if defined(PLATFORM_MACOS) && !defined(MSG_NOSIGNAL)
@@ -184,6 +179,8 @@ inline auto spinloop() -> void {
 //P0627: [[unreachable]] -- impossible to simulate with identical syntax, must omit brackets ...
 #if defined(COMPILER_CLANG) || defined(COMPILER_GCC)
   #define unreachable __builtin_unreachable()
+#elif defined(COMPILER_MICROSOFT)
+  #define unreachable __assume(0)
 #else
   #define unreachable throw
 #endif
@@ -194,3 +191,7 @@ inline auto spinloop() -> void {
 
 #define export $export
 #define register $register
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/platform.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/primitives/bit-field.hpp b/waterbox/ares64/ares/nall/primitives/bit-field.hpp
index d2bd0d667d..ffbf3e51ed 100644
--- a/waterbox/ares64/ares/nall/primitives/bit-field.hpp
+++ b/waterbox/ares64/ares/nall/primitives/bit-field.hpp
@@ -18,6 +18,7 @@ template<s32 Precision, s32 Index> struct BitField<Precision, Index> {
   enum : type { mask = 1ull << shift };
 
   BitField(const BitField&) = delete;
+  BitField(BitField&&) = default;
 
   auto& operator=(const BitField& source) {
     target = target & ~mask | (bool)source << shift;
@@ -72,6 +73,7 @@ template<s32 Precision> struct BitField<Precision> {
     void>>>>;
 
   BitField(const BitField&) = delete;
+  BitField(BitField&&) = default;
 
   auto& operator=(const BitField& source) {
     target = target & ~mask | (bool)source << shift;
diff --git a/waterbox/ares64/ares/nall/primitives/bit-range.hpp b/waterbox/ares64/ares/nall/primitives/bit-range.hpp
index ea9345c07c..ae979113ad 100644
--- a/waterbox/ares64/ares/nall/primitives/bit-range.hpp
+++ b/waterbox/ares64/ares/nall/primitives/bit-range.hpp
@@ -20,6 +20,7 @@ template<s32 Precision, s32 Lo, s32 Hi> struct BitRange {
   enum : u32 { shift = lo };
 
   BitRange(const BitRange& source) = delete;
+  BitRange(BitRange&& source) = default;
 
   auto& operator=(const BitRange& source) {
     target = target & ~mask | ((source.target & source.mask) >> source.shift) << shift & mask;
@@ -138,6 +139,7 @@ template<typename Type, s32 Precision = Type::bits()> struct DynamicBitRange {
     void>>>>;
 
   DynamicBitRange(const DynamicBitRange& source) = delete;
+  DynamicBitRange(DynamicBitRange&& source) = default;
 
   auto& operator=(const DynamicBitRange& source) {
     target = target & ~mask | ((source.target & source.mask) >> source.shift) << shift & mask;
diff --git a/waterbox/ares64/ares/nall/primitives/literals.hpp b/waterbox/ares64/ares/nall/primitives/literals.hpp
index 0e85233876..54df171039 100644
--- a/waterbox/ares64/ares/nall/primitives/literals.hpp
+++ b/waterbox/ares64/ares/nall/primitives/literals.hpp
@@ -1,6 +1,6 @@
 #pragma once
 
-namespace nall {
+namespace nall::primitives {
 
 inline auto operator"" _b(unsigned long long value) { return boolean{value}; }
 inline auto operator"" _n(unsigned long long value) { return natural{value}; }
diff --git a/waterbox/ares64/ares/nall/primitives/types.hpp b/waterbox/ares64/ares/nall/primitives/types.hpp
index 74ca6a6ab3..33e3143804 100644
--- a/waterbox/ares64/ares/nall/primitives/types.hpp
+++ b/waterbox/ares64/ares/nall/primitives/types.hpp
@@ -1,6 +1,6 @@
 #pragma once
 
-namespace nall {
+namespace nall::primitives {
   using boolean = Boolean;
   using natural = Natural<>;
   using integer = Integer<>;
diff --git a/waterbox/ares64/ares/nall/priority-queue.hpp b/waterbox/ares64/ares/nall/priority-queue.hpp
index 4b96079553..4afd9cfe6b 100644
--- a/waterbox/ares64/ares/nall/priority-queue.hpp
+++ b/waterbox/ares64/ares/nall/priority-queue.hpp
@@ -82,10 +82,15 @@ struct priority_queue<T[Size]> {
     return nothing;
   }
 
-  auto remove(const T& event) -> void {
+  auto remove(const T& event) -> u32 {  //invalidates every queued entry equal to event; returns the largest (entry clock - current clock) among them, or 0 if none matched
+    u32 cycles = 0;  //running maximum over all matching entries
     for(u32 i = 0; i < size; i++) {
-      if(heap[i].event == event) heap[i].valid = false;
+      if(heap[i].event == event) {
+        heap[i].valid = false;  //entries are only flagged here, not erased from the heap
+        cycles = max(cycles, heap[i].clock - clock);  //valid is not tested, so matches that were already invalid still contribute
+      }
     }
+    return cycles;
   }
 
   auto serialize(serializer& s) -> void {
diff --git a/waterbox/ares64/ares/nall/random.cpp b/waterbox/ares64/ares/nall/random.cpp
new file mode 100644
index 0000000000..02ef580cf8
--- /dev/null
+++ b/waterbox/ares64/ares/nall/random.cpp
@@ -0,0 +1,40 @@
+#include <nall/random.hpp>
+
+#if defined(PLATFORM_LINUX) && __has_include(<sys/random.h>)
+  #include <sys/random.h>
+#elif defined(PLATFORM_ANDROID) && __has_include(<sys/syscall.h>)
+  #include <sys/syscall.h>
+#elif defined(PLATFORM_WINDOWS) && __has_include(<wincrypt.h>)
+  #include <wincrypt.h>
+#else
+  #include <stdio.h>
+#endif
+
+namespace nall {
+
+NALL_HEADER_INLINE auto RNGBase::randomSeed() -> u256 {  //gathers 32 bytes of seed material from a platform-specific source
+  u256 seed = 0;  //remains the result if the platform source below writes nothing
+  #if defined(PLATFORM_BSD) || defined(PLATFORM_MACOS)
+  for(u32 n : range(8)) seed = seed << 32 | (u32)arc4random();  //8 x 32 bits = 256 bits
+  #elif defined(PLATFORM_LINUX) && __has_include(<sys/random.h>)
+  getrandom(&seed, 32, GRND_NONBLOCK);  //return value is not checked
+  #elif defined(PLATFORM_ANDROID) && __has_include(<sys/syscall.h>)
+  syscall(__NR_getrandom, &seed, 32, 0x0001);  //GRND_NONBLOCK
+  #elif defined(PLATFORM_WINDOWS) && __has_include(<wincrypt.h>)
+  HCRYPTPROV provider;
+  if(CryptAcquireContext(&provider, nullptr, MS_STRONG_PROV, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)) {
+    CryptGenRandom(provider, 32, (BYTE*)&seed);  //return value is not checked
+    CryptReleaseContext(provider, 0);
+  }
+  #else
+  srand(time(nullptr));
+  for(u32 n : range(32)) seed = seed << 8 | (u8)rand();  //time-seeded fallback value, overwritten below when /dev/urandom can be opened
+  if(auto fp = fopen("/dev/urandom", "rb")) {
+    fread(&seed, 32, 1, fp);  //return value is not checked
+    fclose(fp);
+  }
+  #endif
+  return seed;
+}
+
+}
diff --git a/waterbox/ares64/ares/nall/random.hpp b/waterbox/ares64/ares/nall/random.hpp
index d93532f6ee..c8114ba285 100644
--- a/waterbox/ares64/ares/nall/random.hpp
+++ b/waterbox/ares64/ares/nall/random.hpp
@@ -9,19 +9,14 @@
 #include <nall/cipher/chacha20.hpp>
 #endif
 
-#if defined(PLATFORM_LINUX) && __has_include(<sys/random.h>)
-  #include <sys/random.h>
-#elif defined(PLATFORM_ANDROID) && __has_include(<sys/syscall.h>)
-  #include <sys/syscall.h>
-#elif defined(PLATFORM_WINDOWS) && __has_include(<wincrypt.h>)
-  #include <wincrypt.h>
-#else
-  #include <stdio.h>
-#endif
-
 namespace nall {
 
-template<typename Base> struct RNG {
+struct RNGBase {  //non-template base of RNG<Base>; holds the seed helper so it can live outside the template
+protected:
+  auto randomSeed() -> u256;  //defined in nall/random.cpp
+};
+
+template<typename Base> struct RNG : RNGBase {
   template<typename T = u64> auto random() -> T {
     u64 value = 0;
     for(u32 n : range((sizeof(T) + 3) / 4)) {
@@ -37,32 +32,6 @@ template<typename Base> struct RNG {
       if(value >= threshold) return value % range;
     }
   }
-
-protected:
-  auto randomSeed() -> u256 {
-    u256 seed = 0;
-    #if defined(PLATFORM_BSD) || defined(PLATFORM_MACOS)
-    for(u32 n : range(8)) seed = seed << 32 | (u32)arc4random();
-    #elif defined(PLATFORM_LINUX) && __has_include(<sys/random.h>)
-    getrandom(&seed, 32, GRND_NONBLOCK);
-    #elif defined(PLATFORM_ANDROID) && __has_include(<sys/syscall.h>)
-    syscall(__NR_getrandom, &seed, 32, 0x0001);  //GRND_NONBLOCK
-    #elif defined(PLATFORM_WINDOWS) && __has_include(<wincrypt.h>)
-    HCRYPTPROV provider;
-    if(CryptAcquireContext(&provider, nullptr, MS_STRONG_PROV, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)) {
-      CryptGenRandom(provider, 32, (BYTE*)&seed);
-      CryptReleaseContext(provider, 0);
-    }
-    #else
-    srand(time(nullptr));
-    for(u32 n : range(32)) seed = seed << 8 | (u8)rand();
-    if(auto fp = fopen("/dev/urandom", "rb")) {
-      fread(&seed, 32, 1, fp);
-      fclose(fp);
-    }
-    #endif
-    return seed;
-  }
 };
 
 namespace PRNG {
@@ -170,3 +139,7 @@ template<typename T = u64> inline auto random() -> T {
 }
 
 }
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/random.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/recompiler/amd64/encoder-instructions.hpp b/waterbox/ares64/ares/nall/recompiler/amd64/encoder-instructions.hpp
index 9985c4112c..54aecf3804 100644
--- a/waterbox/ares64/ares/nall/recompiler/amd64/encoder-instructions.hpp
+++ b/waterbox/ares64/ares/nall/recompiler/amd64/encoder-instructions.hpp
@@ -150,13 +150,13 @@
     if(ds.reg == rsp || ds.reg == r12) emit.sib(0, 4, 4);
   alwaysinline auto adc(reg8 rt, dis ds) { op(0x12); }
   alwaysinline auto add(reg8 rt, dis ds) { op(0x02); }
-  alwaysinline auto and(reg8 rt, dis ds) { op(0x22); }
+  alwaysinline auto and_(reg8 rt, dis ds) { op(0x22); }
   alwaysinline auto cmp(reg8 rt, dis ds) { op(0x3a); }
   alwaysinline auto mov(reg8 rt, dis ds) { op(0x8a); }
-  alwaysinline auto or (reg8 rt, dis ds) { op(0x0a); }
+  alwaysinline auto or_(reg8 rt, dis ds) { op(0x0a); }
   alwaysinline auto sbb(reg8 rt, dis ds) { op(0x1a); }
   alwaysinline auto sub(reg8 rt, dis ds) { op(0x2a); }
-  alwaysinline auto xor(reg8 rt, dis ds) { op(0x32); }
+  alwaysinline auto xor_(reg8 rt, dis ds) { op(0x32); }
   #undef op
 
   //op reg8,[reg64+imm8]
@@ -168,13 +168,13 @@
     emit.byte(ds.imm);
   alwaysinline auto adc(reg8 rt, dis8 ds) { op(0x12); }
   alwaysinline auto add(reg8 rt, dis8 ds) { op(0x02); }
-  alwaysinline auto and(reg8 rt, dis8 ds) { op(0x22); }
+  alwaysinline auto and_(reg8 rt, dis8 ds) { op(0x22); }
   alwaysinline auto cmp(reg8 rt, dis8 ds) { op(0x3a); }
   alwaysinline auto mov(reg8 rt, dis8 ds) { op(0x8a); }
-  alwaysinline auto or (reg8 rt, dis8 ds) { op(0x0a); }
+  alwaysinline auto or_(reg8 rt, dis8 ds) { op(0x0a); }
   alwaysinline auto sbb(reg8 rt, dis8 ds) { op(0x1a); }
   alwaysinline auto sub(reg8 rt, dis8 ds) { op(0x2a); }
-  alwaysinline auto xor(reg8 rt, dis8 ds) { op(0x32); }
+  alwaysinline auto xor_(reg8 rt, dis8 ds) { op(0x32); }
   #undef op
 
   //op reg32,[reg64]
@@ -185,13 +185,13 @@
     if(ds.reg == rsp || ds.reg == r12) emit.sib(0, 4, 4);
   alwaysinline auto adc(reg32 rt, dis ds) { op(0x13); }
   alwaysinline auto add(reg32 rt, dis ds) { op(0x03); }
-  alwaysinline auto and(reg32 rt, dis ds) { op(0x23); }
+  alwaysinline auto and_(reg32 rt, dis ds) { op(0x23); }
   alwaysinline auto cmp(reg32 rt, dis ds) { op(0x3b); }
   alwaysinline auto mov(reg32 rt, dis ds) { op(0x8b); }
-  alwaysinline auto or (reg32 rt, dis ds) { op(0x0b); }
+  alwaysinline auto or_(reg32 rt, dis ds) { op(0x0b); }
   alwaysinline auto sbb(reg32 rt, dis ds) { op(0x1b); }
   alwaysinline auto sub(reg32 rt, dis ds) { op(0x2b); }
-  alwaysinline auto xor(reg32 rt, dis ds) { op(0x33); }
+  alwaysinline auto xor_(reg32 rt, dis ds) { op(0x33); }
   #undef op
 
   //op reg32,[reg64+imm8]
@@ -203,13 +203,13 @@
     emit.byte(ds.imm);
   alwaysinline auto adc(reg32 rt, dis8 ds) { op(0x13); }
   alwaysinline auto add(reg32 rt, dis8 ds) { op(0x03); }
-  alwaysinline auto and(reg32 rt, dis8 ds) { op(0x23); }
+  alwaysinline auto and_(reg32 rt, dis8 ds) { op(0x23); }
   alwaysinline auto cmp(reg32 rt, dis8 ds) { op(0x3b); }
   alwaysinline auto mov(reg32 rt, dis8 ds) { op(0x8b); }
-  alwaysinline auto or (reg32 rt, dis8 ds) { op(0x0b); }
+  alwaysinline auto or_(reg32 rt, dis8 ds) { op(0x0b); }
   alwaysinline auto sbb(reg32 rt, dis8 ds) { op(0x1b); }
   alwaysinline auto sub(reg32 rt, dis8 ds) { op(0x2b); }
-  alwaysinline auto xor(reg32 rt, dis8 ds) { op(0x33); }
+  alwaysinline auto xor_(reg32 rt, dis8 ds) { op(0x33); }
   #undef op
 
   //op reg64,[reg64]
@@ -220,13 +220,13 @@
     if(ds.reg == rsp || ds.reg == r12) emit.sib(0, 4, 4);
   alwaysinline auto adc(reg64 rt, dis ds) { op(0x13); }
   alwaysinline auto add(reg64 rt, dis ds) { op(0x03); }
-  alwaysinline auto and(reg64 rt, dis ds) { op(0x23); }
+  alwaysinline auto and_(reg64 rt, dis ds) { op(0x23); }
   alwaysinline auto cmp(reg64 rt, dis ds) { op(0x3b); }
   alwaysinline auto mov(reg64 rt, dis ds) { op(0x8b); }
-  alwaysinline auto or (reg64 rt, dis ds) { op(0x0b); }
+  alwaysinline auto or_(reg64 rt, dis ds) { op(0x0b); }
   alwaysinline auto sbb(reg64 rt, dis ds) { op(0x1b); }
   alwaysinline auto sub(reg64 rt, dis ds) { op(0x2b); }
-  alwaysinline auto xor(reg64 rt, dis ds) { op(0x33); }
+  alwaysinline auto xor_(reg64 rt, dis ds) { op(0x33); }
   #undef op
 
   //op reg64,[reg64+imm8]
@@ -238,13 +238,13 @@
     emit.byte(ds.imm);
   alwaysinline auto adc(reg64 rt, dis8 ds) { op(0x13); }
   alwaysinline auto add(reg64 rt, dis8 ds) { op(0x03); }
-  alwaysinline auto and(reg64 rt, dis8 ds) { op(0x23); }
+  alwaysinline auto and_(reg64 rt, dis8 ds) { op(0x23); }
   alwaysinline auto cmp(reg64 rt, dis8 ds) { op(0x3b); }
   alwaysinline auto mov(reg64 rt, dis8 ds) { op(0x8b); }
-  alwaysinline auto or (reg64 rt, dis8 ds) { op(0x0b); }
+  alwaysinline auto or_(reg64 rt, dis8 ds) { op(0x0b); }
   alwaysinline auto sbb(reg64 rt, dis8 ds) { op(0x1b); }
   alwaysinline auto sub(reg64 rt, dis8 ds) { op(0x2b); }
-  alwaysinline auto xor(reg64 rt, dis8 ds) { op(0x33); }
+  alwaysinline auto xor_(reg64 rt, dis8 ds) { op(0x33); }
   #undef op
 
   //op reg64,[reg64+imm32]
@@ -256,13 +256,13 @@
     emit.dword(ds.imm);
   alwaysinline auto adc(reg64 rt, dis32 ds) { op(0x13); }
   alwaysinline auto add(reg64 rt, dis32 ds) { op(0x03); }
-  alwaysinline auto and(reg64 rt, dis32 ds) { op(0x23); }
+  alwaysinline auto and_(reg64 rt, dis32 ds) { op(0x23); }
   alwaysinline auto cmp(reg64 rt, dis32 ds) { op(0x3b); }
   alwaysinline auto mov(reg64 rt, dis32 ds) { op(0x8b); }
-  alwaysinline auto or (reg64 rt, dis32 ds) { op(0x0b); }
+  alwaysinline auto or_(reg64 rt, dis32 ds) { op(0x0b); }
   alwaysinline auto sbb(reg64 rt, dis32 ds) { op(0x1b); }
   alwaysinline auto sub(reg64 rt, dis32 ds) { op(0x2b); }
-  alwaysinline auto xor(reg64 rt, dis32 ds) { op(0x33); }
+  alwaysinline auto xor_(reg64 rt, dis32 ds) { op(0x33); }
   #undef op
 
   //op [reg64+imm8],reg8
@@ -274,13 +274,13 @@
     emit.byte(dt.imm);
   alwaysinline auto adc(dis8 dt, reg8 rs) { op(0x10); }
   alwaysinline auto add(dis8 dt, reg8 rs) { op(0x00); }
-  alwaysinline auto and(dis8 dt, reg8 rs) { op(0x20); }
+  alwaysinline auto and_(dis8 dt, reg8 rs) { op(0x20); }
   alwaysinline auto cmp(dis8 dt, reg8 rs) { op(0x38); }
   alwaysinline auto mov(dis8 dt, reg8 rs) { op(0x88); }
-  alwaysinline auto or (dis8 dt, reg8 rs) { op(0x08); }
+  alwaysinline auto or_(dis8 dt, reg8 rs) { op(0x08); }
   alwaysinline auto sbb(dis8 dt, reg8 rs) { op(0x18); }
   alwaysinline auto sub(dis8 dt, reg8 rs) { op(0x28); }
-  alwaysinline auto xor(dis8 dt, reg8 rs) { op(0x30); }
+  alwaysinline auto xor_(dis8 dt, reg8 rs) { op(0x30); }
   #undef op
 
   //op reg64,imm32
@@ -290,12 +290,12 @@
     emit.modrm(3, group, rt & 7); \
     emit.dword(is.data);
   alwaysinline auto add(reg64 rt, imm32 is) { op(0); }
-  alwaysinline auto or (reg64 rt, imm32 is) { op(1); }
+  alwaysinline auto or_(reg64 rt, imm32 is) { op(1); }
   alwaysinline auto adc(reg64 rt, imm32 is) { op(2); }
   alwaysinline auto sbb(reg64 rt, imm32 is) { op(3); }
-  alwaysinline auto and(reg64 rt, imm32 is) { op(4); }
+  alwaysinline auto and_(reg64 rt, imm32 is) { op(4); }
   alwaysinline auto sub(reg64 rt, imm32 is) { op(5); }
-  alwaysinline auto xor(reg64 rt, imm32 is) { op(6); }
+  alwaysinline auto xor_(reg64 rt, imm32 is) { op(6); }
   alwaysinline auto cmp(reg64 rt, imm32 is) { op(7); }
   #undef op
 
@@ -325,13 +325,13 @@
     if(dt.reg == rsp || dt.reg == r12) emit.sib(0, 4, 4);
   alwaysinline auto adc(dis dt, reg32 rs) { op(0x11); }
   alwaysinline auto add(dis dt, reg32 rs) { op(0x01); }
-  alwaysinline auto and(dis dt, reg32 rs) { op(0x21); }
+  alwaysinline auto and_(dis dt, reg32 rs) { op(0x21); }
   alwaysinline auto cmp(dis dt, reg32 rs) { op(0x39); }
   alwaysinline auto mov(dis dt, reg32 rs) { op(0x89); }
-  alwaysinline auto or (dis dt, reg32 rs) { op(0x09); }
+  alwaysinline auto or_(dis dt, reg32 rs) { op(0x09); }
   alwaysinline auto sbb(dis dt, reg32 rs) { op(0x19); }
   alwaysinline auto sub(dis dt, reg32 rs) { op(0x29); }
-  alwaysinline auto xor(dis dt, reg32 rs) { op(0x31); }
+  alwaysinline auto xor_(dis dt, reg32 rs) { op(0x31); }
   #undef op
 
   //op [reg64+imm8],reg32
@@ -343,13 +343,13 @@
     emit.byte(dt.imm);
   alwaysinline auto adc(dis8 dt, reg32 rs) { op(0x11); }
   alwaysinline auto add(dis8 dt, reg32 rs) { op(0x01); }
-  alwaysinline auto and(dis8 dt, reg32 rs) { op(0x21); }
+  alwaysinline auto and_(dis8 dt, reg32 rs) { op(0x21); }
   alwaysinline auto cmp(dis8 dt, reg32 rs) { op(0x39); }
   alwaysinline auto mov(dis8 dt, reg32 rs) { op(0x89); }
-  alwaysinline auto or (dis8 dt, reg32 rs) { op(0x09); }
+  alwaysinline auto or_(dis8 dt, reg32 rs) { op(0x09); }
   alwaysinline auto sbb(dis8 dt, reg32 rs) { op(0x19); }
   alwaysinline auto sub(dis8 dt, reg32 rs) { op(0x29); }
-  alwaysinline auto xor(dis8 dt, reg32 rs) { op(0x31); }
+  alwaysinline auto xor_(dis8 dt, reg32 rs) { op(0x31); }
   #undef op
 
   //op [reg64],reg64
@@ -360,13 +360,13 @@
     if(dt.reg == rsp || dt.reg == r12) emit.sib(0, 4, 4);
   alwaysinline auto adc(dis dt, reg64 rs) { op(0x11); }
   alwaysinline auto add(dis dt, reg64 rs) { op(0x01); }
-  alwaysinline auto and(dis dt, reg64 rs) { op(0x21); }
+  alwaysinline auto and_(dis dt, reg64 rs) { op(0x21); }
   alwaysinline auto cmp(dis dt, reg64 rs) { op(0x39); }
   alwaysinline auto mov(dis dt, reg64 rs) { op(0x89); }
-  alwaysinline auto or (dis dt, reg64 rs) { op(0x09); }
+  alwaysinline auto or_(dis dt, reg64 rs) { op(0x09); }
   alwaysinline auto sbb(dis dt, reg64 rs) { op(0x19); }
   alwaysinline auto sub(dis dt, reg64 rs) { op(0x29); }
-  alwaysinline auto xor(dis dt, reg64 rs) { op(0x31); }
+  alwaysinline auto xor_(dis dt, reg64 rs) { op(0x31); }
   #undef op
 
   //op [reg64+imm8],reg64
@@ -378,13 +378,13 @@
     emit.byte(dt.imm);
   alwaysinline auto adc(dis8 dt, reg64 rs) { op(0x11); }
   alwaysinline auto add(dis8 dt, reg64 rs) { op(0x01); }
-  alwaysinline auto and(dis8 dt, reg64 rs) { op(0x21); }
+  alwaysinline auto and_(dis8 dt, reg64 rs) { op(0x21); }
   alwaysinline auto cmp(dis8 dt, reg64 rs) { op(0x39); }
   alwaysinline auto mov(dis8 dt, reg64 rs) { op(0x89); }
-  alwaysinline auto or (dis8 dt, reg64 rs) { op(0x09); }
+  alwaysinline auto or_(dis8 dt, reg64 rs) { op(0x09); }
   alwaysinline auto sbb(dis8 dt, reg64 rs) { op(0x19); }
   alwaysinline auto sub(dis8 dt, reg64 rs) { op(0x29); }
-  alwaysinline auto xor(dis8 dt, reg64 rs) { op(0x31); }
+  alwaysinline auto xor_(dis8 dt, reg64 rs) { op(0x31); }
   #undef op
 
   //op [reg64+imm32],reg64
@@ -396,13 +396,13 @@
     emit.dword(dt.imm);
   alwaysinline auto adc(dis32 dt, reg64 rs) { op(0x11); }
   alwaysinline auto add(dis32 dt, reg64 rs) { op(0x01); }
-  alwaysinline auto and(dis32 dt, reg64 rs) { op(0x21); }
+  alwaysinline auto and_(dis32 dt, reg64 rs) { op(0x21); }
   alwaysinline auto cmp(dis32 dt, reg64 rs) { op(0x39); }
   alwaysinline auto mov(dis32 dt, reg64 rs) { op(0x89); }
-  alwaysinline auto or (dis32 dt, reg64 rs) { op(0x09); }
+  alwaysinline auto or_(dis32 dt, reg64 rs) { op(0x09); }
   alwaysinline auto sbb(dis32 dt, reg64 rs) { op(0x19); }
   alwaysinline auto sub(dis32 dt, reg64 rs) { op(0x29); }
-  alwaysinline auto xor(dis32 dt, reg64 rs) { op(0x31); }
+  alwaysinline auto xor_(dis32 dt, reg64 rs) { op(0x31); }
   #undef op
 
   //op reg32,reg8
@@ -581,14 +581,14 @@
     emit.modrm(3, rs & 7, rt & 7);
   alwaysinline auto adc (reg8 rt, reg8 rs) { op(0x10); }
   alwaysinline auto add (reg8 rt, reg8 rs) { op(0x00); }
-  alwaysinline auto and (reg8 rt, reg8 rs) { op(0x20); }
+  alwaysinline auto and_(reg8 rt, reg8 rs) { op(0x20); }
   alwaysinline auto cmp (reg8 rt, reg8 rs) { op(0x38); }
   alwaysinline auto mov (reg8 rt, reg8 rs) { op(0x88); }
-  alwaysinline auto or  (reg8 rt, reg8 rs) { op(0x08); }
+  alwaysinline auto or_ (reg8 rt, reg8 rs) { op(0x08); }
   alwaysinline auto sbb (reg8 rt, reg8 rs) { op(0x18); }
   alwaysinline auto sub (reg8 rt, reg8 rs) { op(0x28); }
   alwaysinline auto test(reg8 rt, reg8 rs) { op(0x84); }
-  alwaysinline auto xor (reg8 rt, reg8 rs) { op(0x30); }
+  alwaysinline auto xor_(reg8 rt, reg8 rs) { op(0x30); }
   #undef op
 
   #define op(code) \
@@ -598,14 +598,14 @@
     emit.modrm(3, rs & 7, rt & 7);
   alwaysinline auto adc (reg16 rt, reg16 rs) { op(0x11); }
   alwaysinline auto add (reg16 rt, reg16 rs) { op(0x01); }
-  alwaysinline auto and (reg16 rt, reg16 rs) { op(0x21); }
+  alwaysinline auto and_(reg16 rt, reg16 rs) { op(0x21); }
   alwaysinline auto cmp (reg16 rt, reg16 rs) { op(0x39); }
   alwaysinline auto mov (reg16 rt, reg16 rs) { op(0x89); }
-  alwaysinline auto or  (reg16 rt, reg16 rs) { op(0x09); }
+  alwaysinline auto or_ (reg16 rt, reg16 rs) { op(0x09); }
   alwaysinline auto sbb (reg16 rt, reg16 rs) { op(0x19); }
   alwaysinline auto sub (reg16 rt, reg16 rs) { op(0x29); }
   alwaysinline auto test(reg16 rt, reg16 rs) { op(0x85); }
-  alwaysinline auto xor (reg16 rt, reg16 rs) { op(0x31); }
+  alwaysinline auto xor_(reg16 rt, reg16 rs) { op(0x31); }
   #undef op
 
   #define op(code) \
@@ -614,14 +614,14 @@
     emit.modrm(3, rs & 7, rt & 7);
   alwaysinline auto adc (reg32 rt, reg32 rs) { op(0x11); }
   alwaysinline auto add (reg32 rt, reg32 rs) { op(0x01); }
-  alwaysinline auto and (reg32 rt, reg32 rs) { op(0x21); }
+  alwaysinline auto and_(reg32 rt, reg32 rs) { op(0x21); }
   alwaysinline auto cmp (reg32 rt, reg32 rs) { op(0x39); }
   alwaysinline auto mov (reg32 rt, reg32 rs) { op(0x89); }
-  alwaysinline auto or  (reg32 rt, reg32 rs) { op(0x09); }
+  alwaysinline auto or_ (reg32 rt, reg32 rs) { op(0x09); }
   alwaysinline auto sbb (reg32 rt, reg32 rs) { op(0x19); }
   alwaysinline auto sub (reg32 rt, reg32 rs) { op(0x29); }
   alwaysinline auto test(reg32 rt, reg32 rs) { op(0x85); }
-  alwaysinline auto xor (reg32 rt, reg32 rs) { op(0x31); }
+  alwaysinline auto xor_(reg32 rt, reg32 rs) { op(0x31); }
   #undef op
 
   #define op(code) \
@@ -630,14 +630,14 @@
     emit.modrm(3, rs & 7, rt & 7);
   alwaysinline auto adc (reg64 rt, reg64 rs) { op(0x11); }
   alwaysinline auto add (reg64 rt, reg64 rs) { op(0x01); }
-  alwaysinline auto and (reg64 rt, reg64 rs) { op(0x21); }
+  alwaysinline auto and_(reg64 rt, reg64 rs) { op(0x21); }
   alwaysinline auto cmp (reg64 rt, reg64 rs) { op(0x39); }
   alwaysinline auto mov (reg64 rt, reg64 rs) { op(0x89); }
-  alwaysinline auto or  (reg64 rt, reg64 rs) { op(0x09); }
+  alwaysinline auto or_ (reg64 rt, reg64 rs) { op(0x09); }
   alwaysinline auto sbb (reg64 rt, reg64 rs) { op(0x19); }
   alwaysinline auto sub (reg64 rt, reg64 rs) { op(0x29); }
   alwaysinline auto test(reg64 rt, reg64 rs) { op(0x85); }
-  alwaysinline auto xor (reg64 rt, reg64 rs) { op(0x31); }
+  alwaysinline auto xor_(reg64 rt, reg64 rs) { op(0x31); }
   #undef op
 
   #define op(code) \
@@ -647,12 +647,12 @@
     emit.byte(is.data);
   alwaysinline auto adc(reg32 rt, imm8 is) { op(2); }
   alwaysinline auto add(reg32 rt, imm8 is) { op(0); }
-  alwaysinline auto and(reg32 rt, imm8 is) { op(4); }
+  alwaysinline auto and_(reg32 rt, imm8 is) { op(4); }
   alwaysinline auto cmp(reg32 rt, imm8 is) { op(7); }
-  alwaysinline auto or (reg32 rt, imm8 is) { op(1); }
+  alwaysinline auto or_(reg32 rt, imm8 is) { op(1); }
   alwaysinline auto sbb(reg32 rt, imm8 is) { op(3); }
   alwaysinline auto sub(reg32 rt, imm8 is) { op(5); }
-  alwaysinline auto xor(reg32 rt, imm8 is) { op(6); }
+  alwaysinline auto xor_(reg32 rt, imm8 is) { op(6); }
   #undef op
 
   #define op(code) \
@@ -662,12 +662,12 @@
     emit.byte(is.data);
   alwaysinline auto adc(reg64 rt, imm8 is) { op(2); }
   alwaysinline auto add(reg64 rt, imm8 is) { op(0); }
-  alwaysinline auto and(reg64 rt, imm8 is) { op(4); }
+  alwaysinline auto and_(reg64 rt, imm8 is) { op(4); }
   alwaysinline auto cmp(reg64 rt, imm8 is) { op(7); }
-  alwaysinline auto or (reg64 rt, imm8 is) { op(1); }
+  alwaysinline auto or_(reg64 rt, imm8 is) { op(1); }
   alwaysinline auto sbb(reg64 rt, imm8 is) { op(3); }
   alwaysinline auto sub(reg64 rt, imm8 is) { op(5); }
-  alwaysinline auto xor(reg64 rt, imm8 is) { op(6); }
+  alwaysinline auto xor_(reg64 rt, imm8 is) { op(6); }
   #undef op
 
   #define op(code, group) \
@@ -682,12 +682,12 @@
     }
   alwaysinline auto adc(reg8 rt, imm8 is) { op(0x14, 2); }
   alwaysinline auto add(reg8 rt, imm8 is) { op(0x04, 0); }
-  alwaysinline auto and(reg8 rt, imm8 is) { op(0x24, 4); }
+  alwaysinline auto and_(reg8 rt, imm8 is) { op(0x24, 4); }
   alwaysinline auto cmp(reg8 rt, imm8 is) { op(0x3c, 7); }
-  alwaysinline auto or (reg8 rt, imm8 is) { op(0x0c, 1); }
+  alwaysinline auto or_(reg8 rt, imm8 is) { op(0x0c, 1); }
   alwaysinline auto sbb(reg8 rt, imm8 is) { op(0x1c, 3); }
   alwaysinline auto sub(reg8 rt, imm8 is) { op(0x2c, 5); }
-  alwaysinline auto xor(reg8 rt, imm8 is) { op(0x34, 6); }
+  alwaysinline auto xor_(reg8 rt, imm8 is) { op(0x34, 6); }
   #undef op
 
   #define op(code, group) \
@@ -702,19 +702,19 @@
     }
   alwaysinline auto adc(reg32 rt, imm32 is) { op(0x15, 2); }
   alwaysinline auto add(reg32 rt, imm32 is) { op(0x05, 0); }
-  alwaysinline auto and(reg32 rt, imm32 is) { op(0x25, 4); }
+  alwaysinline auto and_(reg32 rt, imm32 is) { op(0x25, 4); }
   alwaysinline auto cmp(reg32 rt, imm32 is) { op(0x3d, 7); }
-  alwaysinline auto or (reg32 rt, imm32 is) { op(0x0d, 1); }
+  alwaysinline auto or_(reg32 rt, imm32 is) { op(0x0d, 1); }
   alwaysinline auto sbb(reg32 rt, imm32 is) { op(0x1d, 3); }
   alwaysinline auto sub(reg32 rt, imm32 is) { op(0x2d, 5); }
-  alwaysinline auto xor(reg32 rt, imm32 is) { op(0x35, 6); }
+  alwaysinline auto xor_(reg32 rt, imm32 is) { op(0x35, 6); }
   #undef op
 
   #define op(code) \
     emit.rex(0, 0, 0, rt & 8); \
     emit.byte(0xf7); \
     emit.modrm(3, code, rt & 7);
-  alwaysinline auto not (reg32 rt) { op(2); }
+  alwaysinline auto not_(reg32 rt) { op(2); }
   alwaysinline auto neg (reg32 rt) { op(3); }
   alwaysinline auto mul (reg32 rt) { op(4); }
   alwaysinline auto imul(reg32 rt) { op(5); }
@@ -726,7 +726,7 @@
     emit.rex(1, 0, 0, rt & 8); \
     emit.byte(0xf7); \
     emit.modrm(3, code, rt & 7);
-  alwaysinline auto not (reg64 rt) { op(2); }
+  alwaysinline auto not_(reg64 rt) { op(2); }
   alwaysinline auto neg (reg64 rt) { op(3); }
   alwaysinline auto mul (reg64 rt) { op(4); }
   alwaysinline auto imul(reg64 rt) { op(5); }
diff --git a/waterbox/ares64/ares/nall/recompiler/generic/encoder-instructions.hpp b/waterbox/ares64/ares/nall/recompiler/generic/encoder-instructions.hpp
index a37a6b5632..aab3b71e00 100644
--- a/waterbox/ares64/ares/nall/recompiler/generic/encoder-instructions.hpp
+++ b/waterbox/ares64/ares/nall/recompiler/generic/encoder-instructions.hpp
@@ -30,8 +30,6 @@
   OP1(mov64_s16, MOV_S16)
   OP1(mov64_u32, MOV_U32)
   OP1(mov64_s32, MOV_S32)
-  OP1(not32, NOT32)
-  OP1(not64, NOT)
 #undef OP1
 
   //2 operand instructions
@@ -71,10 +69,20 @@
   OP2(xor64, XOR)
   OP2(shl32, SHL32)
   OP2(shl64, SHL)
+  OP2(mshl32, MSHL32)
+  OP2(mshl64, MSHL)
   OP2(lshr32, LSHR32)
   OP2(lshr64, LSHR)
+  OP2(mlshr32, MLSHR32)
+  OP2(mlshr64, MLSHR)
   OP2(ashr32, ASHR32)
   OP2(ashr64, ASHR)
+  OP2(mashr32, MASHR32)
+  OP2(mashr64, MASHR)
+  OP2(rotl32, ROTL32)
+  OP2(rotl64, ROTL)
+  OP2(rotr32, ROTR32)
+  OP2(rotr64, ROTR)
 #undef OP2
 
   //compare instructions
diff --git a/waterbox/ares64/ares/nall/recompiler/generic/generic.hpp b/waterbox/ares64/ares/nall/recompiler/generic/generic.hpp
index 28e7b2125c..8872c460d2 100644
--- a/waterbox/ares64/ares/nall/recompiler/generic/generic.hpp
+++ b/waterbox/ares64/ares/nall/recompiler/generic/generic.hpp
@@ -3,6 +3,8 @@
 #if defined(SLJIT)
 namespace nall::recompiler {
   struct generic {
+    static constexpr bool supported = Architecture::amd64 | Architecture::arm64 | Architecture::ppc64;
+
     bump_allocator& allocator;
     sljit_compiler* compiler = nullptr;
     sljit_label* epilogue = nullptr;
diff --git a/waterbox/ares64/ares/nall/run.cpp b/waterbox/ares64/ares/nall/run.cpp
new file mode 100644
index 0000000000..a4f566f5b6
--- /dev/null
+++ b/waterbox/ares64/ares/nall/run.cpp
@@ -0,0 +1,102 @@
+#include <nall/run.hpp>
+#include <nall/path.hpp>
+
+#if defined(PLATFORM_WINDOWS)
+  #include <shellapi.h>
+#endif
+
+namespace nall {
+
+#if defined(PLATFORM_WINDOWS)
+
+NALL_HEADER_INLINE auto execute(const string& name, vector<string> argl) -> execute_result_t {  //Windows only: runs argl as one command line in a child process; returns its exit code plus captured stdout and stderr
+  for(auto& arg : argl) if(arg.find(" ")) arg = {"\"", arg, "\""};  //wrap every argument for which find reports a space in double quotes
+  string arguments = argl.merge(" ");  //name is not read in this body; the command line comes from argl alone
+
+  SECURITY_ATTRIBUTES sa;
+  ZeroMemory(&sa, sizeof(SECURITY_ATTRIBUTES));
+  sa.nLength = sizeof(SECURITY_ATTRIBUTES);
+  sa.bInheritHandle = true;  //handles created with sa can be inherited by the child
+  sa.lpSecurityDescriptor = nullptr;
+
+  HANDLE stdoutRead;
+  HANDLE stdoutWrite;
+  if(!CreatePipe(&stdoutRead, &stdoutWrite, &sa, 0)) return {};
+  if(!SetHandleInformation(stdoutRead, HANDLE_FLAG_INHERIT, 0)) return {};  //clear the inherit flag on the end the parent keeps
+
+  HANDLE stderrRead;
+  HANDLE stderrWrite;
+  if(!CreatePipe(&stderrRead, &stderrWrite, &sa, 0)) return {};  //NOTE(review): handles created above are not closed on this or the later early returns
+  if(!SetHandleInformation(stderrRead, HANDLE_FLAG_INHERIT, 0)) return {};
+
+  HANDLE stdinRead;
+  HANDLE stdinWrite;
+  if(!CreatePipe(&stdinRead, &stdinWrite, &sa, 0)) return {};
+  if(!SetHandleInformation(stdinWrite, HANDLE_FLAG_INHERIT, 0)) return {};  //for stdin the parent keeps the write end; nothing is ever written to it here
+
+  STARTUPINFO si;
+  ZeroMemory(&si, sizeof(STARTUPINFO));
+  si.cb = sizeof(STARTUPINFO);
+  si.hStdOutput = stdoutWrite;
+  si.hStdError = stderrWrite;
+  si.hStdInput = stdinRead;
+  si.dwFlags = STARTF_USESTDHANDLES;  //makes CreateProcess use the three hStd* handles assigned above
+
+  PROCESS_INFORMATION pi;
+  ZeroMemory(&pi, sizeof(PROCESS_INFORMATION));
+
+  if(!CreateProcess(
+    nullptr, utf16_t(arguments),  //no application name: the executable is taken from the command line
+    nullptr, nullptr, true, CREATE_NO_WINDOW,  //true = inherit handles
+    nullptr, nullptr, &si, &pi
+  )) return {};
+
+  DWORD exitCode = EXIT_FAILURE;
+  if(WaitForSingleObject(pi.hProcess, INFINITE)) return {};  //any non-zero wait result aborts; NOTE(review): the pipes are only drained after the child has exited - confirm a child producing a lot of output cannot stall here
+  if(!GetExitCodeProcess(pi.hProcess, &exitCode)) return {};  //NOTE(review): pi.hThread and pi.hProcess are not closed on these two early returns
+  CloseHandle(pi.hThread);
+  CloseHandle(pi.hProcess);
+
+  char buffer[256];  //scratch space shared by both drain loops below
+  execute_result_t result;
+  result.code = exitCode;
+
+  while(true) {  //drain stdout into result.output, at most sizeof(buffer) bytes per iteration
+    DWORD read, available, remaining;
+    if(!PeekNamedPipe(stdoutRead, nullptr, sizeof(buffer), &read, &available, &remaining)) break;  //NOTE(review): lpBuffer is nullptr, so confirm that read (rather than available) is the right value to test on the next line
+    if(read == 0) break;
+
+    if(!ReadFile(stdoutRead, buffer, sizeof(buffer), &read, nullptr)) break;
+    if(read == 0) break;
+
+    auto offset = result.output.size();
+    result.output.resize(offset + read);  //grow by the number of bytes just read, then append them
+    memory::copy(result.output.get() + offset, buffer, read);
+  }
+
+  while(true) {  //same procedure for stderr, accumulating into result.error
+    DWORD read, available, remaining;
+    if(!PeekNamedPipe(stderrRead, nullptr, sizeof(buffer), &read, &available, &remaining)) break;
+    if(read == 0) break;
+
+    if(!ReadFile(stderrRead, buffer, sizeof(buffer), &read, nullptr)) break;
+    if(read == 0) break;
+
+    auto offset = result.error.size();
+    result.error.resize(offset + read);
+    memory::copy(result.error.get() + offset, buffer, read);
+  }
+
+  return result;  //NOTE(review): none of the six pipe handles is passed to CloseHandle anywhere in this function
+}
+
+NALL_HEADER_INLINE auto invoke(const string& name, vector<string> argl) -> void {  //Windows only: hands name and the joined argl to ShellExecute; the ShellExecute result is discarded
+  for(auto& arg : argl) if(arg.find(" ")) arg = {"\"", arg, "\""};  //wrap every argument for which find reports a space in double quotes
+  string arguments = argl.merge(" ");  //single space-separated parameter string
+  string directory = Path::program().replace("/", "\\");  //working directory: Path::program() with forward slashes turned into backslashes
+  ShellExecute(nullptr, nullptr, utf16_t(name), utf16_t(arguments), utf16_t(directory), SW_SHOWNORMAL);  //no parent window, no explicit verb
+}
+
+#endif
+
+}
diff --git a/waterbox/ares64/ares/nall/run.hpp b/waterbox/ares64/ares/nall/run.hpp
index 17a2733f3b..0dae8713ac 100644
--- a/waterbox/ares64/ares/nall/run.hpp
+++ b/waterbox/ares64/ares/nall/run.hpp
@@ -110,94 +110,18 @@ template<typename... P> inline auto invoke(const string& name, P&&... p) -> void
 
 #elif defined(PLATFORM_WINDOWS)
 
+auto execute(const string& name, vector<string> argl) -> execute_result_t;
+
 template<typename... P> inline auto execute(const string& name, P&&... p) -> execute_result_t {
   vector<string> argl(name, std::forward<P>(p)...);
-  for(auto& arg : argl) if(arg.find(" ")) arg = {"\"", arg, "\""};
-  string arguments = argl.merge(" ");
-
-  SECURITY_ATTRIBUTES sa;
-  ZeroMemory(&sa, sizeof(SECURITY_ATTRIBUTES));
-  sa.nLength = sizeof(SECURITY_ATTRIBUTES);
-  sa.bInheritHandle = true;
-  sa.lpSecurityDescriptor = nullptr;
-
-  HANDLE stdoutRead;
-  HANDLE stdoutWrite;
-  if(!CreatePipe(&stdoutRead, &stdoutWrite, &sa, 0)) return {};
-  if(!SetHandleInformation(stdoutRead, HANDLE_FLAG_INHERIT, 0)) return {};
-
-  HANDLE stderrRead;
-  HANDLE stderrWrite;
-  if(!CreatePipe(&stderrRead, &stderrWrite, &sa, 0)) return {};
-  if(!SetHandleInformation(stderrRead, HANDLE_FLAG_INHERIT, 0)) return {};
-
-  HANDLE stdinRead;
-  HANDLE stdinWrite;
-  if(!CreatePipe(&stdinRead, &stdinWrite, &sa, 0)) return {};
-  if(!SetHandleInformation(stdinWrite, HANDLE_FLAG_INHERIT, 0)) return {};
-
-  STARTUPINFO si;
-  ZeroMemory(&si, sizeof(STARTUPINFO));
-  si.cb = sizeof(STARTUPINFO);
-  si.hStdOutput = stdoutWrite;
-  si.hStdError = stderrWrite;
-  si.hStdInput = stdinRead;
-  si.dwFlags = STARTF_USESTDHANDLES;
-
-  PROCESS_INFORMATION pi;
-  ZeroMemory(&pi, sizeof(PROCESS_INFORMATION));
-
-  if(!CreateProcess(
-    nullptr, utf16_t(arguments),
-    nullptr, nullptr, true, CREATE_NO_WINDOW,
-    nullptr, nullptr, &si, &pi
-  )) return {};
-
-  DWORD exitCode = EXIT_FAILURE;
-  if(WaitForSingleObject(pi.hProcess, INFINITE)) return {};
-  if(!GetExitCodeProcess(pi.hProcess, &exitCode)) return {};
-  CloseHandle(pi.hThread);
-  CloseHandle(pi.hProcess);
-
-  char buffer[256];
-  execute_result_t result;
-  result.code = exitCode;
-
-  while(true) {
-    DWORD read, available, remaining;
-    if(!PeekNamedPipe(stdoutRead, nullptr, sizeof(buffer), &read, &available, &remaining)) break;
-    if(read == 0) break;
-
-    if(!ReadFile(stdoutRead, buffer, sizeof(buffer), &read, nullptr)) break;
-    if(read == 0) break;
-
-    auto offset = result.output.size();
-    result.output.resize(offset + read);
-    memory::copy(result.output.get() + offset, buffer, read);
-  }
-
-  while(true) {
-    DWORD read, available, remaining;
-    if(!PeekNamedPipe(stderrRead, nullptr, sizeof(buffer), &read, &available, &remaining)) break;
-    if(read == 0) break;
-
-    if(!ReadFile(stderrRead, buffer, sizeof(buffer), &read, nullptr)) break;
-    if(read == 0) break;
-
-    auto offset = result.error.size();
-    result.error.resize(offset + read);
-    memory::copy(result.error.get() + offset, buffer, read);
-  }
-
-  return result;
+  return execute(name, std::move(argl));
 }
 
+auto invoke(const string& name, vector<string> argl) -> void;
+
 template<typename... P> inline auto invoke(const string& name, P&&... p) -> void {
   vector<string> argl(std::forward<P>(p)...);
-  for(auto& arg : argl) if(arg.find(" ")) arg = {"\"", arg, "\""};
-  string arguments = argl.merge(" ");
-  string directory = Path::program().replace("/", "\\");
-  ShellExecute(nullptr, nullptr, utf16_t(name), utf16_t(arguments), utf16_t(directory), SW_SHOWNORMAL);
+  invoke(name, std::move(argl));
 }
 
 #else
@@ -212,3 +136,7 @@ template<typename... P> inline auto invoke(const string& name, P&&... p) -> void
 #endif
 
 }
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/run.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/serializer.hpp b/waterbox/ares64/ares/nall/serializer.hpp
index 229edf23d6..741753db1c 100644
--- a/waterbox/ares64/ares/nall/serializer.hpp
+++ b/waterbox/ares64/ares/nall/serializer.hpp
@@ -76,10 +76,11 @@ struct serializer {
   }
 
   template<typename T> auto operator()(T& value) -> serializer& {
-    static_assert(has_serialize_v<T> || is_integral_v<T> || is_floating_point_v<T>);
+    constexpr bool integral = is_integral_v<T> || is_same_v<T, u128>;
+    static_assert(has_serialize_v<T> || integral || is_floating_point_v<T>);
     if constexpr(has_serialize_v<T>) {
       value.serialize(*this);
-    } else if constexpr(is_integral_v<T>) {
+    } else if constexpr(integral) {
       integer(value);
     } else if constexpr(is_floating_point_v<T>) {
       real(value);
diff --git a/waterbox/ares64/ares/nall/shared-pointer.hpp b/waterbox/ares64/ares/nall/shared-pointer.hpp
index 0666e065d6..190b6fd019 100644
--- a/waterbox/ares64/ares/nall/shared-pointer.hpp
+++ b/waterbox/ares64/ares/nall/shared-pointer.hpp
@@ -27,7 +27,7 @@ struct shared_pointer_this_base{};
 template<typename T>
 struct shared_pointer {
   template<typename... P> static auto create(P&&... p) {
-    return shared_pointer<T>{new T{std::forward<P>(p)...}};
+    return shared_pointer<T>{new T(std::forward<P>(p)...)};
   }
 
   using type = T;
diff --git a/waterbox/ares64/ares/nall/smtp.cpp b/waterbox/ares64/ares/nall/smtp.cpp
new file mode 100644
index 0000000000..36739c3c88
--- /dev/null
+++ b/waterbox/ares64/ares/nall/smtp.cpp
@@ -0,0 +1,149 @@
+#include <nall/smtp.hpp>
+
+#if defined(PLATFORM_WINDOWS)
+  #include <ws2tcpip.h>
+#endif
+
+namespace nall {
+
+NALL_HEADER_INLINE auto SMTP::send() -> bool {  //appends a MIME message to info.message, then delivers it over one SMTP session; false on any resolver or socket failure or unexpected reply
+  info.message.append("From: =?UTF-8?B?", Encode::Base64(contact(info.from)), "?=\r\n");  //header values are Base64-encoded and wrapped in =?UTF-8?B?...?=
+  info.message.append("To: =?UTF-8?B?", Encode::Base64(contacts(info.to)), "?=\r\n");
+  info.message.append("Cc: =?UTF-8?B?", Encode::Base64(contacts(info.cc)), "?=\r\n");  //info.bcc is deliberately absent from the headers; it only appears as RCPT TO below
+  info.message.append("Subject: =?UTF-8?B?", Encode::Base64(info.subject), "?=\r\n");
+
+  string uniqueID = boundary();  //separator string placed between the MIME parts
+
+  info.message.append("MIME-Version: 1.0\r\n");
+  info.message.append("Content-Type: multipart/mixed; boundary=", uniqueID, "\r\n");
+  info.message.append("\r\n");
+
+  string format = (info.format == Format::Plain ? "text/plain" : "text/html");  //any format other than Plain is sent as text/html
+
+  info.message.append("--", uniqueID, "\r\n");  //first part: the message body
+  info.message.append("Content-Type: ", format, "; charset=UTF-8\r\n");
+  info.message.append("Content-Transfer-Encoding: base64\r\n");
+  info.message.append("\r\n");
+  info.message.append(split(Encode::Base64(info.body)), "\r\n");
+  info.message.append("\r\n");
+
+  for(auto& attachment : info.attachments) {  //one further part per attachment, each Base64-encoded
+    info.message.append("--", uniqueID, "\r\n");
+    info.message.append("Content-Type: application/octet-stream\r\n");
+    info.message.append("Content-Transfer-Encoding: base64\r\n");
+    info.message.append("Content-Disposition: attachment; size=", attachment.buffer.size(), "; filename*=UTF-8''", filename(attachment.name), "\r\n");
+    info.message.append("\r\n");
+    info.message.append(split(Encode::Base64(attachment.buffer)), "\r\n");
+    info.message.append("\r\n");
+  }
+
+  info.message.append("--", uniqueID, "--\r\n");  //closing boundary
+
+  addrinfo hints;
+  memset(&hints, 0, sizeof(addrinfo));
+  hints.ai_family = AF_UNSPEC;
+  hints.ai_socktype = SOCK_STREAM;
+  hints.ai_flags = AI_PASSIVE;  //NOTE(review): AI_PASSIVE is documented for sockets that will bind(); this one calls connect() - confirm it is intended
+
+  addrinfo* serverinfo;
+  s32 status = getaddrinfo(info.server, string(info.port), &hints, &serverinfo);  //NOTE(review): serverinfo is never passed to freeaddrinfo in this function
+  if(status != 0) return false;
+
+  s32 sock = socket(serverinfo->ai_family, serverinfo->ai_socktype, serverinfo->ai_protocol);  //only the first resolver result is tried
+  if(sock == -1) return false;
+
+  s32 result = connect(sock, serverinfo->ai_addr, serverinfo->ai_addrlen);
+  if(result == -1) return false;  //NOTE(review): unlike the failure paths below, sock is not closed here
+
+  string response;
+  info.response.append(response = recv(sock));  //every reply line is also accumulated in info.response
+  if(!response.beginsWith("220 ")) { close(sock); return false; }  //the first line received must start with 220
+
+  send(sock, {"HELO ", info.server, "\r\n"});  //the bool result of send(sock, ...) is ignored for every command in this function
+  info.response.append(response = recv(sock));
+  if(!response.beginsWith("250 ")) { close(sock); return false; }
+
+  send(sock, {"MAIL FROM: <", info.from.mail, ">\r\n"});
+  info.response.append(response = recv(sock));
+  if(!response.beginsWith("250 ")) { close(sock); return false; }
+
+  for(auto& contact : info.to) {  //one RCPT TO per recipient; the first rejected recipient aborts the whole send
+    send(sock, {"RCPT TO: <", contact.mail, ">\r\n"});
+    info.response.append(response = recv(sock));
+    if(!response.beginsWith("250 ")) { close(sock); return false; }
+  }
+
+  for(auto& contact : info.cc) {
+    send(sock, {"RCPT TO: <", contact.mail, ">\r\n"});
+    info.response.append(response = recv(sock));
+    if(!response.beginsWith("250 ")) { close(sock); return false; }
+  }
+
+  for(auto& contact : info.bcc) {
+    send(sock, {"RCPT TO: <", contact.mail, ">\r\n"});
+    info.response.append(response = recv(sock));
+    if(!response.beginsWith("250 ")) { close(sock); return false; }
+  }
+
+  send(sock, {"DATA\r\n"});
+  info.response.append(response = recv(sock));
+  if(!response.beginsWith("354 ")) { close(sock); return false; }
+
+  send(sock, {info.message, "\r\n", ".\r\n"});  //payload followed by a line holding a single dot
+  info.response.append(response = recv(sock));
+  if(!response.beginsWith("250 ")) { close(sock); return false; }
+
+  send(sock, {"QUIT\r\n"});
+  info.response.append(response = recv(sock));  //the QUIT reply is recorded but, with the check below disabled, not validated
+//if(!response.beginsWith("221 ")) { close(sock); return false; }
+
+  close(sock);
+  return true;
+}
+
+NALL_HEADER_INLINE auto SMTP::send(s32 sock, const string& text) -> bool {  //writes all of text to sock; false as soon as ::send reports -1
+  const char* data = text.data();  //cursor over the bytes still to be written
+  u32 size = text.size();  //bytes still to be written
+  while(size) {
+    s32 length = ::send(sock, (const char*)data, size, 0);  //may accept fewer bytes than requested
+    if(length == -1) return false;
+    data += length;  //skip past what was accepted and retry with the remainder
+    size -= length;
+  }
+  return true;
+}
+
+NALL_HEADER_INLINE auto SMTP::recv(s32 sock) -> string {  //reads a single line from sock, one byte at a time
+  vector<u8> buffer;
+  while(true) {
+    char c;
+    if(::recv(sock, &c, sizeof(char), 0) < 1) break;  //stop when ::recv returns less than 1 (nothing read or an error)
+    buffer.append(c);
+    if(c == '\n') break;  //the newline itself is kept in the result
+  }
+  buffer.append(0);  //terminating zero byte before the conversion to string
+  return buffer;
+}
+
+#if defined(API_WINDOWS)
+
+NALL_HEADER_INLINE auto SMTP::close(s32 sock) -> s32 {  //only compiled when API_WINDOWS is defined
+  return closesocket(sock);  //forwards to closesocket and returns its result unchanged
+}
+
+NALL_HEADER_INLINE SMTP::SMTP() {  //only compiled when API_WINDOWS is defined: starts Winsock if a probe socket shows it is not yet initialised
+  s32 sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);  //probe socket; it is never used for traffic
+  if(sock == INVALID_SOCKET && WSAGetLastError() == WSANOTINITIALISED) {
+    WSADATA wsaData;
+    if(WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {  //request Winsock version 2.2
+      WSACleanup();
+      return;
+    }
+  } else {  //NOTE(review): also taken when socket() failed for some other reason, in which case close() receives INVALID_SOCKET
+    close(sock);
+  }
+}
+
+#endif
+
+}
diff --git a/waterbox/ares64/ares/nall/smtp.hpp b/waterbox/ares64/ares/nall/smtp.hpp
index 82f2d7d2c4..d19b4b1c65 100644
--- a/waterbox/ares64/ares/nall/smtp.hpp
+++ b/waterbox/ares64/ares/nall/smtp.hpp
@@ -1,17 +1,17 @@
 #pragma once
 
-#include <nall/base64.hpp>
 #include <nall/stdint.hpp>
 #include <nall/string.hpp>
+#include <nall/file.hpp>
+#include <nall/location.hpp>
+#include <nall/random.hpp>
+#include <nall/encode/base64.hpp>
 
 #if !defined(PLATFORM_WINDOWS)
   #include <sys/types.h>
   #include <sys/socket.h>
   #include <netinet/in.h>
   #include <netdb.h>
-#else
-  #include <winsock2.h>
-  #include <ws2tcpip.h>
 #endif
 
 namespace nall {
@@ -102,7 +102,7 @@ inline auto SMTP::attachment(const u8* data, u32 size, string name) -> void {
 
 inline auto SMTP::attachment(string filename, string name) -> bool {
   if(!file::exists(filename)) return false;
-  if(name == "") name = notdir(filename);
+  if(name == "") name = Location::file(filename);
   auto buffer = file::read(filename);
   info.attachments.append({std::move(buffer), name});
   return true;
@@ -117,101 +117,6 @@ inline auto SMTP::body(string body, Format format) -> void {
   info.format = format;
 }
 
-inline auto SMTP::send() -> bool {
-  info.message.append("From: =?UTF-8?B?", Base64::encode(contact(info.from)), "?=\r\n");
-  info.message.append("To: =?UTF-8?B?", Base64::encode(contacts(info.to)), "?=\r\n");
-  info.message.append("Cc: =?UTF-8?B?", Base64::encode(contacts(info.cc)), "?=\r\n");
-  info.message.append("Subject: =?UTF-8?B?", Base64::encode(info.subject), "?=\r\n");
-
-  string uniqueID = boundary();
-
-  info.message.append("MIME-Version: 1.0\r\n");
-  info.message.append("Content-Type: multipart/mixed; boundary=", uniqueID, "\r\n");
-  info.message.append("\r\n");
-
-  string format = (info.format == Format::Plain ? "text/plain" : "text/html");
-
-  info.message.append("--", uniqueID, "\r\n");
-  info.message.append("Content-Type: ", format, "; charset=UTF-8\r\n");
-  info.message.append("Content-Transfer-Encoding: base64\r\n");
-  info.message.append("\r\n");
-  info.message.append(split(Base64::encode(info.body)), "\r\n");
-  info.message.append("\r\n");
-
-  for(auto& attachment : info.attachments) {
-    info.message.append("--", uniqueID, "\r\n");
-    info.message.append("Content-Type: application/octet-stream\r\n");
-    info.message.append("Content-Transfer-Encoding: base64\r\n");
-    info.message.append("Content-Disposition: attachment; size=", attachment.buffer.size(), "; filename*=UTF-8''", filename(attachment.name), "\r\n");
-    info.message.append("\r\n");
-    info.message.append(split(Base64::encode(attachment.buffer)), "\r\n");
-    info.message.append("\r\n");
-  }
-
-  info.message.append("--", uniqueID, "--\r\n");
-
-  addrinfo hints;
-  memset(&hints, 0, sizeof(addrinfo));
-  hints.ai_family = AF_UNSPEC;
-  hints.ai_socktype = SOCK_STREAM;
-  hints.ai_flags = AI_PASSIVE;
-
-  addrinfo* serverinfo;
-  s32 status = getaddrinfo(info.server, string(info.port), &hints, &serverinfo);
-  if(status != 0) return false;
-
-  s32 sock = socket(serverinfo->ai_family, serverinfo->ai_socktype, serverinfo->ai_protocol);
-  if(sock == -1) return false;
-
-  s32 result = connect(sock, serverinfo->ai_addr, serverinfo->ai_addrlen);
-  if(result == -1) return false;
-
-  string response;
-  info.response.append(response = recv(sock));
-  if(!response.beginswith("220 ")) { close(sock); return false; }
-
-  send(sock, {"HELO ", info.server, "\r\n"});
-  info.response.append(response = recv(sock));
-  if(!response.beginswith("250 ")) { close(sock); return false; }
-
-  send(sock, {"MAIL FROM: <", info.from.mail, ">\r\n"});
-  info.response.append(response = recv(sock));
-  if(!response.beginswith("250 ")) { close(sock); return false; }
-
-  for(auto& contact : info.to) {
-    send(sock, {"RCPT TO: <", contact.mail, ">\r\n"});
-    info.response.append(response = recv(sock));
-    if(!response.beginswith("250 ")) { close(sock); return false; }
-  }
-
-  for(auto& contact : info.cc) {
-    send(sock, {"RCPT TO: <", contact.mail, ">\r\n"});
-    info.response.append(response = recv(sock));
-    if(!response.beginswith("250 ")) { close(sock); return false; }
-  }
-
-  for(auto& contact : info.bcc) {
-    send(sock, {"RCPT TO: <", contact.mail, ">\r\n"});
-    info.response.append(response = recv(sock));
-    if(!response.beginswith("250 ")) { close(sock); return false; }
-  }
-
-  send(sock, {"DATA\r\n"});
-  info.response.append(response = recv(sock));
-  if(!response.beginswith("354 ")) { close(sock); return false; }
-
-  send(sock, {info.message, "\r\n", ".\r\n"});
-  info.response.append(response = recv(sock));
-  if(!response.beginswith("250 ")) { close(sock); return false; }
-
-  send(sock, {"QUIT\r\n"});
-  info.response.append(response = recv(sock));
-//if(!response.beginswith("221 ")) { close(sock); return false; }
-
-  close(sock);
-  return true;
-}
-
 inline auto SMTP::message() -> string {
   return info.message;
 }
@@ -220,42 +125,18 @@ inline auto SMTP::response() -> string {
   return info.response;
 }
 
-inline auto SMTP::send(s32 sock, const string& text) -> bool {
-  const char* data = text.data();
-  u32 size = text.size();
-  while(size) {
-    s32 length = ::send(sock, (const char*)data, size, 0);
-    if(length == -1) return false;
-    data += length;
-    size -= length;
-  }
-  return true;
-}
-
-inline auto SMTP::recv(s32 sock) -> string {
-  vector<u8> buffer;
-  while(true) {
-    char c;
-    if(::recv(sock, &c, sizeof(char), 0) < 1) break;
-    buffer.append(c);
-    if(c == '\n') break;
-  }
-  buffer.append(0);
-  return buffer;
-}
-
 inline auto SMTP::boundary() -> string {
-  random_lfsr random;
+  PRNG::LFSR random;
   random.seed(time(0));
   string boundary;
-  for(u32 n = 0; n < 16; n++) boundary.append(hex<2>(random()));
+  for(u32 n = 0; n < 16; n++) boundary.append(hex(random.random(), 2L));
   return boundary;
 }
 
 inline auto SMTP::filename(const string& filename) -> string {
   string result;
   for(auto& n : filename) {
-    if(n <= 32 || n >= 127) result.append("%", hex<2>(n));
+    if(n <= 32 || n >= 127) result.append("%", hex(n, 2L));
     else result.append(n);
   }
   return result;
@@ -292,23 +173,8 @@ inline auto SMTP::split(const string& text) -> string {
   return result;
 }
 
-#if defined(API_WINDOWS)
-inline auto SMTP::close(s32 sock) -> s32 {
-  return closesocket(sock);
 }
 
-inline SMTP::SMTP() {
-  s32 sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
-  if(sock == INVALID_SOCKET && WSAGetLastError() == WSANOTINITIALISED) {
-    WSADATA wsaData;
-    if(WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {
-      WSACleanup();
-      return;
-    }
-  } else {
-    close(sock);
-  }
-}
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/smtp.cpp>
 #endif
-
-}
diff --git a/waterbox/ares64/ares/nall/stdint.hpp b/waterbox/ares64/ares/nall/stdint.hpp
index 9cee334c93..a6243976b1 100644
--- a/waterbox/ares64/ares/nall/stdint.hpp
+++ b/waterbox/ares64/ares/nall/stdint.hpp
@@ -8,8 +8,10 @@
   typedef int64_t intmax_t;
   #if defined(_WIN64)
   typedef int64_t intptr_t;
+  typedef int64_t ssize_t;
   #else
   typedef int32_t intptr_t;
+  typedef int32_t ssize_t;
   #endif
 
   typedef unsigned char uint8_t;
@@ -80,4 +82,7 @@ using f64 = float64_t;
 #if defined(__SIZEOF_INT128__)
   using s128 =  int128_t;
   using u128 = uint128_t;
+#else
+  //arithmetic.hpp
+  namespace nall { struct u128; }
 #endif
diff --git a/waterbox/ares64/ares/nall/string/markup/node.hpp b/waterbox/ares64/ares/nall/string/markup/node.hpp
index 33ed459baf..2f1ad7bfed 100644
--- a/waterbox/ares64/ares/nall/string/markup/node.hpp
+++ b/waterbox/ares64/ares/nall/string/markup/node.hpp
@@ -109,9 +109,11 @@ struct Node {
     return std::swap(shared->_children[x], shared->_children[y]), true;
   }
 
-  auto sort(function<bool (Node, Node)> comparator = [](auto x, auto y) {
-    return nall::string::compare(x.shared->_name, y.shared->_name) < 0;
-  }) -> void {
+  auto operator<(const Node& node) -> bool {
+    return nall::string::compare(shared->_name, node.shared->_name) < 0;
+  }
+
+  auto sort(function<bool (Node, Node)> comparator = [](auto x, auto y) { return x < y; }) -> void {
     nall::sort(shared->_children.data(), shared->_children.size(), [&](auto x, auto y) {
       return comparator(x, y);  //this call converts SharedNode objects to Node objects
     });
diff --git a/waterbox/ares64/ares/nall/string/markup/xml.hpp b/waterbox/ares64/ares/nall/string/markup/xml.hpp
index 6de2d32755..21ac8e33a5 100644
--- a/waterbox/ares64/ares/nall/string/markup/xml.hpp
+++ b/waterbox/ares64/ares/nall/string/markup/xml.hpp
@@ -43,7 +43,8 @@ protected:
 
   //copy part of string from source document into target string; decode markup while copying
   auto copy(string& target, const char* source, u32 length) -> void {
-    target.reserve(length + 1);
+    string buffer;
+    buffer.resize(length);
 
     #if defined(NALL_XML_LITERAL)
     memory::copy(target.pointer(), source, length);
@@ -51,7 +52,7 @@ protected:
     return;
     #endif
 
-    char* output = target.get();
+    char* output = buffer.get();
     while(length) {
       if(*source == '&') {
         if(!memory::compare(source, "&lt;",   4)) { *output++ = '<';  source += 4; length -= 4; continue; }
@@ -63,7 +64,7 @@ protected:
 
       if(_metadata == 0 && source[0] == '<' && source[1] == '!') {
         //comment
-        if(!memory::compare(source, "<!--", 4)) {
+    if(!memory::compare(source, "<!--", 4)) {
           source += 4, length -= 4;
           while(memory::compare(source, "-->", 3)) source++, length--;
           source += 3, length -= 3;
@@ -81,7 +82,8 @@ protected:
 
       *output++ = *source++, length--;
     }
-    *output = 0;
+    buffer.resize(output - buffer.get());
+    target = std::move(buffer);
   }
 
   auto parseExpression(const char*& p) -> bool {
@@ -199,19 +201,19 @@ protected:
     copy(_value, dataStart, dataEnd - dataStart);
   }
 
-  friend auto unserialize(const string&) -> Markup::SharedNode;
+  friend auto unserialize(const string&) -> Markup::Node;
 };
 
-inline auto unserialize(const string& markup) -> Markup::SharedNode {
-  auto node = new ManagedNode;
+inline auto unserialize(const string& markup) -> Markup::Node {
+  SharedNode node(new ManagedNode);
   try {
+
     const char* p = markup;
     node->parse(p);
   } catch(const char* error) {
-    delete node;
-    node = nullptr;
+    node.reset();
   }
-  return node;
+  return (Markup::SharedNode&)node;
 }
 
 }
diff --git a/waterbox/ares64/ares/nall/string/transform/dml.hpp b/waterbox/ares64/ares/nall/string/transform/dml.hpp
index f5bc64278e..6111f6ca58 100644
--- a/waterbox/ares64/ares/nall/string/transform/dml.hpp
+++ b/waterbox/ares64/ares/nall/string/transform/dml.hpp
@@ -257,11 +257,11 @@ inline auto DML::anchor(const string& text) -> string {
 inline auto DML::markup(const string& s) -> string {
   string t;
 
-  boolean strong;
-  boolean emphasis;
-  boolean insertion;
-  boolean deletion;
-  boolean code;
+  Boolean strong;
+  Boolean emphasis;
+  Boolean insertion;
+  Boolean deletion;
+  Boolean code;
 
   maybe<u32> link;
   maybe<u32> image;
@@ -300,7 +300,7 @@ inline auto DML::markup(const string& s) -> string {
       string uri = address(list.last());
       string name = list.size() == 2 ? list.first() : uri.split("//", 1L).last();
       list = side(1).split("; ");
-      boolean link, title, caption;
+      Boolean link, title, caption;
       string Class, width, height;
       for(auto p : list) {
         if(p == "link") { link = true; continue; }
diff --git a/waterbox/ares64/ares/nall/string/utility.hpp b/waterbox/ares64/ares/nall/string/utility.hpp
index 0417bce6c8..4b46f58176 100644
--- a/waterbox/ares64/ares/nall/string/utility.hpp
+++ b/waterbox/ares64/ares/nall/string/utility.hpp
@@ -161,7 +161,7 @@ template<typename T> inline auto fromReal(char* result, T value) -> u32 {
   //Windows C-runtime does not support long double via sprintf()
   sprintf(buffer, "%f", (double)value);
   #else
-  sprintf(buffer, "%Lf", (long double)value);
+  snprintf(buffer, sizeof(buffer), "%Lf", (long double)value);
   #endif
 
   //remove excess 0's in fraction (2.500000 -> 2.5)
diff --git a/waterbox/ares64/ares/nall/terminal.cpp b/waterbox/ares64/ares/nall/terminal.cpp
new file mode 100644
index 0000000000..063d96da75
--- /dev/null
+++ b/waterbox/ares64/ares/nall/terminal.cpp
@@ -0,0 +1,21 @@
+#include <nall/terminal.hpp>
+
+namespace nall::terminal {
+
+NALL_HEADER_INLINE auto redirectStdioToTerminal(bool create) -> void {
+#if defined(PLATFORM_WINDOWS)
+  //create: drop the current console and allocate a fresh one; otherwise attach to the parent process's console
+  if(create) FreeConsole();
+  const bool attached = create ? AllocConsole() : AttachConsole(ATTACH_PARENT_PROCESS);
+  if(!attached) return;
+  //unless a new terminal was requested, do not reopen already valid handles (allow redirection to/from file)
+  auto reopen = [create](FILE* stream, const char* device, const char* mode) -> void {
+    if(create || _get_osfhandle(_fileno(stream)) < 0) freopen(device, mode, stream);
+  };
+  reopen(stdin,  "CONIN$",  "r");
+  reopen(stdout, "CONOUT$", "w");
+  reopen(stderr, "CONOUT$", "w");
+#endif
+}
+
+}
diff --git a/waterbox/ares64/ares/nall/terminal.hpp b/waterbox/ares64/ares/nall/terminal.hpp
index 6006b3feb8..9c4253f4db 100644
--- a/waterbox/ares64/ares/nall/terminal.hpp
+++ b/waterbox/ares64/ares/nall/terminal.hpp
@@ -4,6 +4,11 @@
 
 namespace nall::terminal {
 
+//control sequence introducer
+constexpr char csi[] = "\x1b[";
+
+auto redirectStdioToTerminal(bool create) -> void;
+
 inline auto escapable() -> bool {
   #if defined(PLATFORM_WINDOWS)
   //todo: colors are supported by Windows 10+ and with alternate terminals (eg msys)
@@ -17,49 +22,53 @@ namespace color {
 
 template<typename... P> inline auto black(P&&... p) -> string {
   if(!escapable()) return string{std::forward<P>(p)...};
-  return {"\e[30m", string{std::forward<P>(p)...}, "\e[0m"};
+  return {csi, "30m", string{std::forward<P>(p)...}, csi, "0m"};
 }
 
 template<typename... P> inline auto blue(P&&... p) -> string {
   if(!escapable()) return string{std::forward<P>(p)...};
-  return {"\e[94m", string{std::forward<P>(p)...}, "\e[0m"};
+  return {csi, "94m", string{std::forward<P>(p)...}, csi, "0m"};
 }
 
 template<typename... P> inline auto green(P&&... p) -> string {
   if(!escapable()) return string{std::forward<P>(p)...};
-  return {"\e[92m", string{std::forward<P>(p)...}, "\e[0m"};
+  return {csi, "92m", string{std::forward<P>(p)...}, csi, "0m"};
 }
 
 template<typename... P> inline auto cyan(P&&... p) -> string {
   if(!escapable()) return string{std::forward<P>(p)...};
-  return {"\e[96m", string{std::forward<P>(p)...}, "\e[0m"};
+  return {csi, "96m", string{std::forward<P>(p)...}, csi, "0m"};
 }
 
 template<typename... P> inline auto red(P&&... p) -> string {
   if(!escapable()) return string{std::forward<P>(p)...};
-  return {"\e[91m", string{std::forward<P>(p)...}, "\e[0m"};
+  return {csi, "91m", string{std::forward<P>(p)...}, csi, "0m"};
 }
 
 template<typename... P> inline auto magenta(P&&... p) -> string {
   if(!escapable()) return string{std::forward<P>(p)...};
-  return {"\e[95m", string{std::forward<P>(p)...}, "\e[0m"};
+  return {csi, "95m", string{std::forward<P>(p)...}, csi, "0m"};
 }
 
 template<typename... P> inline auto yellow(P&&... p) -> string {
   if(!escapable()) return string{std::forward<P>(p)...};
-  return {"\e[93m", string{std::forward<P>(p)...}, "\e[0m"};
+  return {csi, "93m", string{std::forward<P>(p)...}, csi, "0m"};
 }
 
 template<typename... P> inline auto white(P&&... p) -> string {
   if(!escapable()) return string{std::forward<P>(p)...};
-  return {"\e[97m", string{std::forward<P>(p)...}, "\e[0m"};
+  return {csi, "97m", string{std::forward<P>(p)...}, csi, "0m"};
 }
 
 template<typename... P> inline auto gray(P&&... p) -> string {
   if(!escapable()) return string{std::forward<P>(p)...};
-  return {"\e[37m", string{std::forward<P>(p)...}, "\e[0m"};
+  return {csi, "37m", string{std::forward<P>(p)...}, csi, "0m"};
 }
 
 }
 
 }
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/terminal.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/thread.cpp b/waterbox/ares64/ares/nall/thread.cpp
new file mode 100644
index 0000000000..73e3cc311b
--- /dev/null
+++ b/waterbox/ares64/ares/nall/thread.cpp
@@ -0,0 +1,53 @@
+#include <nall/thread.hpp>
+
+namespace nall {
+
+#if defined(API_WINDOWS)
+
+NALL_HEADER_INLINE auto WINAPI _threadCallback(void* parameter) -> DWORD {
+  auto job = static_cast<thread::context*>(parameter);  //allocated by thread::create; owned by this thread from here on
+  job->callback(job->parameter);
+  delete job;
+  return 0;
+}
+
+NALL_HEADER_INLINE auto thread::close() -> void {
+  //release the OS handle, if one is held, and forget it
+  if(!handle) return;
+  CloseHandle(handle);
+  handle = 0;
+}
+
+NALL_HEADER_INLINE auto thread::join() -> void {
+  //nothing to wait on when no handle is held
+  if(!handle) return;
+  //block until the thread has finished executing, then release and forget its handle
+  WaitForSingleObject(handle, INFINITE);
+  CloseHandle(handle);
+  handle = 0;
+}
+
+NALL_HEADER_INLINE auto thread::create(const function<void (uintptr)>& callback, uintptr parameter, u32 stacksize) -> thread {
+  thread instance;
+  //the context travels to the new thread as CreateThread's parameter; _threadCallback deletes it after the callback returns
+  auto context = new thread::context;
+  context->callback = callback;
+  context->parameter = parameter;
+  instance.handle = CreateThread(nullptr, stacksize, _threadCallback, (void*)context, 0, nullptr);
+  if(!instance.handle) delete context;  //thread never started, so _threadCallback will not run: free the context here
+  return instance;
+}
+
+NALL_HEADER_INLINE auto thread::detach() -> void {
+  //intentionally empty: Windows threads do not use this concept
+  //~thread() frees resources via CloseHandle()
+  //the thread continues to run even after its handle is closed
+}
+
+NALL_HEADER_INLINE auto thread::exit() -> void {  //end the calling thread with exit code 0
+  ExitThread(0);
+}
+
+#endif
+
+}
diff --git a/waterbox/ares64/ares/nall/thread.hpp b/waterbox/ares64/ares/nall/thread.hpp
index f7204e825f..a76896f9ac 100644
--- a/waterbox/ares64/ares/nall/thread.hpp
+++ b/waterbox/ares64/ares/nall/thread.hpp
@@ -128,50 +128,10 @@ private:
   HANDLE handle = 0;
 };
 
-inline auto WINAPI _threadCallback(void* parameter) -> DWORD {
-  auto context = (thread::context*)parameter;
-  context->callback(context->parameter);
-  delete context;
-  return 0;
-}
-
-inline auto thread::close() -> void {
-  if(handle) {
-    CloseHandle(handle);
-    handle = 0;
-  }
-}
-
-inline auto thread::join() -> void {
-  if(handle) {
-    //wait until the thread has finished executing ...
-    WaitForSingleObject(handle, INFINITE);
-    CloseHandle(handle);
-    handle = 0;
-  }
-}
-
-inline auto thread::create(const function<void (uintptr)>& callback, uintptr parameter, u32 stacksize) -> thread {
-  thread instance;
-
-  auto context = new thread::context;
-  context->callback = callback;
-  context->parameter = parameter;
-
-  instance.handle = CreateThread(nullptr, stacksize, _threadCallback, (void*)context, 0, nullptr);
-  return instance;
-}
-
-inline auto thread::detach() -> void {
-  //Windows threads do not use this concept:
-  //~thread() frees resources via CloseHandle()
-  //thread continues to run even after handle is closed
-}
-
-inline auto thread::exit() -> void {
-  ExitThread(0);
-}
-
 }
 
 #endif
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/thread.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/traits.hpp b/waterbox/ares64/ares/nall/traits.hpp
index 0e810e66b9..5862b36bb7 100644
--- a/waterbox/ares64/ares/nall/traits.hpp
+++ b/waterbox/ares64/ares/nall/traits.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <initializer_list>
 #include <type_traits>
 #include <nall/stdint.hpp>
 
diff --git a/waterbox/ares64/ares/nall/vfs/cdrom.hpp b/waterbox/ares64/ares/nall/vfs/cdrom.hpp
index 084fa1ee95..f410a0b66b 100644
--- a/waterbox/ares64/ares/nall/vfs/cdrom.hpp
+++ b/waterbox/ares64/ares/nall/vfs/cdrom.hpp
@@ -134,9 +134,9 @@ private:
               memory::assign(target + 0, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff);  //sync
               memory::assign(target + 6, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00);  //sync
               auto [minute, second, frame] = CD::MSF(lbaFileBase + index.lba + sector);
-              target[12] = CD::BCD::encode(minute);
-              target[13] = CD::BCD::encode(second);
-              target[14] = CD::BCD::encode(frame);
+              target[12] = BCD::encode(minute);
+              target[13] = BCD::encode(second);
+              target[14] = BCD::encode(frame);
               target[15] = 0x01;  //mode
               filedata.read({target + 16, length});
               CD::RSPC::encodeMode1({target, 2352});
@@ -215,7 +215,20 @@ private:
         for(s32 sector : range(index.sectorCount())) {
           auto target = _image.data() + 2448ull * (LeadInSectors + index.lba + sector);
           auto sectorData = chd.read(lba);
-          memory::copy(target, 2352, sectorData.data(), sectorData.size());
+          if(sectorData.size() == 2048) {
+            //ISO: generate header + parity data
+            memory::assign(target + 0, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff);  //sync
+            memory::assign(target + 6, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00);  //sync
+            auto [minute, second, frame] = CD::MSF(index.lba + sector);
+            target[12] = BCD::encode(minute);
+            target[13] = BCD::encode(second);
+            target[14] = BCD::encode(frame);
+            target[15] = 0x01;  //mode
+            memory::copy(target + 16, 2048, sectorData.data(), sectorData.size());
+            CD::RSPC::encodeMode1({target, 2352});
+          } else {
+            memory::copy(target, 2352, sectorData.data(), sectorData.size());
+          }
           lba++;
         }
       }
diff --git a/waterbox/ares64/ares/nall/windows/detour.cpp b/waterbox/ares64/ares/nall/windows/detour.cpp
new file mode 100644
index 0000000000..b5a2612711
--- /dev/null
+++ b/waterbox/ares64/ares/nall/windows/detour.cpp
@@ -0,0 +1,69 @@
+#include <nall/windows/detour.hpp>
+
+namespace nall {
+
+NALL_HEADER_INLINE auto detour::insert(const string& moduleName, const string& functionName, void*& source, void* target) -> bool {  //patch functionName in moduleName to jump to target; on success source is set to the mirror buffer
+  #if defined(ARCHITECTURE_X86)  //only implemented for x86; every other architecture returns false
+  HMODULE module = GetModuleHandleW(utf16_t(moduleName));
+  if(!module) return false;
+
+  u8* sourceData = (u8*)GetProcAddress(module, functionName);
+  if(!sourceData) return false;
+
+  u32 sourceLength = detour::length(sourceData);  //the 5-byte jmp written below needs at least 5 bytes of recognised instructions
+  if(sourceLength < 5) {
+    //unable to clone enough bytes to insert hook
+    #if 1
+    string output = {"detour::insert(", moduleName, "::", functionName, ") failed: "};
+    for(u32 n = 0; n < 16; n++) output.append(hex(sourceData[n], 2L), " ");  //dump the first 16 bytes of the function as hex
+    output.trimRight(" ", 1L);
+    MessageBoxA(0, output, "nall::detour", MB_OK);
+    #endif
+    return false;
+  }
+
+  auto mirrorData = new u8[512]();  //zero-filled buffer filled in by detour::mirror; released by detour::remove
+  detour::mirror(mirrorData, sourceData);
+
+  DWORD privileges;
+  VirtualProtect((void*)mirrorData, 512, PAGE_EXECUTE_READWRITE, &privileges);  //the mirror must be executable
+  VirtualProtect((void*)sourceData, 256, PAGE_EXECUTE_READWRITE, &privileges);  //privileges now holds the function's previous protection
+  u64 address = (u64)target - ((u64)sourceData + 5);  //displacement relative to the end of the 5-byte jmp
+  sourceData[0] = 0xe9;  //jmp target
+  sourceData[1] = address >>  0;
+  sourceData[2] = address >>  8;
+  sourceData[3] = address >> 16;
+  sourceData[4] = address >> 24;
+  VirtualProtect((void*)sourceData, 256, privileges, &privileges);  //restore the function's previous protection
+
+  source = (void*)mirrorData;
+  return true;
+  #else
+  return false;
+  #endif
+}
+
+NALL_HEADER_INLINE auto detour::remove(const string& moduleName, const string& functionName, void*& source) -> bool {  //undo detour::insert; NOTE(review): unlike insert, this is not wrapped in ARCHITECTURE_X86 - confirm intended
+  HMODULE module = GetModuleHandleW(utf16_t(moduleName));
+  if(!module) return false;
+
+  auto sourceData = (u8*)GetProcAddress(module, functionName);
+  if(!sourceData) return false;
+
+  auto mirrorData = (u8*)source;  //source is expected to be the buffer that insert handed back
+  if(mirrorData == sourceData) return false;  //hook was never installed
+
+  u32 length = detour::length(256 + mirrorData);  //NOTE(review): presumably detour::mirror keeps the original bytes at offset 256 - confirm
+  if(length < 5) return false;
+
+  DWORD privileges;
+  VirtualProtect((void*)sourceData, 256, PAGE_EXECUTE_READWRITE, &privileges);
+  for(u32 n = 0; n < length; n++) sourceData[n] = mirrorData[256 + n];  //copy the bytes at mirror offset 256 back over the patched function
+  VirtualProtect((void*)sourceData, 256, privileges, &privileges);  //restore the previous protection
+
+  source = (void*)sourceData;  //point the caller back at the real function
+  delete[] mirrorData;  //frees the buffer allocated by insert
+  return true;
+}
+
+}
diff --git a/waterbox/ares64/ares/nall/windows/detour.hpp b/waterbox/ares64/ares/nall/windows/detour.hpp
index b80b8d9dce..ebc3113dea 100644
--- a/waterbox/ares64/ares/nall/windows/detour.hpp
+++ b/waterbox/ares64/ares/nall/windows/detour.hpp
@@ -1,10 +1,9 @@
 #pragma once
 
-#include <nall/foreach.hpp>
 #include <nall/platform.hpp>
 #include <nall/stdint.hpp>
 #include <nall/string.hpp>
-#include <nall/utf8.hpp>
+#include <nall/windows/utf8.hpp>
 
 namespace nall {
 
@@ -25,107 +24,46 @@ protected:
     u32 mode;
     u16 modify;
   };
-  static opcode opcodes[];
+  static constexpr opcode opcodes[] = {
+    //TODO:
+    //* fs:, gs: should force another opcode copy
+    //* conditional branches within +5-byte range should fail
+
+      {0x50, 1},                   //push eax
+      {0x51, 1},                   //push ecx
+      {0x52, 1},                   //push edx
+      {0x53, 1},                   //push ebx
+      {0x54, 1},                   //push esp
+      {0x55, 1},                   //push ebp
+      {0x56, 1},                   //push esi
+      {0x57, 1},                   //push edi
+      {0x58, 1},                   //pop eax
+      {0x59, 1},                   //pop ecx
+      {0x5a, 1},                   //pop edx
+      {0x5b, 1},                   //pop ebx
+      {0x5c, 1},                   //pop esp
+      {0x5d, 1},                   //pop ebp
+      {0x5e, 1},                   //pop esi
+      {0x5f, 1},                   //pop edi
+      {0x64, 1},                   //fs:
+      {0x65, 1},                   //gs:
+      {0x68, 5},                   //push dword
+      {0x6a, 2},                   //push byte
+      {0x74, 2, RelNear, 0x0f84},  //je near      -> je far
+      {0x75, 2, RelNear, 0x0f85},  //jne near     -> jne far
+      {0x89, 2},                   //mov reg,reg
+      {0x8b, 2},                   //mov reg,reg
+      {0x90, 1},                   //nop
+      {0xa1, 5},                   //mov eax,[dword]
+      {0xeb, 2, RelNear,   0xe9},  //jmp near     -> jmp far
+  };
 };
 
-//TODO:
-//* fs:, gs: should force another opcode copy
-//* conditional branches within +5-byte range should fail
-detour::opcode detour::opcodes[] = {
-  {0x50, 1},                   //push eax
-  {0x51, 1},                   //push ecx
-  {0x52, 1},                   //push edx
-  {0x53, 1},                   //push ebx
-  {0x54, 1},                   //push esp
-  {0x55, 1},                   //push ebp
-  {0x56, 1},                   //push esi
-  {0x57, 1},                   //push edi
-  {0x58, 1},                   //pop eax
-  {0x59, 1},                   //pop ecx
-  {0x5a, 1},                   //pop edx
-  {0x5b, 1},                   //pop ebx
-  {0x5c, 1},                   //pop esp
-  {0x5d, 1},                   //pop ebp
-  {0x5e, 1},                   //pop esi
-  {0x5f, 1},                   //pop edi
-  {0x64, 1},                   //fs:
-  {0x65, 1},                   //gs:
-  {0x68, 5},                   //push dword
-  {0x6a, 2},                   //push byte
-  {0x74, 2, RelNear, 0x0f84},  //je near      -> je far
-  {0x75, 2, RelNear, 0x0f85},  //jne near     -> jne far
-  {0x89, 2},                   //mov reg,reg
-  {0x8b, 2},                   //mov reg,reg
-  {0x90, 1},                   //nop
-  {0xa1, 5},                   //mov eax,[dword]
-  {0xeb, 2, RelNear,   0xe9},  //jmp near     -> jmp far
-};
-
-inline auto detour::insert(const string& moduleName, const string& functionName, void*& source, void* target) -> bool {
-  HMODULE module = GetModuleHandleW(utf16_t(moduleName));
-  if(!module) return false;
-
-  u8* sourceData = (u8*)GetProcAddress(module, functionName);
-  if(!sourceData) return false;
-
-  u32 sourceLength = detour::length(sourceData);
-  if(sourceLength < 5) {
-    //unable to clone enough bytes to insert hook
-    #if 1
-    string output = {"detour::insert(", moduleName, "::", functionName, ") failed: "};
-    for(u32 n = 0; n < 16; n++) output.append(hex<2>(sourceData[n]), " ");
-    output.trimRight(" ", 1L);
-    MessageBoxA(0, output, "nall::detour", MB_OK);
-    #endif
-    return false;
-  }
-
-  auto mirrorData = new u8[512]();
-  detour::mirror(mirrorData, sourceData);
-
-  DWORD privileges;
-  VirtualProtect((void*)mirrorData, 512, PAGE_EXECUTE_READWRITE, &privileges);
-  VirtualProtect((void*)sourceData, 256, PAGE_EXECUTE_READWRITE, &privileges);
-  u64 address = (u64)target - ((u64)sourceData + 5);
-  sourceData[0] = 0xe9;  //jmp target
-  sourceData[1] = address >>  0;
-  sourceData[2] = address >>  8;
-  sourceData[3] = address >> 16;
-  sourceData[4] = address >> 24;
-  VirtualProtect((void*)sourceData, 256, privileges, &privileges);
-
-  source = (void*)mirrorData;
-  return true;
-}
-
-inline auto detour::remove(const string& moduleName, const string& functionName, void*& source) -> bool {
-  HMODULE module = GetModuleHandleW(utf16_t(moduleName));
-  if(!module) return false;
-
-  auto sourceData = (u8*)GetProcAddress(module, functionName);
-  if(!sourceData) return false;
-
-  auto mirrorData = (u8*)source;
-  if(mirrorData == sourceData) return false;  //hook was never installed
-
-  u32 length = detour::length(256 + mirrorData);
-  if(length < 5) return false;
-
-  DWORD privileges;
-  VirtualProtect((void*)sourceData, 256, PAGE_EXECUTE_READWRITE, &privileges);
-  for(u32 n = 0; n < length; n++) sourceData[n] = mirrorData[256 + n];
-  VirtualProtect((void*)sourceData, 256, privileges, &privileges);
-
-  source = (void*)sourceData;
-  delete[] mirrorData;
-  return true;
-}
-
 inline auto detour::length(const u8* function) -> u32 {
   u32 length = 0;
   while(length < 5) {
-    detour::opcode *opcode = 0;
-    foreach(op, detour::opcodes) {
+    const detour::opcode *opcode = 0;
+    for(auto& op : detour::opcodes) {
       if(function[length] == op.prefix) {
         opcode = &op;
         break;
@@ -143,8 +81,8 @@ inline auto detour::mirror(u8* target, const u8* source) -> u32 {
 
   u32 size = detour::length(source);
   while(size) {
-    detour::opcode* opcode = nullptr;
-    foreach(op, detour::opcodes) {
+    const detour::opcode* opcode = nullptr;
+    for(auto& op : detour::opcodes) {
       if(*source == op.prefix) {
         opcode = &op;
         break;
@@ -183,7 +121,11 @@ inline auto detour::mirror(u8* target, const u8* source) -> u32 {
   return source - entryPoint;
 }
 
-#undef Implied
+#undef Copy
 #undef RelNear
 
 }
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/windows/detour.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/windows/guard.hpp b/waterbox/ares64/ares/nall/windows/guard.hpp
deleted file mode 100644
index 147c567624..0000000000
--- a/waterbox/ares64/ares/nall/windows/guard.hpp
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef NALL_WINDOWS_GUARD_HPP
-#define NALL_WINDOWS_GUARD_HPP
-
-#define boolean WindowsBoolean
-#define interface WindowsInterface
-
-#undef UNICODE
-#undef WINVER
-#undef WIN32_LEAN_AND_LEAN
-#undef _WIN32_WINNT
-#undef _WIN32_IE
-#undef NOMINMAX
-#undef PATH_MAX
-
-#define UNICODE
-#define WINVER 0x0601
-#define WIN32_LEAN_AND_MEAN
-#define _WIN32_WINNT WINVER
-#define _WIN32_IE WINVER
-#define NOMINMAX
-#define PATH_MAX 260
-
-#else
-#undef NALL_WINDOWS_GUARD_HPP
-
-#undef boolean
-#undef interface
-
-#undef far
-#undef near
-
-#endif
diff --git a/waterbox/ares64/ares/nall/windows/guid.cpp b/waterbox/ares64/ares/nall/windows/guid.cpp
new file mode 100644
index 0000000000..7f0f593906
--- /dev/null
+++ b/waterbox/ares64/ares/nall/windows/guid.cpp
@@ -0,0 +1,17 @@
+#include <nall/windows/guid.hpp>
+
+#include <combaseapi.h>
+
+namespace nall {
+
+NALL_HEADER_INLINE auto guid() -> string {
+  //"{xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx}" is 38 characters, plus the null terminator
+  constexpr int capacity = 39;
+  wchar_t text[capacity];
+  GUID identifier;
+  CoCreateGuid(&identifier);
+  StringFromGUID2(identifier, text, capacity);
+  return (char*)utf8_t(text);
+}
+
+}
diff --git a/waterbox/ares64/ares/nall/windows/guid.hpp b/waterbox/ares64/ares/nall/windows/guid.hpp
index 314683d490..e28ad8e7ce 100644
--- a/waterbox/ares64/ares/nall/windows/guid.hpp
+++ b/waterbox/ares64/ares/nall/windows/guid.hpp
@@ -1,17 +1,13 @@
-#pragma once
-
-#include <nall/string.hpp>
-
-namespace nall {
-
-inline auto guid() -> string {
-  GUID guidInstance;
-  CoCreateGuid(&guidInstance);
-
-  wchar_t guidString[39];
-  StringFromGUID2(guidInstance, guidString, 39);
-
-  return (char*)utf8_t(guidString);
-}
-
-}
+#pragma once
+
+#include <nall/string.hpp>
+
+namespace nall {
+
+auto guid() -> string;
+
+}
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/windows/guid.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/windows/launcher.cpp b/waterbox/ares64/ares/nall/windows/launcher.cpp
new file mode 100644
index 0000000000..a25db095a8
--- /dev/null
+++ b/waterbox/ares64/ares/nall/windows/launcher.cpp
@@ -0,0 +1,93 @@
+#include <nall/windows/launcher.hpp>
+
+namespace nall {
+
+NALL_HEADER_INLINE auto launch(const char* applicationName, const char* libraryName, u32 entryPoint) -> bool {  //run applicationName under the debug API and make it call LoadLibraryW(libraryName) from a stub placed at entryPoint
+  #if defined(ARCHITECTURE_X86)  //the stub is x86 machine code and the code uses CONTEXT::Eip; other architectures return false
+  //if a launcher does not send at least one message, a wait cursor will appear
+  PostThreadMessage(GetCurrentThreadId(), WM_USER, 0, 0);
+  MSG msg;
+  GetMessage(&msg, 0, 0, 0);
+
+  STARTUPINFOW si;  //NOTE(review): si.cb is left at 0 by the memset below rather than sizeof(STARTUPINFOW) - confirm this is acceptable
+  PROCESS_INFORMATION pi;
+
+  memset(&si, 0, sizeof(STARTUPINFOW));
+  BOOL result = CreateProcessW(
+    utf16_t(applicationName), GetCommandLineW(), NULL, NULL, TRUE,
+    DEBUG_PROCESS | DEBUG_ONLY_THIS_PROCESS,  //do not break if application creates its own processes
+    NULL, NULL, &si, &pi
+  );
+  if(result == false) return false;
+
+  u8 entryData[1024], entryHook[1024] = {  //entryData: original bytes read from entryPoint; entryHook: stub, followed by the library name from offset 14
+    0x68, 0x00, 0x00, 0x00, 0x00,  //push libraryName
+    0xb8, 0x00, 0x00, 0x00, 0x00,  //mov eax,LoadLibraryW
+    0xff, 0xd0,                    //call eax
+    0xcd, 0x03,                    //int 3
+  };
+
+  entryHook[1] = (u8)((entryPoint + 14) >>  0);  //push operand: address of the name, which sits right after the 14-byte stub
+  entryHook[2] = (u8)((entryPoint + 14) >>  8);
+  entryHook[3] = (u8)((entryPoint + 14) >> 16);
+  entryHook[4] = (u8)((entryPoint + 14) >> 24);
+
+  auto pLoadLibraryW = (u32)GetProcAddress(GetModuleHandleW(L"kernel32"), "LoadLibraryW");  //address taken in this process; assumes it is the same in the child - TODO confirm
+  entryHook[6] = pLoadLibraryW >>  0;
+  entryHook[7] = pLoadLibraryW >>  8;
+  entryHook[8] = pLoadLibraryW >> 16;
+  entryHook[9] = pLoadLibraryW >> 24;
+
+  utf16_t buffer = utf16_t(libraryName);
+  memcpy(entryHook + 14, buffer, 2 * wcslen(buffer) + 2);  //UTF-16 name plus terminator; NOTE(review): length is not checked against the 1024-byte buffer
+
+  while(true) {
+    DEBUG_EVENT event;
+    WaitForDebugEvent(&event, INFINITE);
+
+    if(event.dwDebugEventCode == EXIT_PROCESS_DEBUG_EVENT) break;  //child exited: stop the debug loop
+
+    if(event.dwDebugEventCode == EXCEPTION_DEBUG_EVENT) {
+      if(event.u.Exception.ExceptionRecord.ExceptionCode == EXCEPTION_BREAKPOINT) {
+        if(event.u.Exception.ExceptionRecord.ExceptionAddress == (void*)(entryPoint + 14 - 1)) {  //presumably the breakpoint raised by the stub's trailing int 3
+          HANDLE hProcess = OpenProcess(0, FALSE, event.dwProcessId);  //NOTE(review): opened and closed but never used; pi.hProcess is used instead
+          HANDLE hThread = OpenThread(THREAD_ALL_ACCESS, FALSE, event.dwThreadId);
+
+          CONTEXT context;
+          context.ContextFlags = CONTEXT_FULL;
+          GetThreadContext(hThread, &context);
+
+          WriteProcessMemory(pi.hProcess, (void*)entryPoint, (void*)&entryData, sizeof entryData, NULL);  //put the saved original bytes back
+          context.Eip = entryPoint;  //resume at the restored entry point
+          SetThreadContext(hThread, &context);
+
+          CloseHandle(hThread);
+          CloseHandle(hProcess);
+        }
+
+        ContinueDebugEvent(event.dwProcessId, event.dwThreadId, DBG_CONTINUE);
+        continue;
+      }
+
+      ContinueDebugEvent(event.dwProcessId, event.dwThreadId, DBG_EXCEPTION_NOT_HANDLED);  //any other exception is passed on as not handled
+      continue;
+    }
+
+    if(event.dwDebugEventCode == CREATE_PROCESS_DEBUG_EVENT) {
+      ReadProcessMemory(pi.hProcess, (void*)entryPoint, (void*)&entryData, sizeof entryData, NULL);  //save the original bytes first
+      WriteProcessMemory(pi.hProcess, (void*)entryPoint, (void*)&entryHook, sizeof entryHook, NULL);  //then overwrite them with the stub
+
+      ContinueDebugEvent(event.dwProcessId, event.dwThreadId, DBG_CONTINUE);
+      continue;
+    }
+
+    ContinueDebugEvent(event.dwProcessId, event.dwThreadId, DBG_CONTINUE);
+  }
+
+  return true;  //NOTE(review): pi.hProcess and pi.hThread are not closed before returning
+  #else
+  return false;
+  #endif
+}
+
+}
diff --git a/waterbox/ares64/ares/nall/windows/launcher.hpp b/waterbox/ares64/ares/nall/windows/launcher.hpp
index 0d09faaf7e..1415b2869d 100644
--- a/waterbox/ares64/ares/nall/windows/launcher.hpp
+++ b/waterbox/ares64/ares/nall/windows/launcher.hpp
@@ -1,91 +1,15 @@
 #pragma once
 
+#include <nall/stdint.hpp>
+
 namespace nall {
 
 //launch a new process and inject specified DLL into it
 
-inline auto launch(const char* applicationName, const char* libraryName, u32 entryPoint) -> bool {
-  //if a launcher does not send at least one message, a wait cursor will appear
-  PostThreadMessage(GetCurrentThreadId(), WM_USER, 0, 0);
-  MSG msg;
-  GetMessage(&msg, 0, 0, 0);
-
-  STARTUPINFOW si;
-  PROCESS_INFORMATION pi;
-
-  memset(&si, 0, sizeof(STARTUPINFOW));
-  BOOL result = CreateProcessW(
-    utf16_t(applicationName), GetCommandLineW(), NULL, NULL, TRUE,
-    DEBUG_PROCESS | DEBUG_ONLY_THIS_PROCESS,  //do not break if application creates its own processes
-    NULL, NULL, &si, &pi
-  );
-  if(result == false) return false;
-
-  u8 entryData[1024], entryHook[1024] = {
-    0x68, 0x00, 0x00, 0x00, 0x00,  //push libraryName
-    0xb8, 0x00, 0x00, 0x00, 0x00,  //mov eax,LoadLibraryW
-    0xff, 0xd0,                    //call eax
-    0xcd, 0x03,                    //int 3
-  };
-
-  entryHook[1] = (u8)((entryPoint + 14) >>  0);
-  entryHook[2] = (u8)((entryPoint + 14) >>  8);
-  entryHook[3] = (u8)((entryPoint + 14) >> 16);
-  entryHook[4] = (u8)((entryPoint + 14) >> 24);
-
-  auto pLoadLibraryW = (u32)GetProcAddress(GetModuleHandleW(L"kernel32"), "LoadLibraryW");
-  entryHook[6] = pLoadLibraryW >>  0;
-  entryHook[7] = pLoadLibraryW >>  8;
-  entryHook[8] = pLoadLibraryW >> 16;
-  entryHook[9] = pLoadLibraryW >> 24;
-
-  utf16_t buffer = utf16_t(libraryName);
-  memcpy(entryHook + 14, buffer, 2 * wcslen(buffer) + 2);
-
-  while(true) {
-    DEBUG_EVENT event;
-    WaitForDebugEvent(&event, INFINITE);
-
-    if(event.dwDebugEventCode == EXIT_PROCESS_DEBUG_EVENT) break;
-
-    if(event.dwDebugEventCode == EXCEPTION_DEBUG_EVENT) {
-      if(event.u.Exception.ExceptionRecord.ExceptionCode == EXCEPTION_BREAKPOINT) {
-        if(event.u.Exception.ExceptionRecord.ExceptionAddress == (void*)(entryPoint + 14 - 1)) {
-          HANDLE hProcess = OpenProcess(0, FALSE, event.dwProcessId);
-          HANDLE hThread = OpenThread(THREAD_ALL_ACCESS, FALSE, event.dwThreadId);
-
-          CONTEXT context;
-          context.ContextFlags = CONTEXT_FULL;
-          GetThreadContext(hThread, &context);
-
-          WriteProcessMemory(pi.hProcess, (void*)entryPoint, (void*)&entryData, sizeof entryData, NULL);
-          context.Eip = entryPoint;
-          SetThreadContext(hThread, &context);
-
-          CloseHandle(hThread);
-          CloseHandle(hProcess);
-        }
-
-        ContinueDebugEvent(event.dwProcessId, event.dwThreadId, DBG_CONTINUE);
-        continue;
-      }
-
-      ContinueDebugEvent(event.dwProcessId, event.dwThreadId, DBG_EXCEPTION_NOT_HANDLED);
-      continue;
-    }
-
-    if(event.dwDebugEventCode == CREATE_PROCESS_DEBUG_EVENT) {
-      ReadProcessMemory(pi.hProcess, (void*)entryPoint, (void*)&entryData, sizeof entryData, NULL);
-      WriteProcessMemory(pi.hProcess, (void*)entryPoint, (void*)&entryHook, sizeof entryHook, NULL);
-
-      ContinueDebugEvent(event.dwProcessId, event.dwThreadId, DBG_CONTINUE);
-      continue;
-    }
-
-    ContinueDebugEvent(event.dwProcessId, event.dwThreadId, DBG_CONTINUE);
-  }
-
-  return true;
-}
+auto launch(const char* applicationName, const char* libraryName, u32 entryPoint) -> bool;
 
 }
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/windows/launcher.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/windows/registry.cpp b/waterbox/ares64/ares/nall/windows/registry.cpp
new file mode 100644
index 0000000000..7703ab202e
--- /dev/null
+++ b/waterbox/ares64/ares/nall/windows/registry.cpp
@@ -0,0 +1,113 @@
+#include <nall/windows/registry.hpp>
+
+#include <shlwapi.h>
+
+#ifndef KEY_WOW64_64KEY
+  #define KEY_WOW64_64KEY 0x0100
+#endif
+#ifndef KEY_WOW64_32KEY
+  #define KEY_WOW64_32KEY 0x0200
+#endif
+
+#ifndef NWR_FLAGS
+  #define NWR_FLAGS KEY_WOW64_64KEY
+#endif
+
+#ifndef NWR_SIZE
+  #define NWR_SIZE 4096
+#endif
+
+namespace nall {
+
+NALL_HEADER_INLINE auto registry::root(const string& name) {
+  //translate a hive abbreviation into its predefined handle; unknown names yield a null HKEY
+  return name == "HKCR" ? HKEY_CLASSES_ROOT
+       : name == "HKCC" ? HKEY_CURRENT_CONFIG
+       : name == "HKCU" ? HKEY_CURRENT_USER
+       : name == "HKLM" ? HKEY_LOCAL_MACHINE
+       : name == "HKU"  ? HKEY_USERS : (HKEY)nullptr;
+}
+
+NALL_HEADER_INLINE auto registry::exists(const string& name) -> bool {
+  //name is split on backslashes: the first component selects the hive, the last names the value, the rest is the key path
+  auto components = name.split("\\");
+  HKEY hive = root(components.takeLeft());
+  string valueName = components.takeRight();
+  string keyPath = components.merge("\\");
+  HKEY key;
+  if(RegOpenKeyExW(hive, utf16_t(keyPath), 0, NWR_FLAGS | KEY_READ, &key) != ERROR_SUCCESS) return false;
+  wchar_t buffer[NWR_SIZE] = L"";
+  DWORD bytes = sizeof(buffer);
+  LONG status = RegQueryValueExW(key, utf16_t(valueName), nullptr, nullptr, (LPBYTE)&buffer, (LPDWORD)&bytes);
+  RegCloseKey(key);
+  return status == ERROR_SUCCESS;
+}
+
+NALL_HEADER_INLINE auto registry::read(const string& name) -> string {
+  auto components = name.split("\\");  //first component: hive; last: value name; remainder: key path
+  HKEY hive = root(components.takeLeft());
+  string valueName = components.takeRight();
+  string keyPath = components.merge("\\");
+  HKEY key;
+  if(RegOpenKeyExW(hive, utf16_t(keyPath), 0, NWR_FLAGS | KEY_READ, &key) != ERROR_SUCCESS) return "";
+  wchar_t buffer[NWR_SIZE] = L"";
+  DWORD bytes = sizeof(buffer);
+  LONG status = RegQueryValueExW(key, utf16_t(valueName), nullptr, nullptr, (LPBYTE)&buffer, (LPDWORD)&bytes);
+  RegCloseKey(key);
+  if(status != ERROR_SUCCESS) return "";
+  return (const char*)utf8_t(buffer);
+}
+
+NALL_HEADER_INLINE auto registry::write(const string& name, const string& data) -> void {  //create each key along the path in name, then store data as a REG_SZ value in the last one
+  auto part = name.split("\\");  //first component: hive; last: value name; remainder: key path
+  HKEY handle, rootKey = root(part.takeLeft());
+  string node = part.takeRight(), path;
+  utf16_t wide(data);  //convert once: cbData must be measured on this UTF-16 text, data.length() counts UTF-8 bytes and overstates it for non-ASCII
+  for(u32 n = 0; n < part.size(); n++) {
+    path.append(part[n]);
+    if(RegCreateKeyExW(rootKey, utf16_t(path), 0, nullptr, 0, NWR_FLAGS | KEY_ALL_ACCESS, nullptr, &handle, nullptr) == ERROR_SUCCESS) {
+      if(n == part.size() - 1) {  //only the deepest key receives the value
+        RegSetValueExW(handle, utf16_t(node), 0, REG_SZ, (BYTE*)(wchar_t*)wide, (wcslen(wide) + 1) * sizeof(wchar_t));
+      }
+      RegCloseKey(handle);
+    }
+    path.append("\\");
+  }
+}
+
+NALL_HEADER_INLINE auto registry::remove(const string& name) -> bool {
+  auto components = name.split("\\");  //first component: hive; last: value name; remainder: key path
+  HKEY hive = root(components.takeLeft());
+  string valueName = components.takeRight();
+  string keyPath = components.merge("\\");
+  //an empty final component deletes the whole key; otherwise only the named value is deleted
+  return (valueName ? SHDeleteValueW(hive, utf16_t(keyPath), utf16_t(valueName)) : SHDeleteKeyW(hive, utf16_t(keyPath))) == ERROR_SUCCESS;
+}
+
+NALL_HEADER_INLINE auto registry::contents(const string& name) -> vector<string> {  //list the subkeys (each suffixed with a backslash) and then the value names of a key
+  vector<string> result;
+  auto part = name.split("\\");
+  HKEY handle, rootKey = root(part.takeLeft());
+  part.removeRight();  //the final component is dropped; the remaining components form the key path
+  string path = part.merge("\\");
+  if(RegOpenKeyExW(rootKey, utf16_t(path), 0, NWR_FLAGS | KEY_READ, &handle) == ERROR_SUCCESS) {
+    DWORD folders = 0, nodes = 0;  //stay zero if the query below fails, so neither loop runs on garbage counts
+    RegQueryInfoKeyW(handle, nullptr, nullptr, nullptr, &folders, nullptr, nullptr, &nodes, nullptr, nullptr, nullptr, nullptr);
+    for(u32 n = 0; n < folders; n++) {
+      wchar_t entry[NWR_SIZE] = L"";
+      DWORD size = NWR_SIZE;  //capacity in characters (not bytes), as RegEnumKeyExW expects
+      RegEnumKeyExW(handle, n, entry, &size, nullptr, nullptr, nullptr, nullptr);
+      result.append(string{(const char*)utf8_t(entry), "\\"});
+    }
+    for(u32 n = 0; n < nodes; n++) {
+      wchar_t entry[NWR_SIZE] = L"";
+      DWORD size = NWR_SIZE;  //capacity in characters; a byte count here would let a long value name overrun entry
+      RegEnumValueW(handle, n, entry, &size, nullptr, nullptr, nullptr, nullptr);
+      result.append((const char*)utf8_t(entry));
+    }
+    RegCloseKey(handle);
+  }
+  return result;
+}
+
+}
diff --git a/waterbox/ares64/ares/nall/windows/registry.hpp b/waterbox/ares64/ares/nall/windows/registry.hpp
index e63e037db4..46f59ed2f3 100644
--- a/waterbox/ares64/ares/nall/windows/registry.hpp
+++ b/waterbox/ares64/ares/nall/windows/registry.hpp
@@ -1,119 +1,27 @@
-#pragma once
-
-#include <nall/platform.hpp>
-#include <nall/string.hpp>
-
-#include <shlwapi.h>
-#undef interface
-#ifndef KEY_WOW64_64KEY
-  #define KEY_WOW64_64KEY 0x0100
-#endif
-#ifndef KEY_WOW64_32KEY
-  #define KEY_WOW64_32KEY 0x0200
-#endif
-
-#ifndef NWR_FLAGS
-  #define NWR_FLAGS KEY_WOW64_64KEY
-#endif
-
-#ifndef NWR_SIZE
-  #define NWR_SIZE 4096
-#endif
-
-namespace nall {
-
-struct registry {
-  static auto exists(const string& name) -> bool {
-    auto part = name.split("\\");
-    HKEY handle, rootKey = root(part.takeLeft());
-    string node = part.takeRight();
-    string path = part.merge("\\");
-    if(RegOpenKeyExW(rootKey, utf16_t(path), 0, NWR_FLAGS | KEY_READ, &handle) == ERROR_SUCCESS) {
-      wchar_t data[NWR_SIZE] = L"";
-      DWORD size = NWR_SIZE * sizeof(wchar_t);
-      LONG result = RegQueryValueExW(handle, utf16_t(node), nullptr, nullptr, (LPBYTE)&data, (LPDWORD)&size);
-      RegCloseKey(handle);
-      if(result == ERROR_SUCCESS) return true;
-    }
-    return false;
-  }
-
-  static auto read(const string& name) -> string {
-    auto part = name.split("\\");
-    HKEY handle, rootKey = root(part.takeLeft());
-    string node = part.takeRight();
-    string path = part.merge("\\");
-    if(RegOpenKeyExW(rootKey, utf16_t(path), 0, NWR_FLAGS | KEY_READ, &handle) == ERROR_SUCCESS) {
-      wchar_t data[NWR_SIZE] = L"";
-      DWORD size = NWR_SIZE * sizeof(wchar_t);
-      LONG result = RegQueryValueExW(handle, utf16_t(node), nullptr, nullptr, (LPBYTE)&data, (LPDWORD)&size);
-      RegCloseKey(handle);
-      if(result == ERROR_SUCCESS) return (const char*)utf8_t(data);
-    }
-    return "";
-  }
-
-  static auto write(const string& name, const string& data = "") -> void {
-    auto part = name.split("\\");
-    HKEY handle, rootKey = root(part.takeLeft());
-    string node = part.takeRight(), path;
-    DWORD disposition;
-    for(u32 n = 0; n < part.size(); n++) {
-      path.append(part[n]);
-      if(RegCreateKeyExW(rootKey, utf16_t(path), 0, nullptr, 0, NWR_FLAGS | KEY_ALL_ACCESS, nullptr, &handle, &disposition) == ERROR_SUCCESS) {
-        if(n == part.size() - 1) {
-          RegSetValueExW(handle, utf16_t(node), 0, REG_SZ, (BYTE*)(wchar_t*)utf16_t(data), (data.length() + 1) * sizeof(wchar_t));
-        }
-        RegCloseKey(handle);
-      }
-      path.append("\\");
-    }
-  }
-
-  static auto remove(const string& name) -> bool {
-    auto part = name.split("\\");
-    HKEY rootKey = root(part.takeLeft());
-    string node = part.takeRight();
-    string path = part.merge("\\");
-    if(!node) return SHDeleteKeyW(rootKey, utf16_t(path)) == ERROR_SUCCESS;
-    return SHDeleteValueW(rootKey, utf16_t(path), utf16_t(node)) == ERROR_SUCCESS;
-  }
-
-  static auto contents(const string& name) -> vector<string> {
-    vector<string> result;
-    auto part = name.split("\\");
-    HKEY handle, rootKey = root(part.takeLeft());
-    part.removeRight();
-    string path = part.merge("\\");
-    if(RegOpenKeyExW(rootKey, utf16_t(path), 0, NWR_FLAGS | KEY_READ, &handle) == ERROR_SUCCESS) {
-      DWORD folders, nodes;
-      RegQueryInfoKey(handle, nullptr, nullptr, nullptr, &folders, nullptr, nullptr, &nodes, nullptr, nullptr, nullptr, nullptr);
-      for(u32 n = 0; n < folders; n++) {
-        wchar_t name[NWR_SIZE] = L"";
-        DWORD size = NWR_SIZE * sizeof(wchar_t);
-        RegEnumKeyEx(handle, n, (wchar_t*)&name, &size, nullptr, nullptr, nullptr, nullptr);
-        result.append(string{(const char*)utf8_t(name), "\\"});
-      }
-      for(u32 n = 0; n < nodes; n++) {
-        wchar_t name[NWR_SIZE] = L"";
-        DWORD size = NWR_SIZE * sizeof(wchar_t);
-        RegEnumValueW(handle, n, (wchar_t*)&name, &size, nullptr, nullptr, nullptr, nullptr);
-        result.append((const char*)utf8_t(name));
-      }
-      RegCloseKey(handle);
-    }
-    return result;
-  }
-
-private:
-  static auto root(const string& name) -> HKEY {
-    if(name == "HKCR") return HKEY_CLASSES_ROOT;
-    if(name == "HKCC") return HKEY_CURRENT_CONFIG;
-    if(name == "HKCU") return HKEY_CURRENT_USER;
-    if(name == "HKLM") return HKEY_LOCAL_MACHINE;
-    if(name == "HKU" ) return HKEY_USERS;
-    return nullptr;
-  }
-};
-
-}
+#pragma once
+
+#include <nall/platform.hpp>
+#include <nall/string.hpp>
+
+namespace nall {
+
+struct registry {  //static helpers over the Win32 registry; name is a backslash-separated path whose first component selects the root key
+  static auto exists(const string& name) -> bool;  //declared here, defined out of line (see nall/windows/registry.cpp)
+
+  static auto read(const string& name) -> string;
+
+  static auto write(const string& name, const string& data = "") -> void;  //stores data as a REG_SZ string value
+
+  static auto remove(const string& name) -> bool;  //deletes one value, or the whole key when the value component is empty
+
+  static auto contents(const string& name) -> vector<string>;  //subkey names (with a trailing backslash) followed by value names
+
+private:
+  static auto root(const string& name);  //NOTE(review): deduced return type, so the definition must be visible before any call
+};
+
+}
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/windows/registry.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/windows/utf8.cpp b/waterbox/ares64/ares/nall/windows/utf8.cpp
new file mode 100644
index 0000000000..bf6c13fc39
--- /dev/null
+++ b/waterbox/ares64/ares/nall/windows/utf8.cpp
@@ -0,0 +1,37 @@
+#include <nall/windows/utf8.hpp>
+
+#include <shellapi.h>
+
+namespace nall {
+
+NALL_HEADER_INLINE auto utf16_t::operator=(const char* s) -> utf16_t& {  //replaces the held text with the UTF-16 form of s
+  reset();  //start from a clean state before allocating the new buffer
+  const char* source = s ? s : "";  //a null pointer converts like an empty string
+  length = MultiByteToWideChar(CP_UTF8, 0, source, -1, nullptr, 0);  //sizing pass: no output buffer is supplied
+  buffer = new wchar_t[length + 1];
+  MultiByteToWideChar(CP_UTF8, 0, source, -1, buffer, length);  //conversion pass into the new buffer
+  buffer[length] = 0;  //terminate explicitly, whatever the conversion wrote
+  return *this;
+}
+
+NALL_HEADER_INLINE auto utf8_t::operator=(const wchar_t* s) -> utf8_t& {  //replaces the held text with the UTF-8 form of s
+  reset();  //releases the previously held buffer
+  const wchar_t* source = s ? s : L"";  //a null pointer converts like an empty string
+  length = WideCharToMultiByte(CP_UTF8, 0, source, -1, nullptr, 0, nullptr, nullptr);  //sizing pass: no output buffer is supplied
+  buffer = new char[length + 1];
+  WideCharToMultiByte(CP_UTF8, 0, source, -1, buffer, length, nullptr, nullptr);  //conversion pass into the new buffer
+  buffer[length] = 0;  //terminate explicitly, whatever the conversion wrote
+  return *this;
+}
+
+NALL_HEADER_INLINE auto utf8_arguments(int& argc, char**& argv) -> void {  //overwrites argc/argv with UTF-8 copies of the process command line
+  wchar_t** wargv = CommandLineToArgvW(GetCommandLineW(), &argc);  //UTF-16 argument vector allocated by the shell API
+  argv = new char*[argc + 1]();  //value-initialised, so argv[argc] stays null
+  for(int i = 0; i < argc; i++) {  //signed index to match argc and avoid a mixed-sign comparison
+    utf8_t arg(wargv[i]);
+    argv[i] = strcpy(new char[arg.size() + 1], arg);  //heap copy that outlives the temporary converter
+  }
+  LocalFree(wargv);  //the block returned by CommandLineToArgvW is released with a single LocalFree call
+}
+
+}
diff --git a/waterbox/ares64/ares/nall/windows/utf8.hpp b/waterbox/ares64/ares/nall/windows/utf8.hpp
index b13d260adb..d97f20f452 100644
--- a/waterbox/ares64/ares/nall/windows/utf8.hpp
+++ b/waterbox/ares64/ares/nall/windows/utf8.hpp
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <nall/stdint.hpp>
+
 namespace nall {
   //UTF-8 to UTF-16
   struct utf16_t {
@@ -9,15 +11,7 @@ namespace nall {
     utf16_t(const utf16_t&) = delete;
     auto operator=(const utf16_t&) -> utf16_t& = delete;
 
-    auto operator=(const char* s) -> utf16_t& {
-      reset();
-      if(!s) s = "";
-      length = MultiByteToWideChar(CP_UTF8, 0, s, -1, nullptr, 0);
-      buffer = new wchar_t[length + 1];
-      MultiByteToWideChar(CP_UTF8, 0, s, -1, buffer, length);
-      buffer[length] = 0;
-      return *this;
-    }
+    auto operator=(const char* s) -> utf16_t&;
 
     operator wchar_t*() { return buffer; }
     operator const wchar_t*() const { return buffer; }
@@ -45,15 +39,7 @@ namespace nall {
     utf8_t(const utf8_t&) = delete;
     auto operator=(const utf8_t&) -> utf8_t& = delete;
 
-    auto operator=(const wchar_t* s) -> utf8_t& {
-      reset();
-      if(!s) s = L"";
-      length = WideCharToMultiByte(CP_UTF8, 0, s, -1, nullptr, 0, nullptr, nullptr);
-      buffer = new char[length + 1];
-      WideCharToMultiByte(CP_UTF8, 0, s, -1, buffer, length, nullptr, nullptr);
-      buffer[length] = 0;
-      return *this;
-    }
+    auto operator=(const wchar_t* s) -> utf8_t&;
 
     auto reset() -> void {
       delete[] buffer;
@@ -73,12 +59,10 @@ namespace nall {
     u32 length = 0;
   };
 
-  inline auto utf8_arguments(int& argc, char**& argv) -> void {
-    wchar_t** wargv = CommandLineToArgvW(GetCommandLineW(), &argc);
-    argv = new char*[argc + 1]();
-    for(u32 i = 0; i < argc; i++) {
-      argv[i] = new char[PATH_MAX];
-      strcpy(argv[i], nall::utf8_t(wargv[i]));
-    }
-  }
+  auto utf8_arguments(int& argc, char**& argv) -> void;
+
 }
+
+#if defined(NALL_HEADER_ONLY)
+  #include <nall/windows/utf8.cpp>
+#endif
diff --git a/waterbox/ares64/ares/nall/windows/windows.hpp b/waterbox/ares64/ares/nall/windows/windows.hpp
new file mode 100644
index 0000000000..edb1b2e90d
--- /dev/null
+++ b/waterbox/ares64/ares/nall/windows/windows.hpp
@@ -0,0 +1,12 @@
+#pragma once
+
+#undef  NOMINMAX  //each macro is undefined first so the define below cannot clash with an earlier definition; NOMINMAX keeps <windows.h> from defining min/max macros
+#define NOMINMAX
+
+#undef  UNICODE  //selects the wide-character (W) variants of the generic Win32 API names
+#define UNICODE
+
+#undef  WIN32_LEAN_AND_MEAN  //asks <windows.h> to leave out its less commonly used sub-headers
+#define WIN32_LEAN_AND_MEAN
+
+#include <windows.h>  //has to come after the three macros above, which configure what it declares
diff --git a/waterbox/ares64/ares/thirdparty/angrylion-rdp b/waterbox/ares64/ares/thirdparty/angrylion-rdp
index 7b20c1c308..1bd37704d0 160000
--- a/waterbox/ares64/ares/thirdparty/angrylion-rdp
+++ b/waterbox/ares64/ares/thirdparty/angrylion-rdp
@@ -1 +1 @@
-Subproject commit 7b20c1c308340fcf8ccd15b97ebbb52a61c3cca0
+Subproject commit 1bd37704d0d48d85ab2ed36140d8f9c012dcf804
diff --git a/waterbox/ares64/ares/thirdparty/sljit/API_CHANGES b/waterbox/ares64/ares/thirdparty/sljit/API_CHANGES
index 34e2702c16..8ec367e302 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/API_CHANGES
+++ b/waterbox/ares64/ares/thirdparty/sljit/API_CHANGES
@@ -1,5 +1,30 @@
 This file is the short summary of the API changes:
 
+02.02.2023 - Backward compatible
+    All SLJIT_SET_* constants are
+    even numbers.
+
+27.01.2023 - Non-backward compatible
+    The arguments of sljit_emit_shift_into
+    are changed.
+
+17.12.2022 - Non-backward compatible
+    Replace sljit_emit_fast_enter and
+    sljit_get_return_address with
+    sljit_emit_op_dst.
+
+13.12.2022 - Non-backward compatible
+    Replace SLJIT_NOT with SLJIT_XOR.
+
+10.11.2022 - Non-backward compatible
+    Extract the pre/post update operations from
+    sljit_emit_mem to sljit_emit_mem_update
+    and sljit_emit_fmem to sljit_emit_fmem_update.
+
+04.11.2022 - Non-backward compatible
+    The SLJIT_32 flag is combined with the type
+    argument of cmov, not the dst_reg.
+
 16.06.2022 - Non-backward compatible
     Remove SLJIT_ENTER_CDECL and SLJIT_CALL_CDECL.
     The default calling mode is cdecl now.
@@ -8,7 +33,7 @@ This file is the short summary of the API changes:
     Floating point comparison types are renamed.
 
 01.03.2022 - Non-backward compatible
-    Remove SLJIT_NEG. Instead substraction from
+    Remove SLJIT_NEG. Instead subtraction from
     immedate 0 is preferred.
 
 31.01.2022 - Non-backward compatible
diff --git a/waterbox/ares64/ares/thirdparty/sljit/Makefile b/waterbox/ares64/ares/thirdparty/sljit/Makefile
index 8ac03ad773..eb85af35b3 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/Makefile
+++ b/waterbox/ares64/ares/thirdparty/sljit/Makefile
@@ -32,7 +32,10 @@ EXAMPLE_TARGET = $(BINDIR)/func_call $(BINDIR)/first_program $(BINDIR)/branch $(
 SLJIT_HEADERS = $(SRCDIR)/sljitLir.h $(SRCDIR)/sljitConfig.h $(SRCDIR)/sljitConfigInternal.h
 
 SLJIT_LIR_FILES = $(SRCDIR)/sljitLir.c $(SRCDIR)/sljitUtils.c \
-	$(SRCDIR)/sljitExecAllocator.c $(SRCDIR)/sljitProtExecAllocator.c $(SRCDIR)/sljitWXExecAllocator.c \
+	$(SRCDIR)/allocator_src/sljitExecAllocatorCore.c $(SRCDIR)/allocator_src/sljitExecAllocatorApple.c \
+	$(SRCDIR)/allocator_src/sljitExecAllocatorPosix.c $(SRCDIR)/allocator_src/sljitExecAllocatorWindows.c \
+	$(SRCDIR)/allocator_src/sljitProtExecAllocatorNetBSD.c $(SRCDIR)/allocator_src/sljitProtExecAllocatorPosix.c \
+	$(SRCDIR)/allocator_src/sljitWXExecAllocatorPosix.c $(SRCDIR)/allocator_src/sljitWXExecAllocatorWindows.c \
 	$(SRCDIR)/sljitNativeARM_32.c $(SRCDIR)/sljitNativeARM_T2_32.c $(SRCDIR)/sljitNativeARM_64.c \
 	$(SRCDIR)/sljitNativeMIPS_common.c $(SRCDIR)/sljitNativeMIPS_32.c $(SRCDIR)/sljitNativeMIPS_64.c \
 	$(SRCDIR)/sljitNativePPC_common.c $(SRCDIR)/sljitNativePPC_32.c $(SRCDIR)/sljitNativePPC_64.c \
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorApple.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorApple.c
new file mode 100644
index 0000000000..6352377c4d
--- /dev/null
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorApple.c
@@ -0,0 +1,118 @@
+/*
+ *    Stack-less Just-In-Time compiler
+ *
+ *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice, this list of
+ *      conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *      of conditions and the following disclaimer in the documentation and/or other materials
+ *      provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/mman.h>
+/*
+   On macOS systems, returns MAP_JIT if it is defined _and_ we're running on a
+   version where it's OK to have more than one JIT block or where MAP_JIT is
+   required.
+   On non-macOS systems, returns MAP_JIT if it is defined.
+*/
+#include <TargetConditionals.h>
+#if TARGET_OS_OSX
+#if defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86
+#include <sys/utsname.h>
+#include <stdlib.h>
+
+#define SLJIT_MAP_JIT	(get_map_jit_flag())
+#define SLJIT_UPDATE_WX_FLAGS(from, to, enable_exec)
+
+static SLJIT_INLINE int get_map_jit_flag(void)
+{
+	size_t page_size;
+	void *ptr;
+	struct utsname name;
+	static int map_jit_flag = -1; /* -1: not probed yet; afterwards 0 or MAP_JIT */
+
+	if (map_jit_flag < 0) {
+		map_jit_flag = 0;
+
+		/* Kernel version for 10.14.0 (Mojave) or later. If uname() fails,
+		   name.release is indeterminate, so the 0 default is kept. */
+		if (uname(&name) >= 0 && atoi(name.release) >= 18) {
+			page_size = get_page_alignment() + 1;
+			/* Only use MAP_JIT if a hardened runtime is used */
+			ptr = mmap(NULL, page_size, PROT_WRITE | PROT_EXEC,
+					MAP_PRIVATE | MAP_ANON, -1, 0);
+
+			if (ptr != MAP_FAILED)
+				munmap(ptr, page_size);
+			else
+				map_jit_flag = MAP_JIT;
+		}
+	}
+	return map_jit_flag;
+}
+#else /* !SLJIT_CONFIG_X86 */
+#if !(defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM)
+#error "Unsupported architecture"
+#endif /* SLJIT_CONFIG_ARM */
+
+#include <AvailabilityMacros.h>
+#include <pthread.h>
+
+#define SLJIT_MAP_JIT	(MAP_JIT)
+#define SLJIT_UPDATE_WX_FLAGS(from, to, enable_exec) \
+		apple_update_wx_flags(enable_exec)
+
+static SLJIT_INLINE void apple_update_wx_flags(sljit_s32 enable_exec) /* backs SLJIT_UPDATE_WX_FLAGS on Apple ARM targets */
+{
+#if MAC_OS_X_VERSION_MIN_REQUIRED >= 110000
+	pthread_jit_write_protect_np(enable_exec); /* non-zero: JIT pages executable, zero: JIT pages writable (calling thread) */
+#else
+#error "Must target Big Sur or newer"
+#endif /* BigSur */
+}
+#endif /* SLJIT_CONFIG_X86 */
+#else /* !TARGET_OS_OSX */
+#define SLJIT_MAP_JIT	(MAP_JIT)
+#endif /* TARGET_OS_OSX */
+
+static SLJIT_INLINE void* alloc_chunk(sljit_uw size)
+{
+	/* Anonymous private mapping; SLJIT_MAP_JIT may expand to a function
+	   call, so it is evaluated exactly once here. */
+	const int map_flags = MAP_PRIVATE | MAP_ANON | SLJIT_MAP_JIT;
+	void *chunk = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC,
+			map_flags, -1, 0);
+
+	/* Callers test for NULL, not MAP_FAILED. */
+	if (chunk == MAP_FAILED)
+		return NULL;
+
+	/* enable_exec == 0; the macro expands to nothing in the x86 configuration. */
+	SLJIT_UPDATE_WX_FLAGS(chunk, (uint8_t *)chunk + size, 0);
+	return chunk;
+}
+
+static SLJIT_INLINE void free_chunk(void *chunk, sljit_uw size) /* unmaps size bytes starting at chunk */
+{
+	munmap(chunk, size); /* return value is not checked */
+}
+
+#include "sljitExecAllocatorCore.c"
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitExecAllocator.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorCore.c
similarity index 64%
rename from waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitExecAllocator.c
rename to waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorCore.c
index 92d940ddc2..6cd391104c 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitExecAllocator.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorCore.c
@@ -61,166 +61,42 @@
      [           one big free block           ]
 */
 
-/* --------------------------------------------------------------------- */
-/*  System (OS) functions                                                */
-/* --------------------------------------------------------------------- */
+/* Expected functions:
+     alloc_chunk / free_chunk :
+       * allocate executable system memory chunks
+       * the size is always divisible by CHUNK_SIZE
+     SLJIT_ALLOCATOR_LOCK / SLJIT_ALLOCATOR_UNLOCK :
+       * provided as part of sljitUtils
+       * only the allocator requires this lock, sljit is fully thread safe
+         as it only uses local variables
 
-/* 64 KByte. */
-#define CHUNK_SIZE	(sljit_uw)0x10000u
-
-/*
-   alloc_chunk / free_chunk :
-     * allocate executable system memory chunks
-     * the size is always divisible by CHUNK_SIZE
-   SLJIT_ALLOCATOR_LOCK / SLJIT_ALLOCATOR_UNLOCK :
-     * provided as part of sljitUtils
-     * only the allocator requires this lock, sljit is fully thread safe
-       as it only uses local variables
+   Supported defines:
+     SLJIT_HAS_CHUNK_HEADER - (optional) sljit_chunk_header is defined
+     SLJIT_HAS_EXECUTABLE_OFFSET - (optional) has executable offset data
+     SLJIT_UPDATE_WX_FLAGS - (optional) update WX flags
 */
 
-#ifdef _WIN32
-#define SLJIT_UPDATE_WX_FLAGS(from, to, enable_exec)
+#ifdef SLJIT_HAS_CHUNK_HEADER
+#define CHUNK_HEADER_SIZE (sizeof(struct sljit_chunk_header))
+#else /* !SLJIT_HAS_CHUNK_HEADER */
+#define CHUNK_HEADER_SIZE 0
+#endif /* SLJIT_HAS_CHUNK_HEADER */
 
-static SLJIT_INLINE void* alloc_chunk(sljit_uw size)
-{
-	return VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
-}
-
-static SLJIT_INLINE void free_chunk(void *chunk, sljit_uw size)
-{
-	SLJIT_UNUSED_ARG(size);
-	VirtualFree(chunk, 0, MEM_RELEASE);
-}
-
-#else /* POSIX */
-
-#if defined(__APPLE__) && defined(MAP_JIT)
-/*
-   On macOS systems, returns MAP_JIT if it is defined _and_ we're running on a
-   version where it's OK to have more than one JIT block or where MAP_JIT is
-   required.
-   On non-macOS systems, returns MAP_JIT if it is defined.
-*/
-#include <TargetConditionals.h>
-#if TARGET_OS_OSX
-#if defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86
-#ifdef MAP_ANON
-#include <sys/utsname.h>
-#include <stdlib.h>
-
-#define SLJIT_MAP_JIT	(get_map_jit_flag())
-
-static SLJIT_INLINE int get_map_jit_flag()
-{
-	size_t page_size;
-	void *ptr;
-	struct utsname name;
-	static int map_jit_flag = -1;
-
-	if (map_jit_flag < 0) {
-		map_jit_flag = 0;
-		uname(&name);
-
-		/* Kernel version for 10.14.0 (Mojave) or later */
-		if (atoi(name.release) >= 18) {
-			page_size = get_page_alignment() + 1;
-			/* Only use MAP_JIT if a hardened runtime is used */
-			ptr = mmap(NULL, page_size, PROT_WRITE | PROT_EXEC,
-					MAP_PRIVATE | MAP_ANON, -1, 0);
-
-			if (ptr != MAP_FAILED)
-				munmap(ptr, page_size);
-			else
-				map_jit_flag = MAP_JIT;
-		}
-	}
-	return map_jit_flag;
-}
-#endif /* MAP_ANON */
-#else /* !SLJIT_CONFIG_X86 */
-#if !(defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM)
-#error "Unsupported architecture"
-#endif /* SLJIT_CONFIG_ARM */
-#include <AvailabilityMacros.h>
-#include <pthread.h>
-
-#define SLJIT_MAP_JIT	(MAP_JIT)
-#define SLJIT_UPDATE_WX_FLAGS(from, to, enable_exec) \
-                        apple_update_wx_flags(enable_exec)
-
-static SLJIT_INLINE void apple_update_wx_flags(sljit_s32 enable_exec)
-{
-#if MAC_OS_X_VERSION_MIN_REQUIRED >= 110000
-	pthread_jit_write_protect_np(enable_exec);
-#else
-#error "Must target Big Sur or newer"
-#endif /* BigSur */
-}
-#endif /* SLJIT_CONFIG_X86 */
-#else /* !TARGET_OS_OSX */
-#define SLJIT_MAP_JIT	(MAP_JIT)
-#endif /* TARGET_OS_OSX */
-#endif /* __APPLE__ && MAP_JIT */
 #ifndef SLJIT_UPDATE_WX_FLAGS
 #define SLJIT_UPDATE_WX_FLAGS(from, to, enable_exec)
-#endif /* !SLJIT_UPDATE_WX_FLAGS */
-#ifndef SLJIT_MAP_JIT
-#define SLJIT_MAP_JIT	(0)
-#endif /* !SLJIT_MAP_JIT */
+#endif /* SLJIT_UPDATE_WX_FLAGS */
 
-static SLJIT_INLINE void* alloc_chunk(sljit_uw size)
-{
-	void *retval;
-	int prot = PROT_READ | PROT_WRITE | PROT_EXEC;
-	int flags = MAP_PRIVATE;
-	int fd = -1;
-
-#ifdef PROT_MAX
-	prot |= PROT_MAX(prot);
-#endif
-
-#ifdef MAP_ANON
-	flags |= MAP_ANON | SLJIT_MAP_JIT;
-#else /* !MAP_ANON */
-	if (SLJIT_UNLIKELY((dev_zero < 0) && open_dev_zero()))
-		return NULL;
-
-	fd = dev_zero;
-#endif /* MAP_ANON */
-
-	retval = mmap(NULL, size, prot, flags, fd, 0);
-	if (retval == MAP_FAILED)
-		return NULL;
-
-#ifdef __FreeBSD__
-        /* HardenedBSD's mmap lies, so check permissions again */
-	if (mprotect(retval, size, PROT_READ | PROT_WRITE | PROT_EXEC) < 0) {
-		munmap(retval, size);
-		return NULL;
-	}
-#endif /* FreeBSD */
-
-	SLJIT_UPDATE_WX_FLAGS(retval, (uint8_t *)retval + size, 0);
-
-	return retval;
-}
-
-static SLJIT_INLINE void free_chunk(void *chunk, sljit_uw size)
-{
-	munmap(chunk, size);
-}
-
-#endif /* windows */
-
-/* --------------------------------------------------------------------- */
-/*  Common functions                                                     */
-/* --------------------------------------------------------------------- */
-
-#define CHUNK_MASK	(~(CHUNK_SIZE - 1))
+#ifndef CHUNK_SIZE
+/* 64 KByte if not specified. */
+#define CHUNK_SIZE	(sljit_uw)0x10000
+#endif /* CHUNK_SIZE */
 
 struct block_header {
 	sljit_uw size;
 	sljit_uw prev_size;
+#ifdef SLJIT_HAS_EXECUTABLE_OFFSET
+	sljit_sw executable_offset;
+#endif /* SLJIT_HAS_EXECUTABLE_OFFSET */
 };
 
 struct free_block {
@@ -234,8 +110,10 @@ struct free_block {
 	((struct block_header*)(((sljit_u8*)base) + offset))
 #define AS_FREE_BLOCK(base, offset) \
 	((struct free_block*)(((sljit_u8*)base) + offset))
-#define MEM_START(base)		((void*)(((sljit_u8*)base) + sizeof(struct block_header)))
+#define MEM_START(base)		((void*)((base) + 1))
+#define CHUNK_MASK		(~(CHUNK_SIZE - 1))
 #define ALIGN_SIZE(size)	(((size) + sizeof(struct block_header) + 7u) & ~(sljit_uw)7)
+#define CHUNK_EXTRA_SIZE	(sizeof(struct block_header) + CHUNK_HEADER_SIZE)
 
 static struct free_block* free_blocks;
 static sljit_uw allocated_size;
@@ -273,11 +151,21 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
 	struct free_block *free_block;
 	sljit_uw chunk_size;
 
-	SLJIT_ALLOCATOR_LOCK();
+#ifdef SLJIT_HAS_CHUNK_HEADER
+	struct sljit_chunk_header *chunk_header;
+#else /* !SLJIT_HAS_CHUNK_HEADER */
+	void *chunk_header;
+#endif /* SLJIT_HAS_CHUNK_HEADER */
+
+#ifdef SLJIT_HAS_EXECUTABLE_OFFSET
+	sljit_sw executable_offset;
+#endif /* SLJIT_HAS_EXECUTABLE_OFFSET */
+
 	if (size < (64 - sizeof(struct block_header)))
 		size = (64 - sizeof(struct block_header));
 	size = ALIGN_SIZE(size);
 
+	SLJIT_ALLOCATOR_LOCK();
 	free_block = free_blocks;
 	while (free_block) {
 		if (free_block->size >= size) {
@@ -289,6 +177,9 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
 				free_block->size = chunk_size;
 				header = AS_BLOCK_HEADER(free_block, chunk_size);
 				header->prev_size = chunk_size;
+#ifdef SLJIT_HAS_EXECUTABLE_OFFSET
+				header->executable_offset = free_block->header.executable_offset;
+#endif /* SLJIT_HAS_EXECUTABLE_OFFSET */
 				AS_BLOCK_HEADER(header, size)->prev_size = size;
 			}
 			else {
@@ -304,17 +195,28 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
 		free_block = free_block->next;
 	}
 
-	chunk_size = (size + sizeof(struct block_header) + CHUNK_SIZE - 1) & CHUNK_MASK;
-	header = (struct block_header*)alloc_chunk(chunk_size);
-	if (!header) {
+	chunk_size = (size + CHUNK_EXTRA_SIZE + CHUNK_SIZE - 1) & CHUNK_MASK;
+
+	chunk_header = alloc_chunk(chunk_size);
+	if (!chunk_header) {
 		SLJIT_ALLOCATOR_UNLOCK();
 		return NULL;
 	}
 
-	chunk_size -= sizeof(struct block_header);
+#ifdef SLJIT_HAS_EXECUTABLE_OFFSET
+	executable_offset = (sljit_sw)((sljit_u8*)chunk_header->executable - (sljit_u8*)chunk_header);
+#endif /* SLJIT_HAS_EXECUTABLE_OFFSET */
+
+	chunk_size -= CHUNK_EXTRA_SIZE;
 	total_size += chunk_size;
 
+	header = (struct block_header*)(((sljit_u8*)chunk_header) + CHUNK_HEADER_SIZE);
+
 	header->prev_size = 0;
+#ifdef SLJIT_HAS_EXECUTABLE_OFFSET
+	header->executable_offset = executable_offset;
+#endif /* SLJIT_HAS_EXECUTABLE_OFFSET */
+
 	if (chunk_size > size + 64) {
 		/* Cut the allocated space into a free and a used block. */
 		allocated_size += size;
@@ -323,6 +225,9 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
 
 		free_block = AS_FREE_BLOCK(header, size);
 		free_block->header.prev_size = size;
+#ifdef SLJIT_HAS_EXECUTABLE_OFFSET
+		free_block->header.executable_offset = executable_offset;
+#endif /* SLJIT_HAS_EXECUTABLE_OFFSET */
 		sljit_insert_free_block(free_block, chunk_size);
 		next_header = AS_BLOCK_HEADER(free_block, chunk_size);
 	}
@@ -332,9 +237,12 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
 		header->size = chunk_size;
 		next_header = AS_BLOCK_HEADER(header, chunk_size);
 	}
+	SLJIT_ALLOCATOR_UNLOCK();
 	next_header->size = 1;
 	next_header->prev_size = chunk_size;
-	SLJIT_ALLOCATOR_UNLOCK();
+#ifdef SLJIT_HAS_EXECUTABLE_OFFSET
+	next_header->executable_offset = executable_offset;
+#endif /* SLJIT_HAS_EXECUTABLE_OFFSET */
 	return MEM_START(header);
 }
 
@@ -345,11 +253,15 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr)
 
 	SLJIT_ALLOCATOR_LOCK();
 	header = AS_BLOCK_HEADER(ptr, -(sljit_sw)sizeof(struct block_header));
+#ifdef SLJIT_HAS_EXECUTABLE_OFFSET
+	header = AS_BLOCK_HEADER(header, -header->executable_offset);
+#endif /* SLJIT_HAS_EXECUTABLE_OFFSET */
 	allocated_size -= header->size;
 
-	/* Connecting free blocks together if possible. */
 	SLJIT_UPDATE_WX_FLAGS(NULL, NULL, 0);
 
+	/* Connecting free blocks together if possible. */
+
 	/* If header->prev_size == 0, free_block will equal to header.
 	   In this case, free_block->header.size will be > 0. */
 	free_block = AS_FREE_BLOCK(header, -(sljit_sw)header->prev_size);
@@ -377,7 +289,7 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr)
 		if (total_size - free_block->size > (allocated_size * 3 / 2)) {
 			total_size -= free_block->size;
 			sljit_remove_free_block(free_block);
-			free_chunk(free_block, free_block->size + sizeof(struct block_header));
+			free_chunk(free_block, free_block->size + CHUNK_EXTRA_SIZE);
 		}
 	}
 
@@ -400,7 +312,7 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_free_unused_memory_exec(void)
 				AS_BLOCK_HEADER(free_block, free_block->size)->size == 1) {
 			total_size -= free_block->size;
 			sljit_remove_free_block(free_block);
-			free_chunk(free_block, free_block->size + sizeof(struct block_header));
+			free_chunk(free_block, free_block->size + CHUNK_EXTRA_SIZE);
 		}
 		free_block = next_free_block;
 	}
@@ -409,3 +321,10 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_free_unused_memory_exec(void)
 	SLJIT_UPDATE_WX_FLAGS(NULL, NULL, 1);
 	SLJIT_ALLOCATOR_UNLOCK();
 }
+
+#ifdef SLJIT_HAS_EXECUTABLE_OFFSET
+SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr) /* ptr: block start as returned by sljit_malloc_exec */
+{
+	return (((struct block_header *)ptr) - 1)->executable_offset; /* the header sits directly in front of ptr */
+}
+#endif /* SLJIT_HAS_EXECUTABLE_OFFSET */
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorFreeBSD.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorFreeBSD.c
new file mode 100644
index 0000000000..3b93a4df76
--- /dev/null
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorFreeBSD.c
@@ -0,0 +1,89 @@
+/*
+ *    Stack-less Just-In-Time compiler
+ *
+ *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice, this list of
+ *      conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *      of conditions and the following disclaimer in the documentation and/or other materials
+ *      provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/mman.h>
+#include <sys/procctl.h>
+
+#ifdef PROC_WXMAP_CTL
+static SLJIT_INLINE int sljit_is_wx_block(void)
+{
+	/* Asked once, then cached: -1 = not asked yet, 0 = procctl() succeeded, 1 = procctl() failed. */
+	static int wx_blocked = -1;
+	int permit_request = PROC_WX_MAPPINGS_PERMIT;
+	if (wx_blocked < 0)
+		wx_blocked = (procctl(P_PID, 0, PROC_WXMAP_CTL, &permit_request) != 0);
+	return wx_blocked;
+}
+
+#define SLJIT_IS_WX_BLOCK sljit_is_wx_block()
+#else /* !PROC_WXMAP_CTL */
+#define SLJIT_IS_WX_BLOCK (1)
+#endif /* PROC_WXMAP_CTL */
+
+static SLJIT_INLINE void* alloc_chunk(sljit_uw size) /* returns a read/write/exec mapping of size bytes, or NULL */
+{
+	void *retval;
+	int prot = PROT_READ | PROT_WRITE | PROT_EXEC;
+	int flags = MAP_PRIVATE;
+	int fd = -1;
+	int retried = 0; /* non-zero once the single retry below has been used */
+#ifdef PROT_MAX
+	prot |= PROT_MAX(prot);
+#endif
+
+#ifdef MAP_ANON
+	flags |= MAP_ANON;
+#else /* !MAP_ANON */
+	if (SLJIT_UNLIKELY((dev_zero < 0) && open_dev_zero()))
+		return NULL;
+
+	fd = dev_zero;
+#endif /* MAP_ANON */
+
+retry:
+	retval = mmap(NULL, size, prot, flags, fd, 0);
+	if (retval == MAP_FAILED) {
+		/* Retry once only: the W^X probe is cached, so an unbounded retry would spin on a persistent failure. */
+		if (retried++ == 0 && !SLJIT_IS_WX_BLOCK)
+			goto retry;
+		return NULL;
+	}
+
+	/* HardenedBSD's mmap lies, so check permissions again. */
+	if (mprotect(retval, size, PROT_READ | PROT_WRITE | PROT_EXEC) < 0) {
+		munmap(retval, size);
+		return NULL;
+	}
+
+	return retval;
+}
+
+static SLJIT_INLINE void free_chunk(void *chunk, sljit_uw size) /* unmaps size bytes starting at chunk */
+{
+	munmap(chunk, size); /* return value is not checked */
+}
+
+#include "sljitExecAllocatorCore.c"
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorPosix.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorPosix.c
new file mode 100644
index 0000000000..a775f5629a
--- /dev/null
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorPosix.c
@@ -0,0 +1,62 @@
+/*
+ *    Stack-less Just-In-Time compiler
+ *
+ *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice, this list of
+ *      conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *      of conditions and the following disclaimer in the documentation and/or other materials
+ *      provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/mman.h>
+
+/* Maps `size` bytes as a private read/write/execute region.
+ * Returns the mapping, or NULL when it cannot be created. */
+static SLJIT_INLINE void* alloc_chunk(sljit_uw size)
+{
+	int prot = PROT_READ | PROT_WRITE | PROT_EXEC;
+	int flags = MAP_PRIVATE;
+	int fd = -1;
+	void *mapping;
+
+#ifdef PROT_MAX
+	prot |= PROT_MAX(prot);
+#endif
+
+#ifdef MAP_ANON
+	flags |= MAP_ANON;
+#else /* !MAP_ANON */
+	/* Without MAP_ANON the mapping is backed by the dev_zero descriptor. */
+	if (SLJIT_UNLIKELY((dev_zero < 0) && open_dev_zero()))
+		return NULL;
+	fd = dev_zero;
+#endif /* MAP_ANON */
+
+	mapping = mmap(NULL, size, prot, flags, fd, 0);
+
+	return (mapping == MAP_FAILED) ? NULL : mapping;
+}
+
+static SLJIT_INLINE void free_chunk(void *chunk, sljit_uw size)
+{
+	munmap(chunk, size); /* unmap the `size`-byte region at `chunk`; the result is ignored */
+}
+
+#include "sljitExecAllocatorCore.c"
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorWindows.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorWindows.c
new file mode 100644
index 0000000000..f152a5a2cd
--- /dev/null
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitExecAllocatorWindows.c
@@ -0,0 +1,40 @@
+/*
+ *    Stack-less Just-In-Time compiler
+ *
+ *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice, this list of
+ *      conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *      of conditions and the following disclaimer in the documentation and/or other materials
+ *      provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define SLJIT_UPDATE_WX_FLAGS(from, to, enable_exec)
+
+static SLJIT_INLINE void* alloc_chunk(sljit_uw size)
+{
+	return VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE); /* reserve and commit `size` bytes with read/write/execute access */
+}
+
+static SLJIT_INLINE void free_chunk(void *chunk, sljit_uw size)
+{
+	SLJIT_UNUSED_ARG(size); /* VirtualFree() is called with a size of 0 below, so `size` is not used */
+	VirtualFree(chunk, 0, MEM_RELEASE);
+}
+
+#include "sljitExecAllocatorCore.c"
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitProtExecAllocatorNetBSD.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitProtExecAllocatorNetBSD.c
new file mode 100644
index 0000000000..0b7fd57787
--- /dev/null
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitProtExecAllocatorNetBSD.c
@@ -0,0 +1,72 @@
+/*
+ *    Stack-less Just-In-Time compiler
+ *
+ *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice, this list of
+ *      conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *      of conditions and the following disclaimer in the documentation and/or other materials
+ *      provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define SLJIT_HAS_CHUNK_HEADER
+#define SLJIT_HAS_EXECUTABLE_OFFSET
+
+struct sljit_chunk_header {
+	void *executable; /* set by alloc_chunk() to the MAP_REMAPDUP duplicate it makes read/execute */
+};
+
+/*
+ * MAP_REMAPDUP is a NetBSD extension available since 8.0, make sure to
+ * adjust your feature macros (ex: -D_NETBSD_SOURCE) as needed
+ */
+/* Maps `size` bytes read/write, duplicates the mapping with MAP_REMAPDUP and
+ * switches the duplicate to read/execute. Returns the writable view, whose
+ * header records the duplicate, or NULL on failure. */
+static SLJIT_INLINE struct sljit_chunk_header* alloc_chunk(sljit_uw size)
+{
+	void *exec_view;
+	struct sljit_chunk_header *rw_view = (struct sljit_chunk_header *)mmap(NULL, size,
+			PROT_READ | PROT_WRITE | PROT_MPROTECT(PROT_EXEC),
+			MAP_ANON | MAP_SHARED, -1, 0);
+
+	if (rw_view == MAP_FAILED)
+		return NULL;
+
+	exec_view = mremap(rw_view, size, NULL, size, MAP_REMAPDUP);
+	if (exec_view != MAP_FAILED) {
+		rw_view->executable = exec_view;
+		if (mprotect(exec_view, size, PROT_READ | PROT_EXEC) != -1)
+			return rw_view;
+		munmap(exec_view, size);
+	}
+
+	/* Either step failed: drop the writable view as well. */
+	munmap((void *)rw_view, size);
+	return NULL;
+}
+
+/* Unmaps both views of a chunk; `chunk` is treated as pointing just past its sljit_chunk_header. */
+static SLJIT_INLINE void free_chunk(void *chunk, sljit_uw size)
+{
+	struct sljit_chunk_header *base = (struct sljit_chunk_header *)chunk;
+	munmap(base[-1].executable, size);
+	munmap((void *)(base - 1), size);
+}
+
+#include "sljitExecAllocatorCore.c"
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitProtExecAllocatorPosix.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitProtExecAllocatorPosix.c
new file mode 100644
index 0000000000..f7cb6c5670
--- /dev/null
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitProtExecAllocatorPosix.c
@@ -0,0 +1,172 @@
+/*
+ *    Stack-less Just-In-Time compiler
+ *
+ *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice, this list of
+ *      conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *      of conditions and the following disclaimer in the documentation and/or other materials
+ *      provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define SLJIT_HAS_CHUNK_HEADER
+#define SLJIT_HAS_EXECUTABLE_OFFSET
+
+struct sljit_chunk_header {
+	void *executable; /* set by alloc_chunk() to the PROT_READ | PROT_EXEC mapping of the backing file */
+};
+
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+
+#ifndef O_NOATIME
+#define O_NOATIME 0
+#endif
+
+/* this is a linux extension available since kernel 3.11 */
+#ifndef O_TMPFILE
+#define O_TMPFILE 0x404000
+#endif
+
+#ifndef _GNU_SOURCE
+char *secure_getenv(const char *name);
+int mkostemp(char *template, int flags);
+#endif
+
+/* Creates the file that backs a chunk: an unnamed (memfd/O_TMPFILE) or already
+ * unlinked temporary file. Returns its descriptor, or -1 if none could be created. */
+static SLJIT_INLINE int create_tempfile(void)
+{
+	int fd;
+	char tmp_name[256];
+	size_t tmp_name_len = 0;
+	char *dir;
+	struct stat st;
+#if defined(SLJIT_SINGLE_THREADED) && SLJIT_SINGLE_THREADED
+	mode_t mode;
+#endif
+#ifdef HAVE_MEMFD_CREATE
+	/* this is a GNU extension, make sure to use -D_GNU_SOURCE */
+	fd = memfd_create("sljit", MFD_CLOEXEC);
+	if (fd != -1) {
+		fchmod(fd, 0);
+		return fd;
+	}
+#endif
+
+	/* Prefer $TMPDIR when it names an existing directory that fits the buffer. */
+	dir = secure_getenv("TMPDIR");
+	if (dir) {
+		size_t len = strlen(dir);
+		if (len > 0 && len < sizeof(tmp_name)) {
+			if ((stat(dir, &st) == 0) && S_ISDIR(st.st_mode)) {
+				memcpy(tmp_name, dir, len + 1);
+				tmp_name_len = len;
+			}
+		}
+	}
+
+#ifdef P_tmpdir
+	if (!tmp_name_len) {
+		tmp_name_len = strlen(P_tmpdir);
+		if (tmp_name_len > 0 && tmp_name_len < sizeof(tmp_name))
+			strcpy(tmp_name, P_tmpdir);
+		else
+			tmp_name_len = 0; /* nothing was copied: fall back to "/tmp" below */
+	}
+#endif
+	if (!tmp_name_len) {
+		strcpy(tmp_name, "/tmp");
+		tmp_name_len = 4;
+	}
+
+	SLJIT_ASSERT(tmp_name_len > 0 && tmp_name_len < sizeof(tmp_name));
+	if (tmp_name_len > 1 && tmp_name[tmp_name_len - 1] == '/')
+		tmp_name[--tmp_name_len] = '\0';
+
+	fd = open(tmp_name, O_TMPFILE | O_EXCL | O_RDWR | O_NOATIME | O_CLOEXEC, 0);
+	if (fd != -1)
+		return fd;
+
+	if (tmp_name_len >= sizeof(tmp_name) - 7)
+		return -1;
+
+	strcpy(tmp_name + tmp_name_len, "/XXXXXX");
+#if defined(SLJIT_SINGLE_THREADED) && SLJIT_SINGLE_THREADED
+	mode = umask(0777);
+#endif
+	fd = mkostemp(tmp_name, O_CLOEXEC | O_NOATIME);
+#if defined(SLJIT_SINGLE_THREADED) && SLJIT_SINGLE_THREADED
+	umask(mode);
+#endif
+	if (fd == -1)
+		return -1;
+#if !(defined(SLJIT_SINGLE_THREADED) && SLJIT_SINGLE_THREADED)
+	fchmod(fd, 0); /* only reached with a valid descriptor */
+#endif
+	if (unlink(tmp_name)) {
+		close(fd);
+		return -1;
+	}
+	return fd;
+}
+
+/* Backs one chunk with a temporary file mapped twice: a read/write view, which
+ * is returned, and a read/execute view stored in the header's `executable`
+ * field. Returns NULL if any step fails. */
+static SLJIT_INLINE struct sljit_chunk_header* alloc_chunk(sljit_uw size)
+{
+	struct sljit_chunk_header *writable = NULL;
+	void *runnable;
+	int fd = create_tempfile();
+
+	if (fd == -1)
+		return NULL;
+
+	if (ftruncate(fd, (off_t)size) == 0) {
+		writable = (struct sljit_chunk_header *)mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+		if (writable == MAP_FAILED)
+			writable = NULL;
+	}
+
+	if (writable != NULL) {
+		runnable = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0);
+		if (runnable == MAP_FAILED) {
+			munmap((void *)writable, size);
+			writable = NULL;
+		} else {
+			writable->executable = runnable;
+		}
+	}
+
+	/* The descriptor is closed on every path once the mapping attempts are done. */
+	close(fd);
+	return writable;
+}
+
+/* Unmaps both views of a chunk; `chunk` is treated as pointing just past its sljit_chunk_header. */
+static SLJIT_INLINE void free_chunk(void *chunk, sljit_uw size)
+{
+	struct sljit_chunk_header *base = (struct sljit_chunk_header *)chunk;
+	munmap(base[-1].executable, size);
+	munmap((void *)(base - 1), size);
+}
+
+#include "sljitExecAllocatorCore.c"
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitWXExecAllocator.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitWXExecAllocatorPosix.c
similarity index 66%
rename from waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitWXExecAllocator.c
rename to waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitWXExecAllocatorPosix.c
index 6893813155..36d301434a 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitWXExecAllocator.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitWXExecAllocatorPosix.c
@@ -25,8 +25,7 @@
  */
 
 /*
-   This file contains a simple W^X executable memory allocator for POSIX
-   like systems and Windows
+   This file contains a simple W^X executable memory allocator
 
    In *NIX, MAP_ANON is required (that is considered a feature) so make
    sure to set the right availability macros for your system or the code
@@ -51,55 +50,41 @@
    not possible.
 */
 
-#define SLJIT_UPDATE_WX_FLAGS(from, to, enable_exec) \
-	sljit_update_wx_flags((from), (to), (enable_exec))
-
-#ifndef _WIN32
 #include <sys/types.h>
 #include <sys/mman.h>
 
-#ifdef __NetBSD__
-#define SLJIT_PROT_WX PROT_MPROTECT(PROT_EXEC)
-#define check_se_protected(ptr, size) (0)
-#else /* POSIX */
+#define SLJIT_UPDATE_WX_FLAGS(from, to, enable_exec) \
+	sljit_update_wx_flags((from), (to), (enable_exec))
+
 #if !(defined SLJIT_SINGLE_THREADED && SLJIT_SINGLE_THREADED)
 #include <pthread.h>
 #define SLJIT_SE_LOCK()		pthread_mutex_lock(&se_lock)
 #define SLJIT_SE_UNLOCK()	pthread_mutex_unlock(&se_lock)
+#else
+#define SLJIT_SE_LOCK()
+#define SLJIT_SE_UNLOCK()
 #endif /* !SLJIT_SINGLE_THREADED */
 
-#define check_se_protected(ptr, size) generic_se_protected(ptr, size)
+#define SLJIT_WX_IS_BLOCK(ptr, size) generic_check_is_wx_block(ptr, size)
 
-static SLJIT_INLINE int generic_se_protected(void *ptr, sljit_uw size)
+static SLJIT_INLINE int generic_check_is_wx_block(void *ptr, sljit_uw size)
 {
 	if (SLJIT_LIKELY(!mprotect(ptr, size, PROT_EXEC)))
-		return mprotect(ptr, size, PROT_READ | PROT_WRITE);
+		return !!mprotect(ptr, size, PROT_READ | PROT_WRITE);
 
-	return -1;
+	return 1;
 }
-#endif /* NetBSD */
-
-#ifndef SLJIT_SE_LOCK
-#define SLJIT_SE_LOCK()
-#endif
-#ifndef SLJIT_SE_UNLOCK
-#define SLJIT_SE_UNLOCK()
-#endif
-#ifndef SLJIT_PROT_WX
-#define SLJIT_PROT_WX 0
-#endif
 
 SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
 {
-#if !(defined SLJIT_SINGLE_THREADED && SLJIT_SINGLE_THREADED) \
-	&& !defined(__NetBSD__)
+#if !(defined SLJIT_SINGLE_THREADED && SLJIT_SINGLE_THREADED)
 	static pthread_mutex_t se_lock = PTHREAD_MUTEX_INITIALIZER;
 #endif
-	static int se_protected = !SLJIT_PROT_WX;
-	int prot = PROT_READ | PROT_WRITE | SLJIT_PROT_WX;
+	static int wx_block = -1;
+	int prot = PROT_READ | PROT_WRITE;
 	sljit_uw* ptr;
 
-	if (SLJIT_UNLIKELY(se_protected < 0))
+	if (SLJIT_UNLIKELY(wx_block > 0))
 		return NULL;
 
 #ifdef PROT_MAX
@@ -112,11 +97,11 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
 	if (ptr == MAP_FAILED)
 		return NULL;
 
-	if (SLJIT_UNLIKELY(se_protected > 0)) {
+	if (SLJIT_UNLIKELY(wx_block < 0)) {
 		SLJIT_SE_LOCK();
-		se_protected = check_se_protected(ptr, size);
+		wx_block = SLJIT_WX_IS_BLOCK(ptr, size);
 		SLJIT_SE_UNLOCK();
-		if (SLJIT_UNLIKELY(se_protected < 0)) {
+		if (SLJIT_UNLIKELY(wx_block)) {
 			munmap((void *)ptr, size);
 			return NULL;
 		}
@@ -126,7 +111,6 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
 	return ptr;
 }
 
-#undef SLJIT_PROT_WX
 #undef SLJIT_SE_UNLOCK
 #undef SLJIT_SE_LOCK
 
@@ -136,7 +120,7 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr)
 	munmap((void*)start_ptr, *start_ptr);
 }
 
-static void sljit_update_wx_flags(void *from, void *to, sljit_s32 enable_exec)
+static void sljit_update_wx_flags(void *from, void *to, int enable_exec)
 {
 	sljit_uw page_mask = (sljit_uw)get_page_alignment();
 	sljit_uw start = (sljit_uw)from;
@@ -151,53 +135,6 @@ static void sljit_update_wx_flags(void *from, void *to, sljit_s32 enable_exec)
 	mprotect((void*)start, end - start, prot);
 }
 
-#else /* windows */
-
-SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
-{
-	sljit_uw *ptr;
-
-	size += sizeof(sljit_uw);
-	ptr = (sljit_uw*)VirtualAlloc(NULL, size,
-				MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
-
-	if (!ptr)
-		return NULL;
-
-	*ptr++ = size;
-
-	return ptr;
-}
-
-SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr)
-{
-	sljit_uw start = (sljit_uw)ptr - sizeof(sljit_uw);
-#if defined(SLJIT_DEBUG) && SLJIT_DEBUG
-	sljit_uw page_mask = (sljit_uw)get_page_alignment();
-
-	SLJIT_ASSERT(!(start & page_mask));
-#endif
-	VirtualFree((void*)start, 0, MEM_RELEASE);
-}
-
-static void sljit_update_wx_flags(void *from, void *to, sljit_s32 enable_exec)
-{
-	DWORD oldprot;
-	sljit_uw page_mask = (sljit_uw)get_page_alignment();
-	sljit_uw start = (sljit_uw)from;
-	sljit_uw end = (sljit_uw)to;
-	DWORD prot = enable_exec ? PAGE_EXECUTE : PAGE_READWRITE;
-
-	SLJIT_ASSERT(start < end);
-
-	start &= ~page_mask;
-	end = (end + page_mask) & ~page_mask;
-
-	VirtualProtect((void*)start, end - start, prot, &oldprot);
-}
-
-#endif /* !windows */
-
 SLJIT_API_FUNC_ATTRIBUTE void sljit_free_unused_memory_exec(void)
 {
 	/* This allocator does not keep unused memory for future allocations. */
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitWXExecAllocatorWindows.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitWXExecAllocatorWindows.c
new file mode 100644
index 0000000000..a9553bd7da
--- /dev/null
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/allocator_src/sljitWXExecAllocatorWindows.c
@@ -0,0 +1,102 @@
+/*
+ *    Stack-less Just-In-Time compiler
+ *
+ *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice, this list of
+ *      conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *      of conditions and the following disclaimer in the documentation and/or other materials
+ *      provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+   This file contains a simple W^X executable memory allocator
+
+   In *NIX, MAP_ANON is required (that is considered a feature) so make
+   sure to set the right availability macros for your system or the code
+   will fail to build.
+
+   If your system doesn't support mapping of anonymous pages (ex: IRIX) it
+   is also likely that it doesn't need this allocator and should be using
+   the standard one instead.
+
+   It allocates a separate map for each code block and may waste a lot of
+   memory, because whatever was requested, will be rounded up to the page
+   size (minimum 4KB, but could be even bigger).
+
+   It changes the page permissions (RW <-> RX) as needed and therefore, if you
+   will be updating the code after it has been generated, need to make sure to
+   block any concurrent execution, or could result in a SIGBUS, that could
+   even manifest itself at a different address than the one that was being
+   modified.
+
+   Only use if you are unable to use the regular allocator because of security
+   restrictions and adding exceptions to your application or the system are
+   not possible.
+*/
+
+#define SLJIT_UPDATE_WX_FLAGS(from, to, enable_exec) \
+	sljit_update_wx_flags((from), (to), (enable_exec))
+
+/* Allocates `size` bytes plus one leading sljit_uw word of read/write memory.
+ * The total size is stored in that first word and the address just past it is
+ * returned; returns NULL if VirtualAlloc() fails. */
+SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
+{
+	sljit_uw total = size + sizeof(sljit_uw);
+	sljit_uw *base = (sljit_uw*)VirtualAlloc(NULL, total,
+				MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
+
+	if (base == NULL)
+		return NULL;
+
+	base[0] = total;
+	return base + 1;
+}
+
+/* Frees a block from sljit_malloc_exec(): steps back over the size word and releases the region. */
+SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr)
+{
+	sljit_uw *base = (sljit_uw*)ptr - 1;
+#if defined(SLJIT_DEBUG) && SLJIT_DEBUG
+	/* Debug builds assert that the base has none of the get_page_alignment() bits set. */
+	SLJIT_ASSERT(((sljit_uw)base & (sljit_uw)get_page_alignment()) == 0);
+#endif
+	VirtualFree((void*)base, 0, MEM_RELEASE);
+}
+
+/* Switches the pages covering [from, to) to PAGE_EXECUTE when enable_exec is
+ * non-zero, otherwise to PAGE_READWRITE. The VirtualProtect() result is ignored. */
+static void sljit_update_wx_flags(void *from, void *to, sljit_s32 enable_exec)
+{
+	const sljit_uw mask = (sljit_uw)get_page_alignment();
+	sljit_uw first = (sljit_uw)from;
+	sljit_uw last = (sljit_uw)to;
+	DWORD previous;
+
+	SLJIT_ASSERT(first < last);
+	/* Round the start down and the end up using the get_page_alignment() mask. */
+	first &= ~mask;
+	last = (last + mask) & ~mask;
+	VirtualProtect((void*)first, last - first, enable_exec ? PAGE_EXECUTE : PAGE_READWRITE, &previous);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE void sljit_free_unused_memory_exec(void)
+{
+	/* Intentionally empty: this allocator does not keep unused memory for future allocations. */
+}
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitConfig.h b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitConfig.h
index 5fba7aa638..e11d4a2e1a 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitConfig.h
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitConfig.h
@@ -96,7 +96,9 @@ extern "C" {
 
 /* Executable code allocation:
    If SLJIT_EXECUTABLE_ALLOCATOR is not defined, the application should
-   define SLJIT_MALLOC_EXEC, SLJIT_FREE_EXEC, and SLJIT_EXEC_OFFSET. */
+   define SLJIT_MALLOC_EXEC and SLJIT_FREE_EXEC.
+   Optionally, depending on the implementation used for the allocator,
+   SLJIT_EXEC_OFFSET and SLJIT_UPDATE_WX_FLAGS might also be needed. */
 #ifndef SLJIT_EXECUTABLE_ALLOCATOR
 /* Enabled by default. */
 #define SLJIT_EXECUTABLE_ALLOCATOR 1
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitConfigInternal.h b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitConfigInternal.h
index bad7772bea..e9bd4d9322 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitConfigInternal.h
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitConfigInternal.h
@@ -61,6 +61,8 @@ extern "C" {
      SLJIT_BIG_ENDIAN : big endian architecture
      SLJIT_UNALIGNED : unaligned memory accesses for non-fpu operations are supported
      SLJIT_FPU_UNALIGNED : unaligned memory accesses for fpu operations are supported
+     SLJIT_MASKED_SHIFT : all word shifts are always masked
+     SLJIT_MASKED_SHIFT32 : all 32 bit shifts are always masked
      SLJIT_INDIRECT_CALL : see SLJIT_FUNC_ADDR() for more information
 
    Constants:
@@ -134,23 +136,23 @@ extern "C" {
 /********************************************************/
 
 #if (defined SLJIT_CONFIG_AUTO && SLJIT_CONFIG_AUTO)
-
 #ifndef _WIN32
 
 #if defined(__i386__) || defined(__i386)
 #define SLJIT_CONFIG_X86_32 1
 #elif defined(__x86_64__)
 #define SLJIT_CONFIG_X86_64 1
-#elif defined(__arm__) || defined(__ARM__)
-#ifdef __thumb2__
-#define SLJIT_CONFIG_ARM_THUMB2 1
-#elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__)
-#define SLJIT_CONFIG_ARM_V7 1
-#else
-#define SLJIT_CONFIG_ARM_V5 1
-#endif
-#elif defined (__aarch64__)
+#elif defined(__aarch64__)
 #define SLJIT_CONFIG_ARM_64 1
+#elif defined(__thumb2__)
+#define SLJIT_CONFIG_ARM_THUMB2 1
+#elif (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \
+	((defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7S__)) \
+	 || (defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8R__)) \
+	 || (defined(__ARM_ARCH_9A__)))
+#define SLJIT_CONFIG_ARM_V7 1
+#elif defined(__arm__) || defined (__ARM__)
+#define SLJIT_CONFIG_ARM_V5 1
 #elif defined(__ppc64__) || defined(__powerpc64__) || (defined(_ARCH_PPC64) && defined(__64BIT__)) || (defined(_POWER) && defined(__64BIT__))
 #define SLJIT_CONFIG_PPC_64 1
 #elif defined(__ppc__) || defined(__powerpc__) || defined(_ARCH_PPC) || defined(_ARCH_PWR) || defined(_ARCH_PWR2) || defined(_POWER)
@@ -603,7 +605,7 @@ typedef double sljit_f64;
 #endif
 #endif /* SLJIT_INDIRECT_CALL */
 
-/* The offset which needs to be substracted from the return address to
+/* The offset which needs to be subtracted from the return address to
 determine the next executed instruction after return. */
 #ifndef SLJIT_RETURN_ADDRESS_OFFSET
 #define SLJIT_RETURN_ADDRESS_OFFSET 0
@@ -631,12 +633,14 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_free_unused_memory_exec(void);
 #if (defined SLJIT_PROT_EXECUTABLE_ALLOCATOR && SLJIT_PROT_EXECUTABLE_ALLOCATOR)
 SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
 #define SLJIT_EXEC_OFFSET(ptr) sljit_exec_offset(ptr)
-#else
-#define SLJIT_EXEC_OFFSET(ptr) 0
 #endif
 
 #endif /* SLJIT_EXECUTABLE_ALLOCATOR */
 
+#ifndef SLJIT_EXEC_OFFSET
+#define SLJIT_EXEC_OFFSET(ptr) 0
+#endif
+
 /**********************************************/
 /* Registers and locals offset determination. */
 /**********************************************/
@@ -649,6 +653,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 0
 #define SLJIT_LOCALS_OFFSET_BASE (8 * SSIZE_OF(sw))
 #define SLJIT_PREF_SHIFT_REG SLJIT_R2
+#define SLJIT_MASKED_SHIFT 1
+#define SLJIT_MASKED_SHIFT32 1
 
 #elif (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 
@@ -664,6 +670,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
 #define SLJIT_LOCALS_OFFSET_BASE (4 * SSIZE_OF(sw))
 #endif /* !_WIN64 */
 #define SLJIT_PREF_SHIFT_REG SLJIT_R3
+#define SLJIT_MASKED_SHIFT 1
+#define SLJIT_MASKED_SHIFT32 1
 
 #elif (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5) || (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7)
 
@@ -688,6 +696,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
 #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 30
 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 8
 #define SLJIT_LOCALS_OFFSET_BASE (2 * (sljit_s32)sizeof(sljit_sw))
+#define SLJIT_MASKED_SHIFT 1
+#define SLJIT_MASKED_SHIFT32 1
 
 #elif (defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)
 
@@ -717,6 +727,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
 #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 29
 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 8
 #endif
+#define SLJIT_MASKED_SHIFT 1
+#define SLJIT_MASKED_SHIFT32 1
 
 #elif (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV)
 
@@ -725,6 +737,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
 #define SLJIT_LOCALS_OFFSET_BASE 0
 #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 30
 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 12
+#define SLJIT_MASKED_SHIFT 1
+#define SLJIT_MASKED_SHIFT32 1
 
 #elif (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
 
@@ -754,6 +768,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
 #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 15
 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 8
 #define SLJIT_LOCALS_OFFSET_BASE SLJIT_S390X_DEFAULT_STACK_FRAME_SIZE
+#define SLJIT_MASKED_SHIFT 1
 
 #elif (defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED)
 
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitLir.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitLir.c
index 91765656b5..4a73e8e495 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitLir.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitLir.c
@@ -93,7 +93,8 @@
 #define SSIZE_OF(type) ((sljit_s32)sizeof(sljit_ ## type))
 
 #define VARIABLE_FLAG_SHIFT (10)
-#define VARIABLE_FLAG_MASK (0x3f << VARIABLE_FLAG_SHIFT)
+/* All variable flags are even. */
+#define VARIABLE_FLAG_MASK (0x3e << VARIABLE_FLAG_SHIFT)
 #define GET_FLAG_TYPE(op) ((op) >> VARIABLE_FLAG_SHIFT)
 
 #define GET_OPCODE(op) \
@@ -254,9 +255,9 @@
 	(((scratches < SLJIT_NUMBER_OF_SCRATCH_REGISTERS ? 0 : (scratches - SLJIT_NUMBER_OF_SCRATCH_REGISTERS)) + \
 		(saveds) + (sljit_s32)(extra)) * (sljit_s32)sizeof(sljit_sw))
 
-#define GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, size) \
+#define GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, type) \
 	(((fscratches < SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS ? 0 : (fscratches - SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS)) + \
-		(fsaveds)) * (sljit_s32)(size))
+		(fsaveds)) * SSIZE_OF(type))
 
 #define ADJUST_LOCAL_OFFSET(p, i) \
 	if ((p) == (SLJIT_MEM1(SLJIT_SP))) \
@@ -272,25 +273,49 @@
 #if (defined SLJIT_EXECUTABLE_ALLOCATOR && SLJIT_EXECUTABLE_ALLOCATOR)
 
 #if (defined SLJIT_PROT_EXECUTABLE_ALLOCATOR && SLJIT_PROT_EXECUTABLE_ALLOCATOR)
-#include "sljitProtExecAllocator.c"
-#elif (defined SLJIT_WX_EXECUTABLE_ALLOCATOR && SLJIT_WX_EXECUTABLE_ALLOCATOR)
-#include "sljitWXExecAllocator.c"
+
+#if defined(__NetBSD__)
+#include "allocator_src/sljitProtExecAllocatorNetBSD.c"
 #else
-#include "sljitExecAllocator.c"
+#include "allocator_src/sljitProtExecAllocatorPosix.c"
+#endif
+
+#elif (defined SLJIT_WX_EXECUTABLE_ALLOCATOR && SLJIT_WX_EXECUTABLE_ALLOCATOR)
+
+#if defined(_WIN32)
+#include "allocator_src/sljitWXExecAllocatorWindows.c"
+#else
+#include "allocator_src/sljitWXExecAllocatorPosix.c"
+#endif
+
+#else
+
+#if defined(_WIN32)
+#include "allocator_src/sljitExecAllocatorWindows.c"
+#elif defined(__APPLE__)
+#include "allocator_src/sljitExecAllocatorApple.c"
+#elif defined(__FreeBSD__)
+#include "allocator_src/sljitExecAllocatorFreeBSD.c"
+#else
+#include "allocator_src/sljitExecAllocatorPosix.c"
 #endif
 
 #endif
 
+#else /* !SLJIT_EXECUTABLE_ALLOCATOR */
+
+#ifndef SLJIT_UPDATE_WX_FLAGS
+#define SLJIT_UPDATE_WX_FLAGS(from, to, enable_exec)
+#endif
+
+#endif /* SLJIT_EXECUTABLE_ALLOCATOR */
+
 #if (defined SLJIT_PROT_EXECUTABLE_ALLOCATOR && SLJIT_PROT_EXECUTABLE_ALLOCATOR)
 #define SLJIT_ADD_EXEC_OFFSET(ptr, exec_offset) ((sljit_u8 *)(ptr) + (exec_offset))
 #else
 #define SLJIT_ADD_EXEC_OFFSET(ptr, exec_offset) ((sljit_u8 *)(ptr))
 #endif
 
-#ifndef SLJIT_UPDATE_WX_FLAGS
-#define SLJIT_UPDATE_WX_FLAGS(from, to, enable_exec)
-#endif
-
 /* Argument checking features. */
 
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
@@ -993,19 +1018,21 @@ static const char* op0_names[] = {
 static const char* op1_names[] = {
 	"", ".u8", ".s8", ".u16",
 	".s16", ".u32", ".s32", "32",
-	".p", "not", "clz",
+	".p", "clz", "ctz", "rev"
 };
 
 static const char* op2_names[] = {
 	"add", "addc", "sub", "subc",
 	"mul", "and", "or", "xor",
-	"shl", "lshr", "ashr",
+	"shl", "mshl", "lshr", "mlshr",
+	"ashr", "mashr", "rotl", "rotr"
 };
 
-static const char* op_src_names[] = {
+static const char* op_src_dst_names[] = {
 	"fast_return", "skip_frames_before_fast_return",
 	"prefetch_l1", "prefetch_l2",
 	"prefetch_l3", "prefetch_once",
+	"fast_enter", "get_return_address"
 };
 
 static const char* fop1_names[] = {
@@ -1095,7 +1122,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_enter(struct sljit_compil
 	CHECK_ARGUMENT(fsaveds >= 0 && fsaveds <= SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS);
 	CHECK_ARGUMENT(fscratches + fsaveds <= SLJIT_NUMBER_OF_FLOAT_REGISTERS);
 	CHECK_ARGUMENT(local_size >= 0 && local_size <= SLJIT_MAX_LOCAL_SIZE);
-	CHECK_ARGUMENT((arg_types & SLJIT_ARG_FULL_MASK) < SLJIT_ARG_TYPE_F64);
+	CHECK_ARGUMENT((arg_types & SLJIT_ARG_FULL_MASK) <= SLJIT_ARG_TYPE_F32);
 	CHECK_ARGUMENT(function_check_arguments(arg_types, scratches, (options & SLJIT_ENTER_REG_ARG) ? 0 : saveds, fscratches));
 
 	compiler->last_flags = 0;
@@ -1125,7 +1152,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_enter(struct sljit_compil
 				fprintf(compiler->verbose, " keep:%d,", SLJIT_KEPT_SAVEDS_COUNT(options));
 		}
 
-		fprintf(compiler->verbose, "scratches:%d, saveds:%d, fscratches:%d, fsaveds:%d, local_size:%d\n",
+		fprintf(compiler->verbose, " scratches:%d, saveds:%d, fscratches:%d, fsaveds:%d, local_size:%d\n",
 			scratches, saveds, fscratches, fsaveds, local_size);
 	}
 #endif
@@ -1223,35 +1250,53 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_return(struct sljit_compi
 	case SLJIT_ARG_TYPE_P:
 		CHECK_ARGUMENT(op == SLJIT_MOV_P);
 		break;
+	case SLJIT_ARG_TYPE_F64:
+		CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
+		CHECK_ARGUMENT(op == SLJIT_MOV_F64);
+		break;
+	case SLJIT_ARG_TYPE_F32:
+		CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
+		CHECK_ARGUMENT(op == SLJIT_MOV_F32);
+		break;
 	default:
 		/* Context not initialized, void, etc. */
 		CHECK_ARGUMENT(0);
 		break;
 	}
-	FUNCTION_CHECK_SRC(src, srcw);
+
+	if (GET_OPCODE(op) < SLJIT_MOV_F64) {
+		FUNCTION_CHECK_SRC(src, srcw);
+	} else {
+		FUNCTION_FCHECK(src, srcw);
+	}
 	compiler->last_flags = 0;
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
-		fprintf(compiler->verbose, "  return%s%s ", !(op & SLJIT_32) ? "" : "32",
-			op1_names[GET_OPCODE(op) - SLJIT_OP1_BASE]);
-		sljit_verbose_param(compiler, src, srcw);
+		if (GET_OPCODE(op) < SLJIT_MOV_F64) {
+			fprintf(compiler->verbose, "  return%s%s ", !(op & SLJIT_32) ? "" : "32",
+				op1_names[GET_OPCODE(op) - SLJIT_OP1_BASE]);
+			sljit_verbose_param(compiler, src, srcw);
+		} else {
+			fprintf(compiler->verbose, "  return%s ", !(op & SLJIT_32) ? ".f64" : ".f32");
+			sljit_verbose_fparam(compiler, src, srcw);
+		}
 		fprintf(compiler->verbose, "\n");
 	}
 #endif
 	CHECK_RETURN_OK;
 }
 
-static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	FUNCTION_CHECK_DST(dst, dstw);
-	compiler->last_flags = 0;
+	FUNCTION_CHECK_SRC(src, srcw);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
-		fprintf(compiler->verbose, "  fast_enter ");
-		sljit_verbose_param(compiler, dst, dstw);
+		fprintf(compiler->verbose, "  return_to ");
+		sljit_verbose_param(compiler, src, srcw);
 		fprintf(compiler->verbose, "\n");
 	}
 #endif
@@ -1291,15 +1336,13 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op1(struct sljit_compiler
 	}
 
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_MOV && GET_OPCODE(op) <= SLJIT_CLZ);
+	CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_MOV && GET_OPCODE(op) <= SLJIT_REV);
 
 	switch (GET_OPCODE(op)) {
-	case SLJIT_NOT:
-		/* Only SLJIT_32 and SLJIT_SET_Z are allowed. */
-		CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK));
-		break;
 	case SLJIT_MOV:
 	case SLJIT_MOV_U32:
+	case SLJIT_MOV_S32:
+	case SLJIT_MOV32:
 	case SLJIT_MOV_P:
 		/* Nothing allowed */
 		CHECK_ARGUMENT(!(op & (SLJIT_32 | SLJIT_SET_Z | VARIABLE_FLAG_MASK)));
@@ -1312,11 +1355,6 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op1(struct sljit_compiler
 
 	FUNCTION_CHECK_DST(dst, dstw);
 	FUNCTION_CHECK_SRC(src, srcw);
-
-	if (GET_OPCODE(op) >= SLJIT_NOT) {
-		CHECK_ARGUMENT(src != SLJIT_IMM);
-		compiler->last_flags = GET_FLAG_TYPE(op) | (op & (SLJIT_32 | SLJIT_SET_Z));
-	}
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -1352,15 +1390,18 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op2(struct sljit_compiler
 	}
 
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_ADD && GET_OPCODE(op) <= SLJIT_ASHR);
+	CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_ADD && GET_OPCODE(op) <= SLJIT_ROTR);
 
 	switch (GET_OPCODE(op)) {
 	case SLJIT_AND:
 	case SLJIT_OR:
 	case SLJIT_XOR:
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
 		CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK));
 		break;
 	case SLJIT_MUL:
@@ -1385,6 +1426,10 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op2(struct sljit_compiler
 		CHECK_ARGUMENT((compiler->last_flags & 0xff) == GET_FLAG_TYPE(SLJIT_SET_CARRY));
 		CHECK_ARGUMENT((op & SLJIT_32) == (compiler->last_flags & SLJIT_32));
 		break;
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
+		CHECK_ARGUMENT(!(op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)));
+		break;
 	default:
 		SLJIT_UNREACHABLE();
 		break;
@@ -1418,6 +1463,40 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op2(struct sljit_compiler
 	CHECK_RETURN_OK;
 }
 
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst_reg,
+	sljit_s32 src1_reg,
+	sljit_s32 src2_reg,
+	sljit_s32 src3, sljit_sw src3w)
+{ /* Argument validation and verbose dump for a shift_into operation; this checker emits no instructions itself. */
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT(GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_LSHR
+		|| GET_OPCODE(op) == SLJIT_MSHL || GET_OPCODE(op) == SLJIT_MLSHR); /* only SHL / LSHR and their M-prefixed variants are accepted */
+	CHECK_ARGUMENT((op & ~(0xff | SLJIT_32 | SLJIT_SHIFT_INTO_NON_ZERO)) == 0); /* besides the opcode byte, only SLJIT_32 and SLJIT_SHIFT_INTO_NON_ZERO may be set */
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(src1_reg));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(src2_reg));
+	FUNCTION_CHECK_SRC(src3, src3w); /* src3 is validated as a general source operand, unlike the three register-only operands */
+	CHECK_ARGUMENT(dst_reg != src2_reg); /* the destination must not alias the second source register */
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+	if (SLJIT_UNLIKELY(!!compiler->verbose)) { /* prints "<op>[32].into[.nz] dst, src1, src2, src3" */
+		fprintf(compiler->verbose, "  %s%s.into%s ", op2_names[GET_OPCODE(op) - SLJIT_OP2_BASE], !(op & SLJIT_32) ? "" : "32",
+			(op & SLJIT_SHIFT_INTO_NON_ZERO) ? ".nz" : "");
+
+		sljit_verbose_reg(compiler, dst_reg);
+		fprintf(compiler->verbose, ", ");
+		sljit_verbose_reg(compiler, src1_reg);
+		fprintf(compiler->verbose, ", ");
+		sljit_verbose_reg(compiler, src2_reg);
+		fprintf(compiler->verbose, ", ");
+		sljit_verbose_param(compiler, src3, src3w);
+		fprintf(compiler->verbose, "\n");
+	}
+#endif
+	CHECK_RETURN_OK;
+}
+
 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -1425,19 +1504,16 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op_src(struct sljit_compi
 	CHECK_ARGUMENT(op >= SLJIT_FAST_RETURN && op <= SLJIT_PREFETCH_ONCE);
 	FUNCTION_CHECK_SRC(src, srcw);
 
-	if (op == SLJIT_FAST_RETURN || op == SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN)
-	{
+	if (op == SLJIT_FAST_RETURN || op == SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN) {
 		CHECK_ARGUMENT(src != SLJIT_IMM);
 		compiler->last_flags = 0;
-	}
-	else if (op >= SLJIT_PREFETCH_L1 && op <= SLJIT_PREFETCH_ONCE)
-	{
+	} else if (op >= SLJIT_PREFETCH_L1 && op <= SLJIT_PREFETCH_ONCE) {
 		CHECK_ARGUMENT(src & SLJIT_MEM);
 	}
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
-		fprintf(compiler->verbose, "  %s ", op_src_names[op - SLJIT_OP_SRC_BASE]);
+		fprintf(compiler->verbose, "  %s ", op_src_dst_names[op - SLJIT_OP_SRC_DST_BASE]);
 		sljit_verbose_param(compiler, src, srcw);
 		fprintf(compiler->verbose, "\n");
 	}
@@ -1445,6 +1521,26 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op_src(struct sljit_compi
 	CHECK_RETURN_OK;
 }
 
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op_dst(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw)
+{ /* Argument validation and verbose dump for operations that take only a destination operand. */
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT(op >= SLJIT_FAST_ENTER && op <= SLJIT_GET_RETURN_ADDRESS); /* op must lie in the SLJIT_FAST_ENTER..SLJIT_GET_RETURN_ADDRESS range */
+	FUNCTION_CHECK_DST(dst, dstw);
+
+	if (op == SLJIT_FAST_ENTER) /* only fast_enter resets the tracked flag state */
+		compiler->last_flags = 0;
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+		fprintf(compiler->verbose, "  %s ", op_src_dst_names[op - SLJIT_OP_SRC_DST_BASE]); /* same name table that check_sljit_emit_op_src indexes */
+		sljit_verbose_param(compiler, dst, dstw);
+		fprintf(compiler->verbose, "\n");
+	}
+#endif
+	CHECK_RETURN_OK;
+}
+
 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_get_register_index(sljit_s32 reg)
 {
 	SLJIT_UNUSED_ARG(reg);
@@ -1658,6 +1754,64 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fop2(struct sljit_compile
 	CHECK_RETURN_OK;
 }
 
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg)
+{ /* Argument validation and verbose dump for an fcopy operation on a float register (freg) and an integer register or register pair (reg). */
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU)); /* the operation is rejected when no FPU is reported */
+	CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_COPY_TO_F64 && GET_OPCODE(op) <= SLJIT_COPY_FROM_F64);
+	CHECK_ARGUMENT(!(op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK))); /* no flag-setting bits are allowed */
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
+
+#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(reg)); /* 64-bit builds: reg is always checked as a single register */
+#else /* !SLJIT_64BIT_ARCHITECTURE */
+	switch (op) {
+	case SLJIT_COPY32_TO_F32:
+	case SLJIT_COPY32_FROM_F32:
+		CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(reg)); /* 32-bit copies take a single register */
+		break;
+	case SLJIT_COPY_TO_F64:
+	case SLJIT_COPY_FROM_F64:
+		if (reg & REG_PAIR_MASK) { /* a register pair was passed: validate both halves */
+			CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(REG_PAIR_FIRST(reg)));
+			CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(REG_PAIR_SECOND(reg)));
+
+			if (op == SLJIT_COPY_TO_F64) /* for COPY_TO the two halves may name the same register */
+				break;
+
+			CHECK_ARGUMENT(REG_PAIR_FIRST(reg) != REG_PAIR_SECOND(reg)); /* for COPY_FROM the two halves must be distinct */
+			break;
+		}
+
+		CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(reg)); /* no pair bits set: a single register is also accepted */
+		break;
+	}
+#endif /* SLJIT_64BIT_ARCHITECTURE */
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+	if (SLJIT_UNLIKELY(!!compiler->verbose)) { /* prints "copy[32]_{to,from}_f{32,64} freg, reg" or "... freg, {first, second}" */
+		fprintf(compiler->verbose, "  copy%s_%s_f%s ", (op & SLJIT_32) ? "32" : "",
+			GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? "to" : "from", (op & SLJIT_32) ? "32" : "64");
+
+		sljit_verbose_freg(compiler, freg);
+
+		if (reg & REG_PAIR_MASK) {
+			fprintf(compiler->verbose, ", {");
+			sljit_verbose_reg(compiler, REG_PAIR_FIRST(reg));
+			fprintf(compiler->verbose, ", ");
+			sljit_verbose_reg(compiler, REG_PAIR_SECOND(reg));
+			fprintf(compiler->verbose, "}\n");
+		} else {
+			fprintf(compiler->verbose, ", ");
+			sljit_verbose_reg(compiler, reg);
+			fprintf(compiler->verbose, "\n");
+		}
+	}
+#endif
+	CHECK_RETURN_OK;
+}
+
 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_label(struct sljit_compiler *compiler)
 {
 	SLJIT_UNUSED_ARG(compiler);
@@ -1704,11 +1858,10 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_jump(struct sljit_compile
 		if ((type & 0xff) <= SLJIT_NOT_ZERO)
 			CHECK_ARGUMENT(compiler->last_flags & SLJIT_SET_Z);
 		else if ((compiler->last_flags & 0xff) == SLJIT_CARRY) {
-			CHECK_ARGUMENT((type & 0xff) == SLJIT_CARRY || (type & 0xff) == SLJIT_NOT_CARRY);
+			CHECK_ARGUMENT((type & 0xfe) == SLJIT_CARRY);
 			compiler->last_flags = 0;
 		} else
-			CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff)
-				|| ((type & 0xff) == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW)
+			CHECK_ARGUMENT((type & 0xfe) == (compiler->last_flags & 0xff)
 				|| CHECK_UNORDERED(type, compiler->last_flags));
 	}
 #endif
@@ -1890,9 +2043,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op_flags(struct sljit_com
 	if (type <= SLJIT_NOT_ZERO)
 		CHECK_ARGUMENT(compiler->last_flags & SLJIT_SET_Z);
 	else
-		CHECK_ARGUMENT(type == (compiler->last_flags & 0xff)
-			|| (type == SLJIT_NOT_CARRY && (compiler->last_flags & 0xff) == SLJIT_CARRY)
-			|| (type == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW)
+		CHECK_ARGUMENT((type & 0xfe) == (compiler->last_flags & 0xff)
 			|| CHECK_UNORDERED(type, compiler->last_flags));
 
 	FUNCTION_CHECK_DST(dst, dstw);
@@ -1918,29 +2069,29 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_cmov(struct sljit_compile
 	sljit_s32 src, sljit_sw srcw)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	CHECK_ARGUMENT(type >= SLJIT_EQUAL && type <= SLJIT_ORDERED_LESS_EQUAL);
+	sljit_s32 cond = type & ~SLJIT_32;
+
+	CHECK_ARGUMENT(cond >= SLJIT_EQUAL && cond <= SLJIT_ORDERED_LESS_EQUAL);
 
 	CHECK_ARGUMENT(compiler->scratches != -1 && compiler->saveds != -1);
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg & ~SLJIT_32));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg));
 	if (src != SLJIT_IMM) {
 		CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(src));
 		CHECK_ARGUMENT(srcw == 0);
 	}
 
-	if (type <= SLJIT_NOT_ZERO)
+	if (cond <= SLJIT_NOT_ZERO)
 		CHECK_ARGUMENT(compiler->last_flags & SLJIT_SET_Z);
 	else
-		CHECK_ARGUMENT(type == (compiler->last_flags & 0xff)
-			|| (type == SLJIT_NOT_CARRY && (compiler->last_flags & 0xff) == SLJIT_CARRY)
-			|| (type == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW)
-			|| CHECK_UNORDERED(type, compiler->last_flags));
+		CHECK_ARGUMENT((cond & 0xfe) == (compiler->last_flags & 0xff)
+			|| CHECK_UNORDERED(cond, compiler->last_flags));
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
 		fprintf(compiler->verbose, "  cmov%s %s, ",
-			!(dst_reg & SLJIT_32) ? "" : "32",
-			jump_names[type]);
-		sljit_verbose_reg(compiler, dst_reg & ~SLJIT_32);
+			!(type & SLJIT_32) ? "" : "32",
+			jump_names[type & ~SLJIT_32]);
+		sljit_verbose_reg(compiler, dst_reg);
 		fprintf(compiler->verbose, ", ");
 		sljit_verbose_param(compiler, src, srcw);
 		fprintf(compiler->verbose, "\n");
@@ -1953,16 +2104,39 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_mem(struct sljit_compiler
 	sljit_s32 reg,
 	sljit_s32 mem, sljit_sw memw)
 {
+	if (SLJIT_UNLIKELY(compiler->skip_checks)) {
+		compiler->skip_checks = 0;
+		CHECK_RETURN_OK;
+	}
+
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
 	sljit_s32 allowed_flags;
 
+	if (type & SLJIT_MEM_UNALIGNED) {
+		CHECK_ARGUMENT(!(type & (SLJIT_MEM_UNALIGNED_16 | SLJIT_MEM_UNALIGNED_32)));
+	} else if (type & SLJIT_MEM_UNALIGNED_16) {
+		CHECK_ARGUMENT(!(type & SLJIT_MEM_UNALIGNED_32));
+	} else {
+		CHECK_ARGUMENT((reg & REG_PAIR_MASK) || (type & SLJIT_MEM_UNALIGNED_32));
+	}
+
+	allowed_flags = SLJIT_MEM_UNALIGNED;
+
+	switch (type & 0xff) {
+	case SLJIT_MOV_U32:
+	case SLJIT_MOV_S32:
+	case SLJIT_MOV32:
+		allowed_flags = SLJIT_MEM_UNALIGNED | SLJIT_MEM_UNALIGNED_16;
+		break;
+	case SLJIT_MOV:
+	case SLJIT_MOV_P:
+		allowed_flags = SLJIT_MEM_UNALIGNED | SLJIT_MEM_UNALIGNED_16 | SLJIT_MEM_UNALIGNED_32;
+		break;
+	}
+
+	CHECK_ARGUMENT((type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | allowed_flags)) == 0);
+
 	if (reg & REG_PAIR_MASK) {
-		if (type & SLJIT_MEM_UNALIGNED) {
-			CHECK_ARGUMENT((type & ~(0xff | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16 | SLJIT_MEM_ALIGNED_32)) == 0);
-			CHECK_ARGUMENT((type & (SLJIT_MEM_ALIGNED_16 | SLJIT_MEM_ALIGNED_32)) != (SLJIT_MEM_ALIGNED_16 | SLJIT_MEM_ALIGNED_32));
-		} else {
-			CHECK_ARGUMENT((type & ~(0xff | SLJIT_MEM_STORE)) == 0);
-		}
 		CHECK_ARGUMENT((type & 0xff) == SLJIT_MOV);
 		CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(REG_PAIR_FIRST(reg)));
 		CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(REG_PAIR_SECOND(reg)));
@@ -1971,67 +2145,36 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_mem(struct sljit_compiler
 		CHECK_ARGUMENT((type & 0xff) >= SLJIT_MOV && (type & 0xff) <= SLJIT_MOV_P);
 		CHECK_ARGUMENT(!(type & SLJIT_32) || ((type & 0xff) >= SLJIT_MOV_U8 && (type & 0xff) <= SLJIT_MOV_S16));
 		CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(reg));
-
-		if (type & SLJIT_MEM_UNALIGNED) {
-			allowed_flags = SLJIT_MEM_ALIGNED_16 | SLJIT_MEM_ALIGNED_32;
-
-			switch (type & 0xff) {
-			case SLJIT_MOV_U8:
-			case SLJIT_MOV_S8:
-			case SLJIT_MOV_U16:
-			case SLJIT_MOV_S16:
-				allowed_flags = 0;
-				break;
-			case SLJIT_MOV_U32:
-			case SLJIT_MOV_S32:
-			case SLJIT_MOV32:
-				allowed_flags = SLJIT_MEM_ALIGNED_16;
-				break;
-			}
-			CHECK_ARGUMENT((type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | allowed_flags)) == 0);
-			CHECK_ARGUMENT((type & (SLJIT_MEM_ALIGNED_16 | SLJIT_MEM_ALIGNED_32)) != (SLJIT_MEM_ALIGNED_16 | SLJIT_MEM_ALIGNED_32));
-		} else {
-			CHECK_ARGUMENT((type & SLJIT_MEM_PRE) || (type & SLJIT_MEM_POST));
-			CHECK_ARGUMENT((type & (SLJIT_MEM_PRE | SLJIT_MEM_POST)) != (SLJIT_MEM_PRE | SLJIT_MEM_POST));
-			CHECK_ARGUMENT((type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | SLJIT_MEM_SUPP | SLJIT_MEM_PRE | SLJIT_MEM_POST)) == 0);
-			CHECK_ARGUMENT((mem & REG_MASK) != 0 && (mem & REG_MASK) != reg);
-		}
 	}
 
 	FUNCTION_CHECK_SRC_MEM(mem, memw);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
-		if (type & (SLJIT_MEM_PRE | SLJIT_MEM_POST)) {
-			if (type & SLJIT_MEM_SUPP)
-				CHECK_RETURN_OK;
-			if (sljit_emit_mem(compiler, type | SLJIT_MEM_SUPP, reg, mem, memw) == SLJIT_ERR_UNSUPPORTED) {
-				fprintf(compiler->verbose, "  // mem: unsupported form, no instructions are emitted");
-				CHECK_RETURN_OK;
-			}
-		}
-
 		if ((type & 0xff) == SLJIT_MOV32)
-			fprintf(compiler->verbose, "  mem32.%s",
-				(type & SLJIT_MEM_STORE) ? "st" : "ld");
+			fprintf(compiler->verbose, "  %s32",
+				(type & SLJIT_MEM_STORE) ? "store" : "load");
 		else
-			fprintf(compiler->verbose, "  mem%s.%s%s",
+			fprintf(compiler->verbose, "  %s%s%s",
+				(type & SLJIT_MEM_STORE) ? "store" : "load",
 				!(type & SLJIT_32) ? "" : "32",
-				(type & SLJIT_MEM_STORE) ? "st" : "ld",
 				op1_names[(type & 0xff) - SLJIT_OP1_BASE]);
 
-		if (type & SLJIT_MEM_UNALIGNED) {
-			printf(".un%s%s ", (type & SLJIT_MEM_ALIGNED_16) ? ".16" : "", (type & SLJIT_MEM_ALIGNED_32) ? ".32" : "");
-		} else
-			printf((type & SLJIT_MEM_PRE) ? ".pre " : ".post ");
+		if (type & SLJIT_MEM_UNALIGNED) /* suffix must go to the verbose stream like the rest of the line, not to stdout */
+			fprintf(compiler->verbose, ".un");
+		else if (type & SLJIT_MEM_UNALIGNED_16)
+			fprintf(compiler->verbose, ".un16");
+		else if (type & SLJIT_MEM_UNALIGNED_32)
+			fprintf(compiler->verbose, ".un32");
 
 		if (reg & REG_PAIR_MASK) {
-			fprintf(compiler->verbose, "{");
+			fprintf(compiler->verbose, " {");
 			sljit_verbose_reg(compiler, REG_PAIR_FIRST(reg));
 			fprintf(compiler->verbose, ", ");
 			sljit_verbose_reg(compiler, REG_PAIR_SECOND(reg));
 			fprintf(compiler->verbose, "}, ");
 		} else {
+			fprintf(compiler->verbose, " ");
 			sljit_verbose_reg(compiler, reg);
 			fprintf(compiler->verbose, ", ");
 		}
@@ -2042,43 +2185,118 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_mem(struct sljit_compiler
 	CHECK_RETURN_OK;
 }
 
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{ /* Argument validation and verbose dump for the pre/post-update form of a memory access. */
+	if (SLJIT_UNLIKELY(compiler->skip_checks)) { /* one-shot bypass: consume the flag and accept the call unchecked */
+		compiler->skip_checks = 0;
+		CHECK_RETURN_OK;
+	}
+
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT((type & 0xff) >= SLJIT_MOV && (type & 0xff) <= SLJIT_MOV_P); /* opcode byte must lie in SLJIT_MOV..SLJIT_MOV_P */
+	CHECK_ARGUMENT((type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | SLJIT_MEM_SUPP | SLJIT_MEM_POST)) == 0); /* only these modifier bits may accompany the opcode */
+	CHECK_ARGUMENT((mem & REG_MASK) != 0 && (mem & REG_MASK) != reg); /* the REG_MASK part of mem (presumably the base register) must be present and differ from reg */
+
+	FUNCTION_CHECK_SRC_MEM(mem, memw);
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+		if (type & SLJIT_MEM_SUPP) /* calls carrying SLJIT_MEM_SUPP are never printed */
+			CHECK_RETURN_OK;
+		if (sljit_emit_mem_update(compiler, type | SLJIT_MEM_SUPP, reg, mem, memw) == SLJIT_ERR_UNSUPPORTED) { /* re-invokes the emitter with SLJIT_MEM_SUPP set to learn whether this form is supported */
+			fprintf(compiler->verbose, "    # mem: unsupported form, no instructions are emitted\n");
+			CHECK_RETURN_OK;
+		}
+
+		if ((type & 0xff) == SLJIT_MOV32) /* MOV32 is printed without an op1_names suffix */
+			fprintf(compiler->verbose, "  %s32.%s ",
+				(type & SLJIT_MEM_STORE) ? "store" : "load",
+				(type & SLJIT_MEM_POST) ? "post" : "pre");
+		else
+			fprintf(compiler->verbose, "  %s%s%s.%s ",
+				(type & SLJIT_MEM_STORE) ? "store" : "load",
+				!(type & SLJIT_32) ? "" : "32",
+				op1_names[(type & 0xff) - SLJIT_OP1_BASE],
+				(type & SLJIT_MEM_POST) ? "post" : "pre");
+
+		sljit_verbose_reg(compiler, reg);
+		fprintf(compiler->verbose, ", ");
+		sljit_verbose_param(compiler, mem, memw);
+		fprintf(compiler->verbose, "\n");
+	}
+#endif
+	CHECK_RETURN_OK;
+}
+
 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 freg,
 	sljit_s32 mem, sljit_sw memw)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
 	CHECK_ARGUMENT((type & 0xff) == SLJIT_MOV_F64);
+
 	if (type & SLJIT_MEM_UNALIGNED) {
-		CHECK_ARGUMENT((type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16 | (type & SLJIT_32 ? 0 : SLJIT_MEM_ALIGNED_32))) == 0);
-		CHECK_ARGUMENT((type & (SLJIT_MEM_ALIGNED_16 | SLJIT_MEM_ALIGNED_32)) != (SLJIT_MEM_ALIGNED_16 | SLJIT_MEM_ALIGNED_32));
+		CHECK_ARGUMENT(!(type & (SLJIT_MEM_UNALIGNED_16 | SLJIT_MEM_UNALIGNED_32)));
+	} else if (type & SLJIT_MEM_UNALIGNED_16) {
+		CHECK_ARGUMENT(!(type & SLJIT_MEM_UNALIGNED_32));
 	} else {
-		CHECK_ARGUMENT((type & SLJIT_MEM_PRE) || (type & SLJIT_MEM_POST));
-		CHECK_ARGUMENT((type & (SLJIT_MEM_PRE | SLJIT_MEM_POST)) != (SLJIT_MEM_PRE | SLJIT_MEM_POST));
-		CHECK_ARGUMENT((type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | SLJIT_MEM_SUPP | SLJIT_MEM_PRE | SLJIT_MEM_POST)) == 0);
+		CHECK_ARGUMENT(type & SLJIT_MEM_UNALIGNED_32);
+		CHECK_ARGUMENT(!(type & SLJIT_32));
 	}
 
+	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | SLJIT_MEM_UNALIGNED_16 | SLJIT_MEM_UNALIGNED_32)));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
+	FUNCTION_CHECK_SRC_MEM(mem, memw);
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+		fprintf(compiler->verbose, "  %s.%s",
+			(type & SLJIT_MEM_STORE) ? "store" : "load",
+			!(type & SLJIT_32) ? "f64" : "f32");
+
+		if (type & SLJIT_MEM_UNALIGNED) /* suffix must go to the verbose stream like the rest of the line, not to stdout */
+			fprintf(compiler->verbose, ".un");
+		else if (type & SLJIT_MEM_UNALIGNED_16)
+			fprintf(compiler->verbose, ".un16");
+		else if (type & SLJIT_MEM_UNALIGNED_32)
+			fprintf(compiler->verbose, ".un32");
+
+		fprintf(compiler->verbose, " ");
+		sljit_verbose_freg(compiler, freg);
+		fprintf(compiler->verbose, ", ");
+		sljit_verbose_param(compiler, mem, memw);
+		fprintf(compiler->verbose, "\n");
+	}
+#endif
+	CHECK_RETURN_OK;
+}
+
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fmem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 mem, sljit_sw memw)
+{
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT((type & 0xff) == SLJIT_MOV_F64);
+	CHECK_ARGUMENT((type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | SLJIT_MEM_SUPP | SLJIT_MEM_POST)) == 0);
 	FUNCTION_CHECK_SRC_MEM(mem, memw);
 	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
-		if (type & (SLJIT_MEM_PRE | SLJIT_MEM_POST)) {
-			if (type & SLJIT_MEM_SUPP)
-				CHECK_RETURN_OK;
-			if (sljit_emit_fmem(compiler, type | SLJIT_MEM_SUPP, freg, mem, memw) == SLJIT_ERR_UNSUPPORTED) {
-				fprintf(compiler->verbose, "  // fmem: unsupported form, no instructions are emitted");
-				CHECK_RETURN_OK;
-			}
+		if (type & SLJIT_MEM_SUPP)
+			CHECK_RETURN_OK;
+		if (sljit_emit_fmem_update(compiler, type | SLJIT_MEM_SUPP, freg, mem, memw) == SLJIT_ERR_UNSUPPORTED) {
+			fprintf(compiler->verbose, "    # fmem: unsupported form, no instructions are emitted\n");
+			CHECK_RETURN_OK;
 		}
 
-		fprintf(compiler->verbose, "  fmem.%s%s",
-			(type & SLJIT_MEM_STORE) ? "st" : "ld",
-			!(type & SLJIT_32) ? ".f64" : ".f32");
+		fprintf(compiler->verbose, "  %s.%s.%s ",
+			(type & SLJIT_MEM_STORE) ? "store" : "load",
+			!(type & SLJIT_32) ? "f64" : "f32",
+			(type & SLJIT_MEM_POST) ? "post" : "pre");
 
-		if (type & SLJIT_MEM_UNALIGNED) {
-			printf(".un%s%s ", (type & SLJIT_MEM_ALIGNED_16) ? ".16" : "", (type & SLJIT_MEM_ALIGNED_32) ? ".32" : "");
-		} else
-			printf((type & SLJIT_MEM_PRE) ? ".pre " : ".post ");
 		sljit_verbose_freg(compiler, freg);
 		fprintf(compiler->verbose, ", ");
 		sljit_verbose_param(compiler, mem, memw);
@@ -2169,36 +2387,11 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_put_label(struct sljit_co
 	ADJUST_LOCAL_OFFSET(dst, dstw); \
 	ADJUST_LOCAL_OFFSET(src, srcw);
 
-static SLJIT_INLINE sljit_s32 emit_mov_before_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
-{
-#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
-	/* At the moment the pointer size is always equal to sljit_sw. May be changed in the future. */
-	if (src == SLJIT_RETURN_REG && (op == SLJIT_MOV || op == SLJIT_MOV_P))
-		return SLJIT_SUCCESS;
-#else
-	if (src == SLJIT_RETURN_REG && (op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P))
-		return SLJIT_SUCCESS;
-#endif
-
-	SLJIT_SKIP_CHECKS(compiler);
-	return sljit_emit_op1(compiler, op, SLJIT_RETURN_REG, 0, src, srcw);
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_return(compiler, op, src, srcw));
-
-	FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));
-
-	SLJIT_SKIP_CHECKS(compiler);
-	return sljit_emit_return_void(compiler);
-}
-
 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
 		|| (defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC) \
 		|| ((defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS) && !(defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1 && SLJIT_MIPS_REV < 6)) \
-		|| (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV)
+		|| (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV) \
+		|| (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
 
 static SLJIT_INLINE sljit_s32 sljit_emit_cmov_generic(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 dst_reg,
@@ -2206,14 +2399,14 @@ static SLJIT_INLINE sljit_s32 sljit_emit_cmov_generic(struct sljit_compiler *com
 {
 	struct sljit_label *label;
 	struct sljit_jump *jump;
-	sljit_s32 op = (dst_reg & SLJIT_32) ? SLJIT_MOV32 : SLJIT_MOV;
+	sljit_s32 op = (type & SLJIT_32) ? SLJIT_MOV32 : SLJIT_MOV;
 
 	SLJIT_SKIP_CHECKS(compiler);
-	jump = sljit_emit_jump(compiler, type ^ 0x1);
+	jump = sljit_emit_jump(compiler, (type & ~SLJIT_32) ^ 0x1);
 	FAIL_IF(!jump);
 
 	SLJIT_SKIP_CHECKS(compiler);
-	FAIL_IF(sljit_emit_op1(compiler, op, dst_reg & ~SLJIT_32, 0, src, srcw));
+	FAIL_IF(sljit_emit_op1(compiler, op, dst_reg, 0, src, srcw));
 
 	SLJIT_SKIP_CHECKS(compiler);
 	label = sljit_emit_label(compiler);
@@ -2303,6 +2496,50 @@ static sljit_s32 sljit_emit_fmem_unaligned(struct sljit_compiler *compiler, slji
 #	include "sljitNativeS390X.c"
 #endif
 
+static SLJIT_INLINE sljit_s32 emit_mov_before_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
+{ /* Emits the op1 move of src into SLJIT_RETURN_REG, unless src is already that register and op is one of the moves listed below. */
+#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
+	/* At the moment the pointer size is always equal to sljit_sw. May be changed in the future. */
+	if (src == SLJIT_RETURN_REG && (op == SLJIT_MOV || op == SLJIT_MOV_P)) /* 64-bit builds: only MOV and MOV_P are skipped */
+		return SLJIT_SUCCESS;
+#else
+	if (src == SLJIT_RETURN_REG && (op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P)) /* other builds: MOV_U32 and MOV_S32 are skipped as well */
+		return SLJIT_SUCCESS;
+#endif
+
+	SLJIT_SKIP_CHECKS(compiler); /* the nested emit call runs without repeating argument checks */
+	return sljit_emit_op1(compiler, op, SLJIT_RETURN_REG, 0, src, srcw);
+}
+
+#if !(defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) \
+	&& !((defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) && defined __SOFTFP__)
+
+static SLJIT_INLINE sljit_s32 emit_fmov_before_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
+{ /* Emits the fop1 move of src into SLJIT_RETURN_FREG, unless src already is that register. */
+	if (src == SLJIT_RETURN_FREG) /* test the same register the move below writes, mirroring emit_mov_before_return */
+		return SLJIT_SUCCESS;
+
+	SLJIT_SKIP_CHECKS(compiler); /* the nested emit call runs without repeating argument checks */
+	return sljit_emit_fop1(compiler, op, SLJIT_RETURN_FREG, 0, src, srcw);
+}
+
+#endif /* !SLJIT_CONFIG_X86_32 && !(SLJIT_CONFIG_ARM_32 && __SOFTFP__) */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
+{ /* Validates the arguments, moves src via emit_mov_before_return / emit_fmov_before_return, then emits a void return. */
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return(compiler, op, src, srcw));
+
+	if (GET_OPCODE(op) < SLJIT_MOV_F64) { /* opcodes below SLJIT_MOV_F64 take the integer path, all others the float path */
+		FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));
+	} else {
+		FAIL_IF(emit_fmov_before_return(compiler, op, src, srcw));
+	}
+
+	SLJIT_SKIP_CHECKS(compiler); /* arguments were already checked above */
+	return sljit_emit_return_void(compiler);
+}
+
 #if !(defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS) \
 	&& !(defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV)
 
@@ -2372,7 +2609,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_cmp(struct sljit_compiler
 	if (condition <= SLJIT_NOT_ZERO)
 		flags = SLJIT_SET_Z;
 	else
-		flags = condition << VARIABLE_FLAG_SHIFT;
+		flags = (condition & 0xfe) << VARIABLE_FLAG_SHIFT;
 
 	SLJIT_SKIP_CHECKS(compiler);
 	PTR_FAIL_IF(sljit_emit_op2u(compiler,
@@ -2410,35 +2647,33 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_fcmp(struct sljit_compile
 	CHECK_PTR(check_sljit_emit_fcmp(compiler, type, src1, src1w, src2, src2w));
 
 	SLJIT_SKIP_CHECKS(compiler);
-	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | ((type & 0xff) << VARIABLE_FLAG_SHIFT) | (type & SLJIT_32), src1, src1w, src2, src2w);
+	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | ((type & 0xfe) << VARIABLE_FLAG_SHIFT) | (type & SLJIT_32), src1, src1w, src2, src2w);
 
 	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_jump(compiler, type);
 }
 
 #if !(defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM) \
-	&& !(defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS) \
-	&& !(defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC) \
-	&& !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
+	&& !(defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)
 
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 reg,
 	sljit_s32 mem, sljit_sw memw)
 {
 	CHECK_ERROR();
-	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+	CHECK(check_sljit_emit_mem_update(compiler, type, reg, mem, memw));
+	SLJIT_UNUSED_ARG(type);
+	SLJIT_UNUSED_ARG(reg);
+	SLJIT_UNUSED_ARG(mem);
+	SLJIT_UNUSED_ARG(memw);
 
-	if (type & (SLJIT_MEM_PRE | SLJIT_MEM_POST))
-		return SLJIT_ERR_UNSUPPORTED;
-
-	return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+	return SLJIT_ERR_UNSUPPORTED;
 }
 
-#endif
+#endif /* !SLJIT_CONFIG_ARM && !SLJIT_CONFIG_PPC */
 
-#if !(defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM) \
-	&& !(defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS) \
-	&& !(defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)
+#if !(defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) \
+	&& !(defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS)
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 freg,
@@ -2447,13 +2682,29 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compil
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
 
-	if (type & (SLJIT_MEM_PRE | SLJIT_MEM_POST))
-		return SLJIT_ERR_UNSUPPORTED;
-
 	return sljit_emit_fmem_unaligned(compiler, type, freg, mem, memw);
 }
 
-#endif
+#endif /* !SLJIT_CONFIG_ARM_32 && !SLJIT_CONFIG_MIPS */
+
+#if !(defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
+	&& !(defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 mem, sljit_sw memw)
+{ /* Generic version, compiled only when neither SLJIT_CONFIG_ARM_64 nor SLJIT_CONFIG_PPC is set: checks the arguments, then always reports the form as unsupported. */
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fmem_update(compiler, type, freg, mem, memw));
+	SLJIT_UNUSED_ARG(type); /* the UNUSED_ARG lines presumably silence warnings when CHECK() compiles to nothing - confirm */
+	SLJIT_UNUSED_ARG(freg);
+	SLJIT_UNUSED_ARG(mem);
+	SLJIT_UNUSED_ARG(memw);
+
+	return SLJIT_ERR_UNSUPPORTED;
+}
+
+#endif /* !SLJIT_CONFIG_ARM_64 && !SLJIT_CONFIG_PPC */
 
 #if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
 	&& !(defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
@@ -2580,6 +2831,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 	return SLJIT_ERR_UNSUPPORTED;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler)
+{
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNREACHABLE();
+	return SLJIT_ERR_UNSUPPORTED;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
 {
 	SLJIT_UNUSED_ARG(compiler);
@@ -2590,9 +2848,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *comp
 	return SLJIT_ERR_UNSUPPORTED;
 }
 
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler)
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler, sljit_s32 src, sljit_sw srcw)
 {
 	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(src);
+	SLJIT_UNUSED_ARG(srcw);
 	SLJIT_UNREACHABLE();
 	return SLJIT_ERR_UNSUPPORTED;
 }
@@ -2659,6 +2919,23 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil
 	return SLJIT_ERR_UNSUPPORTED;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst_reg,
+	sljit_s32 src1_reg,
+	sljit_s32 src2_reg,
+	sljit_s32 src3, sljit_sw src3w)
+{
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(op);
+	SLJIT_UNUSED_ARG(dst_reg);
+	SLJIT_UNUSED_ARG(src1_reg);
+	SLJIT_UNUSED_ARG(src2_reg);
+	SLJIT_UNUSED_ARG(src3);
+	SLJIT_UNUSED_ARG(src3w);
+	SLJIT_UNREACHABLE();
+	return SLJIT_ERR_UNSUPPORTED;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -2723,6 +3000,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compil
 	return SLJIT_ERR_UNSUPPORTED;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg)
+{
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(op);
+	SLJIT_UNUSED_ARG(freg);
+	SLJIT_UNUSED_ARG(reg);
+	SLJIT_UNREACHABLE();
+	return SLJIT_ERR_UNSUPPORTED;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
 {
 	SLJIT_UNUSED_ARG(compiler);
@@ -2857,6 +3145,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 	return SLJIT_ERR_UNSUPPORTED;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 reg, sljit_s32 mem, sljit_sw memw)
+{
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(type);
+	SLJIT_UNUSED_ARG(reg);
+	SLJIT_UNUSED_ARG(mem);
+	SLJIT_UNUSED_ARG(memw);
+	SLJIT_UNREACHABLE();
+	return SLJIT_ERR_UNSUPPORTED;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 freg, sljit_s32 mem, sljit_sw memw)
 {
 	SLJIT_UNUSED_ARG(compiler);
@@ -2868,6 +3167,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compil
 	return SLJIT_ERR_UNSUPPORTED;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 freg, sljit_s32 mem, sljit_sw memw)
+{
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(type);
+	SLJIT_UNUSED_ARG(freg);
+	SLJIT_UNUSED_ARG(mem);
+	SLJIT_UNUSED_ARG(memw);
+	SLJIT_UNREACHABLE();
+	return SLJIT_ERR_UNSUPPORTED;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
 {
 	SLJIT_UNUSED_ARG(compiler);
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitLir.h b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitLir.h
index 1304843de1..87805dd7fd 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitLir.h
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitLir.h
@@ -36,26 +36,24 @@
     Advantages:
       - The execution can be continued from any LIR instruction. In other
         words, it is possible to jump to any label from anywhere, even from
-        a code fragment, which is compiled later, if both compiled code
-        shares the same context. See sljit_emit_enter for more details
-      - Supports self modifying code: target of (conditional) jump and call
+        a code fragment, which is compiled later, as long as the compiling
+        context is the same. See sljit_emit_enter for more details.
+      - Supports self modifying code: target of any jump and call
         instructions and some constant values can be dynamically modified
-        during runtime
+        during runtime. See SLJIT_REWRITABLE_JUMP.
         - although it is not suggested to do it frequently
         - can be used for inline caching: save an important value once
           in the instruction stream
-        - since this feature limits the optimization possibilities, a
-          special flag must be passed at compile time when these
-          instructions are emitted
       - A fixed stack space can be allocated for local variables
       - The compiler is thread-safe
       - The compiler is highly configurable through preprocessor macros.
         You can disable unneeded features (multithreading in single
         threaded applications), and you can use your own system functions
-        (including memory allocators). See sljitConfig.h
+        (including memory allocators). See sljitConfig.h.
     Disadvantages:
-      - No automatic register allocation, and temporary results are
-        not stored on the stack. (hence the name comes)
+      - The compiler is more like a platform independent assembler, so
+        there is no built-in variable management. Registers and stack must
+        be managed manually (the name of the compiler refers to this).
     In practice:
       - This approach is very effective for interpreters
         - One of the saved registers typically points to a stack interface
@@ -77,7 +75,7 @@
 #include "sljitConfig.h"
 
 /* The following header file defines useful macros for fine tuning
-sljit based code generators. They are listed in the beginning
+SLJIT based code generators. They are listed in the beginning
 of sljitConfigInternal.h */
 
 #include "sljitConfigInternal.h"
@@ -90,6 +88,10 @@ of sljitConfigInternal.h */
 extern "C" {
 #endif
 
+/* Version numbers. */
+#define SLJIT_MAJOR_VERSION	0
+#define SLJIT_MINOR_VERSION	95
+
 /* --------------------------------------------------------------------- */
 /*  Error codes                                                          */
 /* --------------------------------------------------------------------- */
@@ -97,33 +99,31 @@ extern "C" {
 /* Indicates no error. */
 #define SLJIT_SUCCESS			0
 /* After the call of sljit_generate_code(), the error code of the compiler
-   is set to this value to avoid future sljit calls (in debug mode at least).
+   is set to this value to avoid further code generation.
    The complier should be freed after sljit_generate_code(). */
 #define SLJIT_ERR_COMPILED		1
-/* Cannot allocate non executable memory. */
+/* Cannot allocate non-executable memory. */
 #define SLJIT_ERR_ALLOC_FAILED		2
 /* Cannot allocate executable memory.
-   Only for sljit_generate_code() */
+   Only sljit_generate_code() returns with this error code. */
 #define SLJIT_ERR_EX_ALLOC_FAILED	3
 /* Return value for SLJIT_CONFIG_UNSUPPORTED placeholder architecture. */
 #define SLJIT_ERR_UNSUPPORTED		4
 /* An ivalid argument is passed to any SLJIT function. */
 #define SLJIT_ERR_BAD_ARGUMENT		5
-/* Dynamic code modification is not enabled. */
-#define SLJIT_ERR_DYN_CODE_MOD		6
 
 /* --------------------------------------------------------------------- */
 /*  Registers                                                            */
 /* --------------------------------------------------------------------- */
 
 /*
-  Scratch (R) registers: registers whose may not preserve their values
+  Scratch (R) registers: registers which may not preserve their values
   across function calls.
 
-  Saved (S) registers: registers whose preserve their values across
+  Saved (S) registers: registers which preserve their values across
   function calls.
 
-  The scratch and saved register sets are overlap. The last scratch register
+  The scratch and saved register sets overlap. The last scratch register
   is the first saved register, the one before the last is the second saved
   register, and so on.
 
@@ -209,7 +209,7 @@ extern "C" {
 /* The SLJIT_SP provides direct access to the linear stack space allocated by
    sljit_emit_enter. It can only be used in the following form: SLJIT_MEM1(SLJIT_SP).
    The immediate offset is extended by the relative stack offset automatically.
-   The sljit_get_local_base can be used to obtain the absolute offset. */
+   The sljit_get_local_base can be used to obtain the real address of a value. */
 #define SLJIT_SP	(SLJIT_NUMBER_OF_REGISTERS + 1)
 
 /* Return with machine word. */
@@ -249,6 +249,10 @@ extern "C" {
 /* Float registers >= SLJIT_FIRST_SAVED_FLOAT_REG are saved registers. */
 #define SLJIT_FIRST_SAVED_FLOAT_REG (SLJIT_FS0 - SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS + 1)
 
+/* Return with floating point arg. */
+
+#define SLJIT_RETURN_FREG	SLJIT_FR0
+
 /* --------------------------------------------------------------------- */
 /*  Argument type definitions                                            */
 /* --------------------------------------------------------------------- */
@@ -284,7 +288,7 @@ extern "C" {
         | SLJIT_ARG_VALUE(SLJIT_ARG_TYPE_32, 3) | SLJIT_ARG_VALUE(SLJIT_ARG_TYPE_F32, 4)
 
    Short form of argument type definition:
-     SLJIT_ARGS4(32, P, F64, 32, F32)
+     SLJIT_ARGS4(F32, P, F64, 32, F32)
 
    Argument passing:
      arg_a must be placed in SLJIT_R0
@@ -386,6 +390,7 @@ struct sljit_label {
 struct sljit_jump {
 	struct sljit_jump *next;
 	sljit_uw addr;
+	/* Architecture dependent flags. */
 	sljit_uw flags;
 	union {
 		sljit_uw target;
@@ -423,17 +428,17 @@ struct sljit_compiler {
 	struct sljit_memory_fragment *buf;
 	struct sljit_memory_fragment *abuf;
 
-	/* Used scratch registers. */
+	/* Available scratch registers. */
 	sljit_s32 scratches;
-	/* Used saved registers. */
+	/* Available saved registers. */
 	sljit_s32 saveds;
-	/* Used float scratch registers. */
+	/* Available float scratch registers. */
 	sljit_s32 fscratches;
-	/* Used float saved registers. */
+	/* Available float saved registers. */
 	sljit_s32 fsaveds;
 	/* Local stack size. */
 	sljit_s32 local_size;
-	/* Code size. */
+	/* Maximum code size. */
 	sljit_uw size;
 	/* Relative offset of the executable mapping from the writable mapping. */
 	sljit_sw executable_offset;
@@ -514,7 +519,8 @@ struct sljit_compiler {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) \
 		|| (defined SLJIT_DEBUG && SLJIT_DEBUG) \
 		|| (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
-	/* Trust arguments when the API function is called. */
+	/* Trust arguments when an API function is called.
+	   Used internally for calling API functions. */
 	sljit_s32 skip_checks;
 #endif
 };
@@ -523,7 +529,7 @@ struct sljit_compiler {
 /*  Main functions                                                       */
 /* --------------------------------------------------------------------- */
 
-/* Creates an sljit compiler. The allocator_data is required by some
+/* Creates an SLJIT compiler. The allocator_data is required by some
    custom memory managers. This pointer is passed to SLJIT_MALLOC
    and SLJIT_FREE macros. Most allocators (including the default
    one) ignores this value, and it is recommended to pass NULL
@@ -537,43 +543,44 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_compiler* sljit_create_compiler(void *allo
 /* Frees everything except the compiled machine code. */
 SLJIT_API_FUNC_ATTRIBUTE void sljit_free_compiler(struct sljit_compiler *compiler);
 
-/* Returns the current error code. If an error is occurred, future sljit
-   calls which uses the same compiler argument returns early with the same
+/* Returns the current error code. If an error occurs, future calls
+   which use the same compiler argument return early with the same
    error code. Thus there is no need for checking the error after every
-   call, it is enough to do it before the code is compiled. Removing
+   call, it is enough to do it after the code is compiled. Removing
    these checks increases the performance of the compiling process. */
 static SLJIT_INLINE sljit_s32 sljit_get_compiler_error(struct sljit_compiler *compiler) { return compiler->error; }
 
 /* Sets the compiler error code to SLJIT_ERR_ALLOC_FAILED except
    if an error was detected before. After the error code is set
    the compiler behaves as if the allocation failure happened
-   during an sljit function call. This can greatly simplify error
-   checking, since only the compiler status needs to be checked
-   after the compilation. */
+   during an SLJIT function call. This can greatly simplify error
+   checking, since it is enough to check the compiler status
+   after the code is compiled. */
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_compiler_memory_error(struct sljit_compiler *compiler);
 
-/*
-   Allocate a small amount of memory. The size must be <= 64 bytes on 32 bit,
+/* Allocate a small amount of memory. The size must be <= 64 bytes on 32 bit,
    and <= 128 bytes on 64 bit architectures. The memory area is owned by the
    compiler, and freed by sljit_free_compiler. The returned pointer is
    sizeof(sljit_sw) aligned. Excellent for allocating small blocks during
-   the compiling, and no need to worry about freeing them. The size is
-   enough to contain at most 16 pointers. If the size is outside of the range,
+   compiling, and no need to worry about freeing them. The size is enough
+   to contain at most 16 pointers. If the size is outside of the range,
    the function will return with NULL. However, this return value does not
    indicate that there is no more memory (does not set the current error code
-   of the compiler to out-of-memory status).
-*/
+   of the compiler to out-of-memory status). */
 SLJIT_API_FUNC_ATTRIBUTE void* sljit_alloc_memory(struct sljit_compiler *compiler, sljit_s32 size);
 
+/* Returns the allocator data passed to sljit_create_compiler. These pointers
+   may contain context data even if the normal/exec allocator ignores it. */
+static SLJIT_INLINE void* sljit_get_allocator_data(struct sljit_compiler *compiler) { return compiler->allocator_data; }
+static SLJIT_INLINE void* sljit_get_exec_allocator_data(struct sljit_compiler *compiler) { return compiler->exec_allocator_data; }
+
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 /* Passing NULL disables verbose. */
 SLJIT_API_FUNC_ATTRIBUTE void sljit_compiler_verbose(struct sljit_compiler *compiler, FILE* verbose);
 #endif
 
-/*
-   Create executable code from the sljit instruction stream. This is the final step
-   of the code generation so no more instructions can be added after this call.
-*/
+/* Create executable code from the instruction stream. This is the final step
+   of the code generation so no more instructions can be emitted after this call. */
 
 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler);
 
@@ -581,8 +588,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 
 SLJIT_API_FUNC_ATTRIBUTE void sljit_free_code(void* code, void *exec_allocator_data);
 
-/*
-   When the protected executable allocator is used the JIT code is mapped
+/* When the protected executable allocator is used the JIT code is mapped
    twice. The first mapping has read/write and the second mapping has read/exec
    permissions. This function returns with the relative offset of the executable
    mapping using the writable mapping as the base after the machine code is
@@ -590,26 +596,24 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_free_code(void* code, void *exec_allocator_d
    allocator, since it uses only one mapping with read/write/exec permissions.
    Dynamic code modifications requires this value.
 
-   Before a successful code generation, this function returns with 0.
-*/
+   Before a successful code generation, this function returns with 0. */
 static SLJIT_INLINE sljit_sw sljit_get_executable_offset(struct sljit_compiler *compiler) { return compiler->executable_offset; }
 
-/*
-   The executable memory consumption of the generated code can be retrieved by
+/* The executable memory consumption of the generated code can be retrieved by
    this function. The returned value can be used for statistical purposes.
 
-   Before a successful code generation, this function returns with 0.
-*/
+   Before a successful code generation, this function returns with 0. */
 static SLJIT_INLINE sljit_uw sljit_get_generated_code_size(struct sljit_compiler *compiler) { return compiler->executable_size; }
 
 /* Returns with non-zero if the feature or limitation type passed as its
-   argument is present on the current CPU.
+   argument is present on the current CPU. The return value is one, if a
+   feature is fully supported, and it is two, if partially supported.
 
    Some features (e.g. floating point operations) require hardware (CPU)
    support while others (e.g. move with update) are emulated if not available.
-   However even if a feature is emulated, specialized code paths can be faster
-   than the emulation. Some limitations are emulated as well so their general
-   case is supported but it has extra performance costs. */
+   However, even when a feature is emulated, specialized code paths may be
+   faster than the emulation. Some limitations are emulated as well so their
+   general case is supported but it has extra performance costs. */
 
 /* [Not emulated] Floating-point support is available. */
 #define SLJIT_HAS_FPU			0
@@ -619,10 +623,20 @@ static SLJIT_INLINE sljit_uw sljit_get_generated_code_size(struct sljit_compiler
 #define SLJIT_HAS_ZERO_REGISTER		2
 /* [Emulated] Count leading zero is supported. */
 #define SLJIT_HAS_CLZ			3
+/* [Emulated] Count trailing zero is supported. */
+#define SLJIT_HAS_CTZ			4
+/* [Emulated] Reversing the order of bytes is supported. */
+#define SLJIT_HAS_REV			5
+/* [Emulated] Rotate left/right is supported. */
+#define SLJIT_HAS_ROT			6
 /* [Emulated] Conditional move is supported. */
-#define SLJIT_HAS_CMOV			4
-/* [Emulated] Conditional move is supported. */
-#define SLJIT_HAS_PREFETCH		5
+#define SLJIT_HAS_CMOV			7
+/* [Emulated] Prefetch instruction is available (emulated as a nop). */
+#define SLJIT_HAS_PREFETCH		8
+/* [Emulated] Copy from/to f32 operation is available (see sljit_emit_fcopy). */
+#define SLJIT_HAS_COPY_F32		9
+/* [Emulated] Copy from/to f64 operation is available (see sljit_emit_fcopy). */
+#define SLJIT_HAS_COPY_F64		10
 
 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
 /* [Not emulated] SSE2 support is available on x86. */
@@ -645,8 +659,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
    Otherwise it returns zero. */
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type);
 
-/* Instruction generation. Returns with any error code. If there is no
-   error, they return with SLJIT_SUCCESS. */
+/* The following functions generate machine code. If there is no
+   error, they return with SLJIT_SUCCESS, otherwise they return
+   with an error code. */
 
 /*
    The executable code is a function from the viewpoint of the C
@@ -654,30 +669,29 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type);
    Binary Interface) of the platform, which specify the purpose of
    machine registers and stack handling among other things. The
    sljit_emit_enter function emits the necessary instructions for
-   setting up a new context for the executable code and moves function
-   arguments to the saved registers. Furthermore the options argument
+   setting up a new context for the executable code. This is often
+   called the function prologue. Furthermore the options argument
    can be used to pass configuration options to the compiler. The
    available options are listed before sljit_emit_enter.
 
-   The function argument list is the combination of SLJIT_ARGx
-   (SLJIT_DEF_ARG1) macros. Currently maximum 4 arguments are
-   supported. The first integer argument is loaded into SLJIT_S0,
-   the second one is loaded into SLJIT_S1, and so on. Similarly,
-   the first floating point argument is loaded into SLJIT_FR0,
-   the second one is loaded into SLJIT_FR1, and so on. Furthermore
-   the register set used by the function must be declared as well.
-   The number of scratch and saved registers used by the function
-   must be passed to sljit_emit_enter. Only R registers between R0
+   The function argument list is specified by the SLJIT_ARGSx
+   (SLJIT_ARGS0 .. SLJIT_ARGS4) macros. Currently maximum four
+   arguments are supported. See the description of SLJIT_ARGSx
+   macros about argument passing. Furthermore the register set
+   used by the function must be declared as well. The number of
+   scratch and saved registers available to the function must
+   be passed to sljit_emit_enter. Only R registers between R0
    and "scratches" argument can be used later. E.g. if "scratches"
-   is set to 2, the scratch register set will be limited to SLJIT_R0
-    and SLJIT_R1. The S registers and the floating point registers
-   ("fscratches" and "fsaveds") are specified in a similar manner.
-   The sljit_emit_enter is also capable of allocating a stack space
-   for local variables. The "local_size" argument contains the size
-   in bytes of this local area and its staring address is stored
-   in SLJIT_SP. The memory area between SLJIT_SP (inclusive) and
-   SLJIT_SP + local_size (exclusive) can be modified freely until
-   the function returns. The stack space is not initialized.
+   is set to two, the scratch register set will be limited to
+   SLJIT_R0 and SLJIT_R1. The S registers and the floating point
+   registers ("fscratches" and "fsaveds") are specified in a
+   similar manner. The sljit_emit_enter is also capable of
+   allocating a stack space for local data. The "local_size"
+   argument contains the size in bytes of this local area, and
+   it can be accessed using SLJIT_MEM1(SLJIT_SP). The memory
+   area between SLJIT_SP (inclusive) and SLJIT_SP + local_size
+   (exclusive) can be modified freely until the function returns.
+   The stack space is not initialized to zero.
 
    Note: the following conditions must met:
          0 <= scratches <= SLJIT_NUMBER_OF_REGISTERS
@@ -698,16 +712,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type);
    are not saved / restored on function enter / return. Instead,
    these registers can be used to pass / return data (such as
    global / local context pointers) across function calls. The
-   value of n must be between 1 and 3. Furthermore, this option
-   is only supported by register argument calling convention, so
-   SLJIT_ENTER_REG_ARG (see below) must be specified as well. */
+   value of n must be between 1 and 3. This option is only
+   supported by SLJIT_ENTER_REG_ARG calling convention. */
 #define SLJIT_ENTER_KEEP(n)	(n)
 
-/* The compiled function uses an sljit specific register argument
- * calling convention. This is a lightweight function call type where
- * both the caller and called function must be compiled with sljit.
- * The jump type of the function call must be SLJIT_CALL_REG_ARG
- * and the called function must store all arguments in registers. */
+/* The compiled function uses an SLJIT specific register argument
+   calling convention. This is a lightweight function call type where
+   both the caller and the called functions must be compiled by
+   SLJIT. The type argument of the call must be SLJIT_CALL_REG_ARG
+   and all arguments must be stored in scratch registers. */
 #define SLJIT_ENTER_REG_ARG	0x00000004
 
 /* The local_size must be >= 0 and <= SLJIT_MAX_LOCAL_SIZE. */
@@ -717,12 +730,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
 	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size);
 
-/* The machine code has a context (which contains the local stack space size,
-   number of used registers, etc.) which initialized by sljit_emit_enter. Several
-   functions (such as sljit_emit_return) requres this context to be able to generate
-   the appropriate code. However, some code fragments (like inline cache) may have
-   no normal entry point so their context is unknown for the compiler. Their context
-   can be provided to the compiler by the sljit_set_context function.
+/* The SLJIT compiler has a current context (which contains the local
+   stack space size, number of used registers, etc.) which is initialized
+   by sljit_emit_enter. Several functions (such as sljit_emit_return)
+   require this context to be able to generate the appropriate code.
+   However, some code fragments (compiled separately) may have no
+   normal entry point so their context is unknown for the compiler.
+
+   The sljit_set_context and sljit_emit_enter have the same arguments,
+   but sljit_set_context does not generate any machine code.
 
    Note: every call of sljit_emit_enter and sljit_set_context overwrites
          the previous context. */
@@ -731,51 +747,42 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size);
 
-/* Return from machine code. The sljit_emit_return_void function does not return with
-   any value. The sljit_emit_return function returns with a single value which stores
-   the result of a data move instruction. The instruction is specified by the op
-   argument, and must be between SLJIT_MOV and SLJIT_MOV_P (see sljit_emit_op1). */
+/* Return to the caller function. The sljit_emit_return_void function
+   does not return with any value. The sljit_emit_return function returns
+   with a single value loaded from its source operand. The load operation
+   can be between SLJIT_MOV and SLJIT_MOV_P (see sljit_emit_op1) and
+   SLJIT_MOV_F32/SLJIT_MOV_F64 (see sljit_emit_fop1) depending on the
+   return value specified by sljit_emit_enter/sljit_set_context. */
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler);
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw);
 
-/* Generating entry and exit points for fast call functions (see SLJIT_FAST_CALL).
-   Both sljit_emit_fast_enter and SLJIT_FAST_RETURN operations preserve the
-   values of all registers and stack frame. The return address is stored in the
-   dst argument of sljit_emit_fast_enter, and this return address can be passed
-   to SLJIT_FAST_RETURN to continue the execution after the fast call.
+/* Restores the saved registers and frees the stack area, then the execution
+   continues from the address specified by the source operand. This
+   operation is similar to sljit_emit_return, but it ignores the return
+   address. The code where the execution continues should use the same context
+   as the caller function (see sljit_set_context). A word (pointer) value
+   can be passed in the SLJIT_RETURN_REG register. This function can be used
+   to jump to exception handlers. */
 
-   Fast calls are cheap operations (usually only a single call instruction is
-   emitted) but they do not preserve any registers. However the callee function
-   can freely use / update any registers and stack values which can be
-   efficiently exploited by various optimizations. Registers can be saved
-   manually by the callee function if needed.
-
-   Although returning to different address by SLJIT_FAST_RETURN is possible,
-   this address usually cannot be predicted by the return address predictor of
-   modern CPUs which may reduce performance. Furthermore certain security
-   enhancement technologies such as Intel Control-flow Enforcement Technology
-   (CET) may disallow returning to a different address.
-
-   Flags: - (does not modify flags). */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw);
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw);
 
 /*
    Source and destination operands for arithmetical instructions
     imm              - a simple immediate value (cannot be used as a destination)
-    reg              - any of the registers (immediate argument must be 0)
-    [imm]            - absolute immediate memory address
+    reg              - any of the available registers (immediate argument must be 0)
+    [imm]            - absolute memory address
     [reg+imm]        - indirect memory address
     [reg+(reg<<imm)] - indirect indexed memory address (shift must be between 0 and 3)
-                       useful for (byte, half, int, sljit_sw) array access
-                       (fully supported by both x86 and ARM architectures, and cheap operation on others)
+                       useful for accessing arrays (fully supported by both x86 and
+                       ARM architectures, and cheap operation on others)
 */
 
 /*
-   IMPORTANT NOTE: memory access MUST be naturally aligned unless
+   IMPORTANT NOTE: memory accesses MUST be naturally aligned unless
                    SLJIT_UNALIGNED macro is defined and its value is 1.
 
      length | alignment
@@ -833,17 +840,18 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *
 
 /* Sets 32 bit operation mode on 64 bit CPUs. This option is ignored on
    32 bit CPUs. When this option is set for an arithmetic operation, only
-   the lower 32 bit of the input registers are used, and the CPU status
+   the lower 32 bits of the input registers are used, and the CPU status
    flags are set according to the 32 bit result. Although the higher 32 bit
    of the input and the result registers are not defined by SLJIT, it might
    be defined by the CPU architecture (e.g. MIPS). To satisfy these CPU
    requirements all source registers must be the result of those operations
    where this option was also set. Memory loads read 32 bit values rather
    than 64 bit ones. In other words 32 bit and 64 bit operations cannot be
-   mixed. The only exception is SLJIT_MOV32 whose source register can hold
+   mixed. The only exception is SLJIT_MOV32 whose source register can hold
    any 32 or 64 bit value, and it is converted to a 32 bit compatible format
-   first. This conversion is free (no instructions are emitted) on most CPUs.
-   A 32 bit value can also be converted to a 64 bit value by SLJIT_MOV_S32
+   first. When the source and destination registers are the same, this
+   conversion is free (no instructions are emitted) on most CPUs. A 32 bit
+   value can also be converted to a 64 bit value by SLJIT_MOV_S32
    (sign extension) or SLJIT_MOV_U32 (zero extension).
 
    As for floating-point operations, this option sets 32 bit single
@@ -860,18 +868,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *
      SLJIT_ADD32 == (SLJIT_ADD | SLJIT_32) */
 #define SLJIT_32		0x100
 
-/* Many CPUs (x86, ARM, PPC) have status flags which can be set according
+/* Many CPUs (x86, ARM, PPC) have status flag bits which can be set according
    to the result of an operation. Other CPUs (MIPS) do not have status
-   flags, and results must be stored in registers. To cover both architecture
-   types efficiently only two flags are defined by SLJIT:
+   flag bits, and results must be stored in registers. To cover both
+   architecture types efficiently only two flags are defined by SLJIT:
 
     * Zero (equal) flag: it is set if the result is zero
-    * Variable flag: its value is defined by the last arithmetic operation
+    * Variable flag: its value is defined by the arithmetic operation
 
    SLJIT instructions can set any or both of these flags. The value of
-   these flags is undefined if the instruction does not specify their value.
-   The description of each instruction contains the list of allowed flag
-   types.
+   these flags is undefined if the instruction does not specify their
+   value. The description of each instruction contains the list of
+   allowed flag types.
+
+   Note: the logical or operation can be used to set flags.
 
    Example: SLJIT_ADD can set the Z, OVERFLOW, CARRY flags hence
 
@@ -892,32 +902,40 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *
        Sets the variable flag if unsigned overflow (carry) occurs,
        clears it otherwise.
 
-   If an instruction (e.g. SLJIT_MOV) does not modify flags the flags are
-   unchanged.
+   Certain instructions (e.g. SLJIT_MOV) do not modify flags, so
+   status flags are unchanged.
 
-   Using these flags can reduce the number of emitted instructions. E.g. a
-   fast loop can be implemented by decreasing a counter register and set the
-   zero flag to jump back if the counter register has not reached zero.
+   Example:
 
-   Motivation: although CPUs can set a large number of flags, usually their
-   values are ignored or only one of them is used. Emulating a large number
-   of flags on systems without flag register is complicated so SLJIT
-   instructions must specify the flag they want to use and only that flag
-   will be emulated. The last arithmetic instruction can be repeated if
+     sljit_op2(..., SLJIT_ADD | SLJIT_SET_Z, ...)
+     sljit_op1(..., SLJIT_MOV, ...)
+       Zero flag is set according to the result of SLJIT_ADD.
+
+     sljit_op2(..., SLJIT_ADD | SLJIT_SET_Z, ...)
+     sljit_op2(..., SLJIT_ADD, ...)
+       Zero flag has unknown value.
+
+   These flags can be used for code optimization. E.g. a fast loop can be
+   implemented by decreasing a counter register and setting the zero flag
+   using a single instruction. The zero flag can then be used by a
+   conditional jump to restart the loop. A single comparison can set the
+   zero and less flags to check if a value is less, equal, or greater
+   than another value.
+
+   Motivation: although some CPUs can set a large number of flag bits,
+   usually their values are ignored or only a few of them are used. Emulating
+   a large number of flags on systems without a flag register is complicated
+   so SLJIT instructions must specify the flag they want to use and only
+   that flag is computed. The last arithmetic instruction can be repeated if
    multiple flags need to be checked.
 */
 
 /* Set Zero status flag. */
 #define SLJIT_SET_Z			0x0200
 /* Set the variable status flag if condition is true.
-   See comparison types. */
+   See comparison types (e.g. SLJIT_SET_LESS, SLJIT_SET_F_EQUAL). */
 #define SLJIT_SET(condition)			((condition) << 10)
 
-/* Notes:
-     - you cannot postpone conditional jump instructions except if noted that
-       the instruction does not set flags (See: SLJIT_KEEP_FLAGS).
-     - flag combinations: '|' means 'logical or'. */
-
 /* Starting index of opcodes for sljit_emit_op0. */
 #define SLJIT_OP0_BASE			0
 
@@ -968,10 +986,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *
 /* Flags: - (does not modify flags)
    ENDBR32 instruction for x86-32 and ENDBR64 instruction for x86-64
    when Intel Control-flow Enforcement Technology (CET) is enabled.
-   No instruction for other architectures.  */
+   No instructions are emitted for other architectures. */
 #define SLJIT_ENDBR			(SLJIT_OP0_BASE + 8)
 /* Flags: - (may destroy flags)
-   Skip stack frames before return.  */
+   Skip stack frames before return when Intel Control-flow
+   Enforcement Technology (CET) is enabled. No instructions
+   are emitted for other architectures. */
 #define SLJIT_SKIP_FRAMES_BEFORE_RETURN	(SLJIT_OP0_BASE + 9)
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op);
@@ -1015,26 +1035,38 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile
 /* Flags: - (does not modify flags) */
 #define SLJIT_MOV32			(SLJIT_OP1_BASE + 7)
 /* Flags: - (does not modify flags)
-   Note: load a pointer sized data, useful on x32 (a 32 bit mode on x86-64
-         where all x64 features are available, e.g. 16 register) or similar
-         compiling modes */
+   Note: loads pointer sized data, useful on x32 mode (a 64 bit mode
+         on x86-64 which uses 32 bit pointers) or similar compiling modes */
 #define SLJIT_MOV_P			(SLJIT_OP1_BASE + 8)
-/* Flags: Z
-   Note: immediate source argument is not supported */
-#define SLJIT_NOT			(SLJIT_OP1_BASE + 9)
-#define SLJIT_NOT32			(SLJIT_NOT | SLJIT_32)
 /* Count leading zeroes
    Flags: - (may destroy flags)
    Note: immediate source argument is not supported */
-#define SLJIT_CLZ			(SLJIT_OP1_BASE + 10)
+#define SLJIT_CLZ			(SLJIT_OP1_BASE + 9)
 #define SLJIT_CLZ32			(SLJIT_CLZ | SLJIT_32)
+/* Count trailing zeroes
+   Flags: - (may destroy flags)
+   Note: immediate source argument is not supported */
+#define SLJIT_CTZ			(SLJIT_OP1_BASE + 10)
+#define SLJIT_CTZ32			(SLJIT_CTZ | SLJIT_32)
+/* Reverse the order of bytes
+   Flags: - (may destroy flags)
+   Note: converts between little and big endian formats
+   Note: immediate source argument is not supported */
+#define SLJIT_REV			(SLJIT_OP1_BASE + 11)
+#define SLJIT_REV32			(SLJIT_REV | SLJIT_32)
+
+/* The following unary operations are supported by using sljit_emit_op2:
+     - binary not: SLJIT_XOR with immediate -1 as src1 or src2
+     - negate: SLJIT_SUB with immediate 0 as src1
+   Note: these operations are optimized by the compiler if the
+     target CPU has specialized instruction forms for them. */
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src, sljit_sw srcw);
 
 /* Starting index of opcodes for sljit_emit_op2. */
-#define SLJIT_OP2_BASE			96
+#define SLJIT_OP2_BASE			64
 
 /* Flags: Z | OVERFLOW | CARRY */
 #define SLJIT_ADD			(SLJIT_OP2_BASE + 0)
@@ -1044,7 +1076,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 #define SLJIT_ADDC32			(SLJIT_ADDC | SLJIT_32)
 /* Flags: Z | LESS | GREATER_EQUAL | GREATER | LESS_EQUAL
           SIG_LESS | SIG_GREATER_EQUAL | SIG_GREATER
-          SIG_LESS_EQUAL | CARRY */
+          SIG_LESS_EQUAL | OVERFLOW | CARRY */
 #define SLJIT_SUB			(SLJIT_OP2_BASE + 2)
 #define SLJIT_SUB32			(SLJIT_SUB | SLJIT_32)
 /* Flags: CARRY */
@@ -1071,78 +1103,164 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 #define SLJIT_SHL			(SLJIT_OP2_BASE + 8)
 #define SLJIT_SHL32			(SLJIT_SHL | SLJIT_32)
 /* Flags: Z
-   Let bit_length be the length of the shift operation: 32 or 64.
-   If src2 is immediate, src2w is masked by (bit_length - 1).
-   Otherwise, if the content of src2 is outside the range from 0
-   to bit_length - 1, the result is undefined. */
-#define SLJIT_LSHR			(SLJIT_OP2_BASE + 9)
-#define SLJIT_LSHR32			(SLJIT_LSHR | SLJIT_32)
+   Same as SLJIT_SHL, except that the second operand is
+   always masked by the length of the shift operation. */
+#define SLJIT_MSHL			(SLJIT_OP2_BASE + 9)
+#define SLJIT_MSHL32			(SLJIT_MSHL | SLJIT_32)
 /* Flags: Z
    Let bit_length be the length of the shift operation: 32 or 64.
    If src2 is immediate, src2w is masked by (bit_length - 1).
    Otherwise, if the content of src2 is outside the range from 0
    to bit_length - 1, the result is undefined. */
-#define SLJIT_ASHR			(SLJIT_OP2_BASE + 10)
+#define SLJIT_LSHR			(SLJIT_OP2_BASE + 10)
+#define SLJIT_LSHR32			(SLJIT_LSHR | SLJIT_32)
+/* Flags: Z
+   Same as SLJIT_LSHR, except that the second operand is
+   always masked by the length of the shift operation. */
+#define SLJIT_MLSHR			(SLJIT_OP2_BASE + 11)
+#define SLJIT_MLSHR32			(SLJIT_MLSHR | SLJIT_32)
+/* Flags: Z
+   Let bit_length be the length of the shift operation: 32 or 64.
+   If src2 is immediate, src2w is masked by (bit_length - 1).
+   Otherwise, if the content of src2 is outside the range from 0
+   to bit_length - 1, the result is undefined. */
+#define SLJIT_ASHR			(SLJIT_OP2_BASE + 12)
 #define SLJIT_ASHR32			(SLJIT_ASHR | SLJIT_32)
+/* Flags: Z
+   Same as SLJIT_ASHR, except that the second operand is
+   always masked by the length of the shift operation. */
+#define SLJIT_MASHR			(SLJIT_OP2_BASE + 13)
+#define SLJIT_MASHR32			(SLJIT_MASHR | SLJIT_32)
+/* Flags: - (may destroy flags)
+   Let bit_length be the length of the rotate operation: 32 or 64.
+   The second operand is always masked by (bit_length - 1). */
+#define SLJIT_ROTL			(SLJIT_OP2_BASE + 14)
+#define SLJIT_ROTL32			(SLJIT_ROTL | SLJIT_32)
+/* Flags: - (may destroy flags)
+   Let bit_length be the length of the rotate operation: 32 or 64.
+   The second operand is always masked by (bit_length - 1). */
+#define SLJIT_ROTR			(SLJIT_OP2_BASE + 15)
+#define SLJIT_ROTR32			(SLJIT_ROTR | SLJIT_32)
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src1, sljit_sw src1w,
 	sljit_s32 src2, sljit_sw src2w);
 
-/* The sljit_emit_op2u function is the same as sljit_emit_op2 except the result is discarded. */
+/* The sljit_emit_op2u function is the same as sljit_emit_op2
+   except the result is discarded. */
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src1, sljit_sw src1w,
 	sljit_s32 src2, sljit_sw src2w);
 
-/* Starting index of opcodes for sljit_emit_op2. */
-#define SLJIT_OP_SRC_BASE		128
+/* Emit a left or right shift operation, where the bits shifted
+   in come from a separate source operand. All operands are
+   interpreted as unsigned integers.
 
-/* Note: src cannot be an immedate value
+   In the following, the value_mask variable is 31 for 32 bit
+     operations and word_size - 1 otherwise.
+
+   op must be one of the following operations:
+     SLJIT_SHL or SLJIT_SHL32:
+       dst_reg = src1_reg << src3_reg
+       dst_reg |= ((src2_reg >> 1) >> (src3 ^ value_mask))
+     SLJIT_MSHL or SLJIT_MSHL32:
+       src3 &= value_mask
+       perform the SLJIT_SHL or SLJIT_SHL32 operation
+     SLJIT_LSHR or SLJIT_LSHR32:
+       dst_reg = src1_reg >> src3_reg
+       dst_reg |= ((src2_reg << 1) << (src3 ^ value_mask))
+     SLJIT_MLSHR or SLJIT_MLSHR32:
+       src3 &= value_mask
+       perform the SLJIT_LSHR or SLJIT_LSHR32 operation
+
+   op can be combined (or'ed) with SLJIT_SHIFT_INTO_NON_ZERO
+
+   dst_reg specifies the destination register, where dst_reg
+     and src2_reg cannot be the same registers
+   src1_reg specifies the source register
+   src2_reg specifies the register which is shifted into src1_reg
+   src3 / src3w contains the shift amount
+
+   Note: a rotate operation is performed if src1_reg and
+         src2_reg are the same registers
+
+   Flags: - (may destroy flags) */
+
+/* The src3 operand contains a non-zero value. Improves
+   the generated code on certain architectures, which
+   provides a small performance improvement. */
+#define SLJIT_SHIFT_INTO_NON_ZERO	0x200
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst_reg,
+	sljit_s32 src1_reg,
+	sljit_s32 src2_reg,
+	sljit_s32 src3, sljit_sw src3w);
+
+/* Starting index of opcodes for sljit_emit_op_src
+   and sljit_emit_op_dst. */
+#define SLJIT_OP_SRC_DST_BASE		96
+
+/* Fast return, see SLJIT_FAST_CALL for more details.
+   Note: src cannot be an immediate value
    Flags: - (does not modify flags) */
-#define SLJIT_FAST_RETURN		(SLJIT_OP_SRC_BASE + 0)
+#define SLJIT_FAST_RETURN		(SLJIT_OP_SRC_DST_BASE + 0)
 /* Skip stack frames before fast return.
    Note: src cannot be an immedate value
    Flags: may destroy flags. */
-#define SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN	(SLJIT_OP_SRC_BASE + 1)
+#define SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN	(SLJIT_OP_SRC_DST_BASE + 1)
 /* Prefetch value into the level 1 data cache
    Note: if the target CPU does not support data prefetch,
          no instructions are emitted.
    Note: this instruction never fails, even if the memory address is invalid.
    Flags: - (does not modify flags) */
-#define SLJIT_PREFETCH_L1		(SLJIT_OP_SRC_BASE + 2)
+#define SLJIT_PREFETCH_L1		(SLJIT_OP_SRC_DST_BASE + 2)
 /* Prefetch value into the level 2 data cache
    Note: same as SLJIT_PREFETCH_L1 if the target CPU
          does not support this instruction form.
    Note: this instruction never fails, even if the memory address is invalid.
    Flags: - (does not modify flags) */
-#define SLJIT_PREFETCH_L2		(SLJIT_OP_SRC_BASE + 3)
+#define SLJIT_PREFETCH_L2		(SLJIT_OP_SRC_DST_BASE + 3)
 /* Prefetch value into the level 3 data cache
    Note: same as SLJIT_PREFETCH_L2 if the target CPU
          does not support this instruction form.
    Note: this instruction never fails, even if the memory address is invalid.
    Flags: - (does not modify flags) */
-#define SLJIT_PREFETCH_L3		(SLJIT_OP_SRC_BASE + 4)
+#define SLJIT_PREFETCH_L3		(SLJIT_OP_SRC_DST_BASE + 4)
 /* Prefetch a value which is only used once (and can be discarded afterwards)
    Note: same as SLJIT_PREFETCH_L1 if the target CPU
          does not support this instruction form.
    Note: this instruction never fails, even if the memory address is invalid.
    Flags: - (does not modify flags) */
-#define SLJIT_PREFETCH_ONCE		(SLJIT_OP_SRC_BASE + 5)
+#define SLJIT_PREFETCH_ONCE		(SLJIT_OP_SRC_DST_BASE + 5)
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw);
 
+/* Fast enter, see SLJIT_FAST_CALL for more details.
+   Flags: - (does not modify flags) */
+#define SLJIT_FAST_ENTER		(SLJIT_OP_SRC_DST_BASE + 6)
+
+/* Copies the return address into dst. The return address is the
+   address where the execution continues after the called function
+   returns (see: sljit_emit_return / sljit_emit_return_void).
+   Flags: - (does not modify flags) */
+#define SLJIT_GET_RETURN_ADDRESS	(SLJIT_OP_SRC_DST_BASE + 7)
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_dst(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw);
+
 /* Starting index of opcodes for sljit_emit_fop1. */
-#define SLJIT_FOP1_BASE			160
+#define SLJIT_FOP1_BASE			128
 
 /* Flags: - (does not modify flags) */
 #define SLJIT_MOV_F64			(SLJIT_FOP1_BASE + 0)
 #define SLJIT_MOV_F32			(SLJIT_MOV_F64 | SLJIT_32)
 /* Convert opcodes: CONV[DST_TYPE].FROM[SRC_TYPE]
-   SRC/DST TYPE can be: D - double, S - single, W - signed word, I - signed int
-   Rounding mode when the destination is W or I: round towards zero. */
+   SRC/DST TYPE can be: F64, F32, S32, SW
+   Rounding mode when the destination is SW or S32: round towards zero. */
 /* Flags: - (may destroy flags) */
 #define SLJIT_CONV_F64_FROM_F32		(SLJIT_FOP1_BASE + 1)
 #define SLJIT_CONV_F32_FROM_F64		(SLJIT_CONV_F64_FROM_F32 | SLJIT_32)
@@ -1158,7 +1276,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *comp
 /* Flags: - (may destroy flags) */
 #define SLJIT_CONV_F64_FROM_S32		(SLJIT_FOP1_BASE + 5)
 #define SLJIT_CONV_F32_FROM_S32		(SLJIT_CONV_F64_FROM_S32 | SLJIT_32)
-/* Note: dst is the left and src is the right operand for SLJIT_CMPD.
+/* Note: dst is the left and src is the right operand for SLJIT_CMP_F32/64.
    Flags: EQUAL_F | LESS_F | GREATER_EQUAL_F | GREATER_F | LESS_EQUAL_F */
 #define SLJIT_CMP_F64			(SLJIT_FOP1_BASE + 6)
 #define SLJIT_CMP_F32			(SLJIT_CMP_F64 | SLJIT_32)
@@ -1174,7 +1292,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compil
 	sljit_s32 src, sljit_sw srcw);
 
 /* Starting index of opcodes for sljit_emit_fop2. */
-#define SLJIT_FOP2_BASE			192
+#define SLJIT_FOP2_BASE			160
 
 /* Flags: - (may destroy flags) */
 #define SLJIT_ADD_F64			(SLJIT_FOP2_BASE + 0)
@@ -1194,10 +1312,70 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compil
 	sljit_s32 src1, sljit_sw src1w,
 	sljit_s32 src2, sljit_sw src2w);
 
+/* The following opcodes are used by sljit_emit_fcopy(). */
+
+/* 64 bit: copy a 64 bit value from an integer register into a
+           64 bit floating point register without any modifications.
+   32 bit: copy a 32 bit register or register pair into a 64 bit
+           floating point register without any modifications. The
+           register, or the first register of the register pair
+           replaces the high order 32 bit of the floating point
+           register. If a register pair is passed, the low
+           order 32 bit is replaced by the second register.
+           Otherwise, the low order 32 bit is unchanged. */
+#define SLJIT_COPY_TO_F64		1
+/* Copy a 32 bit value from an integer register into a 32 bit
+   floating point register without any modifications. */
+#define SLJIT_COPY32_TO_F32		(SLJIT_COPY_TO_F64 | SLJIT_32)
+/* 64 bit: copy the value of a 64 bit floating point register into
+           an integer register without any modifications.
+   32 bit: copy a 64 bit floating point register into a 32 bit register
+           or a 32 bit register pair without any modifications. The
+           high order 32 bit of the floating point register is copied
+           into the register, or the first register of the register
+           pair. If a register pair is passed, the low order 32 bit
+           is copied into the second register. */
+#define SLJIT_COPY_FROM_F64		2
+/* Copy the value of a 32 bit floating point register into an integer
+   register without any modifications. The register should be processed
+   with 32 bit operations later. */
+#define SLJIT_COPY32_FROM_F32		(SLJIT_COPY_FROM_F64 | SLJIT_32)
+
+/* Special data copy which involves floating point registers.
+
+  op must be between SLJIT_COPY_TO_F64 and SLJIT_COPY32_FROM_F32
+  freg must be a floating point register
+  reg must be a register or register pair */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg);
+
 /* Label and jump instructions. */
 
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler);
 
+/* The SLJIT_FAST_CALL is a calling method for creating lightweight function
+   calls. This type of call preserves the values of all registers and stack
+   frame. Unlike normal function calls, the enter and return operations must
+   be performed by the SLJIT_FAST_ENTER and SLJIT_FAST_RETURN operations
+   respectively. The return address is stored in the dst argument of the
+   SLJIT_FAST_ENTER operation, and this return address should be passed as
+   the src argument for the SLJIT_FAST_RETURN operation to return from the
+   called function.
+
+   Fast calls are cheap operations (usually only a single call instruction is
+   emitted) but they do not preserve any registers. However the callee function
+   can freely use / update any registers and the locals area which can be
+   efficiently exploited by various optimizations. Registers can be saved
+   and restored manually if needed.
+
+   Although returning to a different address by SLJIT_FAST_RETURN is possible,
+   this address usually cannot be predicted by the return address predictor of
+   modern CPUs which may reduce performance. Furthermore certain security
+   enhancement technologies such as Intel Control-flow Enforcement Technology
+   (CET) may disallow returning to a different address (indirect jumps
+   can be used instead, see SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN). */
+
 /* Invert (negate) conditional type: xor (^) with 0x1 */
 
 /* Integer comparison types. */
@@ -1209,25 +1387,25 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compi
 #define SLJIT_LESS			2
 #define SLJIT_SET_LESS			SLJIT_SET(SLJIT_LESS)
 #define SLJIT_GREATER_EQUAL		3
-#define SLJIT_SET_GREATER_EQUAL		SLJIT_SET(SLJIT_GREATER_EQUAL)
+#define SLJIT_SET_GREATER_EQUAL		SLJIT_SET(SLJIT_LESS)
 #define SLJIT_GREATER			4
 #define SLJIT_SET_GREATER		SLJIT_SET(SLJIT_GREATER)
 #define SLJIT_LESS_EQUAL		5
-#define SLJIT_SET_LESS_EQUAL		SLJIT_SET(SLJIT_LESS_EQUAL)
+#define SLJIT_SET_LESS_EQUAL		SLJIT_SET(SLJIT_GREATER)
 #define SLJIT_SIG_LESS			6
 #define SLJIT_SET_SIG_LESS		SLJIT_SET(SLJIT_SIG_LESS)
 #define SLJIT_SIG_GREATER_EQUAL		7
-#define SLJIT_SET_SIG_GREATER_EQUAL	SLJIT_SET(SLJIT_SIG_GREATER_EQUAL)
+#define SLJIT_SET_SIG_GREATER_EQUAL	SLJIT_SET(SLJIT_SIG_LESS)
 #define SLJIT_SIG_GREATER		8
 #define SLJIT_SET_SIG_GREATER		SLJIT_SET(SLJIT_SIG_GREATER)
 #define SLJIT_SIG_LESS_EQUAL		9
-#define SLJIT_SET_SIG_LESS_EQUAL	SLJIT_SET(SLJIT_SIG_LESS_EQUAL)
+#define SLJIT_SET_SIG_LESS_EQUAL	SLJIT_SET(SLJIT_SIG_GREATER)
 
 #define SLJIT_OVERFLOW			10
 #define SLJIT_SET_OVERFLOW		SLJIT_SET(SLJIT_OVERFLOW)
 #define SLJIT_NOT_OVERFLOW		11
 
-/* Unlike other flags, sljit_emit_jump may destroy this flag. */
+/* Unlike other flags, sljit_emit_jump may destroy the carry flag. */
 #define SLJIT_CARRY			12
 #define SLJIT_SET_CARRY			SLJIT_SET(SLJIT_CARRY)
 #define SLJIT_NOT_CARRY			13
@@ -1239,22 +1417,22 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compi
 #define SLJIT_F_EQUAL				14
 #define SLJIT_SET_F_EQUAL			SLJIT_SET(SLJIT_F_EQUAL)
 #define SLJIT_F_NOT_EQUAL			15
-#define SLJIT_SET_F_NOT_EQUAL			SLJIT_SET(SLJIT_F_NOT_EQUAL)
+#define SLJIT_SET_F_NOT_EQUAL			SLJIT_SET(SLJIT_F_EQUAL)
 #define SLJIT_F_LESS				16
 #define SLJIT_SET_F_LESS			SLJIT_SET(SLJIT_F_LESS)
 #define SLJIT_F_GREATER_EQUAL			17
-#define SLJIT_SET_F_GREATER_EQUAL		SLJIT_SET(SLJIT_F_GREATER_EQUAL)
+#define SLJIT_SET_F_GREATER_EQUAL		SLJIT_SET(SLJIT_F_LESS)
 #define SLJIT_F_GREATER				18
 #define SLJIT_SET_F_GREATER			SLJIT_SET(SLJIT_F_GREATER)
 #define SLJIT_F_LESS_EQUAL			19
-#define SLJIT_SET_F_LESS_EQUAL			SLJIT_SET(SLJIT_F_LESS_EQUAL)
+#define SLJIT_SET_F_LESS_EQUAL			SLJIT_SET(SLJIT_F_GREATER)
 
 /* Jumps when either argument contains a NaN value. */
 #define SLJIT_UNORDERED				20
 #define SLJIT_SET_UNORDERED			SLJIT_SET(SLJIT_UNORDERED)
 /* Jumps when neither argument contains a NaN value. */
 #define SLJIT_ORDERED				21
-#define SLJIT_SET_ORDERED			SLJIT_SET(SLJIT_ORDERED)
+#define SLJIT_SET_ORDERED			SLJIT_SET(SLJIT_UNORDERED)
 
 /* Ordered / unordered floating point comparison types.
 
@@ -1264,37 +1442,37 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compi
 #define SLJIT_ORDERED_EQUAL			22
 #define SLJIT_SET_ORDERED_EQUAL			SLJIT_SET(SLJIT_ORDERED_EQUAL)
 #define SLJIT_UNORDERED_OR_NOT_EQUAL		23
-#define SLJIT_SET_UNORDERED_OR_NOT_EQUAL	SLJIT_SET(SLJIT_UNORDERED_OR_NOT_EQUAL)
+#define SLJIT_SET_UNORDERED_OR_NOT_EQUAL	SLJIT_SET(SLJIT_ORDERED_EQUAL)
 #define SLJIT_ORDERED_LESS			24
 #define SLJIT_SET_ORDERED_LESS			SLJIT_SET(SLJIT_ORDERED_LESS)
 #define SLJIT_UNORDERED_OR_GREATER_EQUAL	25
-#define SLJIT_SET_UNORDERED_OR_GREATER_EQUAL	SLJIT_SET(SLJIT_UNORDERED_OR_GREATER_EQUAL)
+#define SLJIT_SET_UNORDERED_OR_GREATER_EQUAL	SLJIT_SET(SLJIT_ORDERED_LESS)
 #define SLJIT_ORDERED_GREATER			26
 #define SLJIT_SET_ORDERED_GREATER		SLJIT_SET(SLJIT_ORDERED_GREATER)
 #define SLJIT_UNORDERED_OR_LESS_EQUAL		27
-#define SLJIT_SET_UNORDERED_OR_LESS_EQUAL	SLJIT_SET(SLJIT_UNORDERED_OR_LESS_EQUAL)
+#define SLJIT_SET_UNORDERED_OR_LESS_EQUAL	SLJIT_SET(SLJIT_ORDERED_GREATER)
 
 #define SLJIT_UNORDERED_OR_EQUAL		28
 #define SLJIT_SET_UNORDERED_OR_EQUAL		SLJIT_SET(SLJIT_UNORDERED_OR_EQUAL)
 #define SLJIT_ORDERED_NOT_EQUAL			29
-#define SLJIT_SET_ORDERED_NOT_EQUAL		SLJIT_SET(SLJIT_ORDERED_NOT_EQUAL)
+#define SLJIT_SET_ORDERED_NOT_EQUAL		SLJIT_SET(SLJIT_UNORDERED_OR_EQUAL)
 #define SLJIT_UNORDERED_OR_LESS			30
 #define SLJIT_SET_UNORDERED_OR_LESS		SLJIT_SET(SLJIT_UNORDERED_OR_LESS)
 #define SLJIT_ORDERED_GREATER_EQUAL		31
-#define SLJIT_SET_ORDERED_GREATER_EQUAL		SLJIT_SET(SLJIT_ORDERED_GREATER_EQUAL)
+#define SLJIT_SET_ORDERED_GREATER_EQUAL		SLJIT_SET(SLJIT_UNORDERED_OR_LESS)
 #define SLJIT_UNORDERED_OR_GREATER		32
 #define SLJIT_SET_UNORDERED_OR_GREATER		SLJIT_SET(SLJIT_UNORDERED_OR_GREATER)
 #define SLJIT_ORDERED_LESS_EQUAL		33
-#define SLJIT_SET_ORDERED_LESS_EQUAL		SLJIT_SET(SLJIT_ORDERED_LESS_EQUAL)
+#define SLJIT_SET_ORDERED_LESS_EQUAL		SLJIT_SET(SLJIT_UNORDERED_OR_GREATER)
 
 /* Unconditional jump types. */
 #define SLJIT_JUMP			34
-	/* Fast calling method. See sljit_emit_fast_enter / SLJIT_FAST_RETURN. */
+/* Fast calling method. See the description above. */
 #define SLJIT_FAST_CALL			35
-	/* Default C calling convention. */
+/* Default C calling convention. */
 #define SLJIT_CALL			36
-	/* Called function must be an sljit compiled function.
-	   See SLJIT_ENTER_REG_ARG option. */
+/* Called function must be compiled by SLJIT.
+   See SLJIT_ENTER_REG_ARG option. */
 #define SLJIT_CALL_REG_ARG		37
 
 /* The target can be changed during runtime (see: sljit_set_jump_addr). */
@@ -1304,7 +1482,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compi
    stack usage is reduced before the call, but it is not necessarily reduced
    to zero. In the latter case the compiler needs to allocate space for some
    arguments and the return address must be stored on the stack as well. */
-#define SLJIT_CALL_RETURN			0x2000
+#define SLJIT_CALL_RETURN		0x2000
 
 /* Emit a jump instruction. The destination is not set, only the type of the jump.
     type must be between SLJIT_EQUAL and SLJIT_FAST_CALL
@@ -1314,18 +1492,18 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compi
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type);
 
 /* Emit a C compiler (ABI) compatible function call.
-    type must be SLJIT_CALL or SLJIT_CALL_CDECL
-    type can be combined (or'ed) with SLJIT_REWRITABLE_JUMP and SLJIT_CALL_RETURN
-    arg_types is the combination of SLJIT_RET / SLJIT_ARGx (SLJIT_DEF_RET / SLJIT_DEF_ARGx) macros
+    type must be SLJIT_CALL or SLJIT_CALL_REG_ARG
+    type can be combined (or'ed) with SLJIT_REWRITABLE_JUMP and/or SLJIT_CALL_RETURN
+    arg_types can be specified by SLJIT_ARGSx (SLJIT_ARG_RETURN / SLJIT_ARG_VALUE) macros
 
    Flags: destroy all flags. */
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 arg_types);
 
 /* Basic arithmetic comparison. In most architectures it is implemented as
-   an compare operation followed by a sljit_emit_jump. However some
-   architectures (i.e: ARM64 or MIPS) may employ special optimizations here.
-   It is suggested to use this comparison form when appropriate.
-    type must be between SLJIT_EQUAL and SLJIT_I_SIG_LESS_EQUAL
+   a compare operation followed by a sljit_emit_jump. However some
+   architectures (i.e: ARM64 or MIPS) may employ special optimizations
+   here. It is suggested to use this comparison form when appropriate.
+    type must be between SLJIT_EQUAL and SLJIT_SIG_LESS_EQUAL
     type can be combined (or'ed) with SLJIT_REWRITABLE_JUMP
 
    Flags: may destroy flags. */
@@ -1334,15 +1512,14 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_cmp(struct sljit_compiler
 	sljit_s32 src2, sljit_sw src2w);
 
 /* Basic floating point comparison. In most architectures it is implemented as
-   an SLJIT_FCMP operation (setting appropriate flags) followed by a
+   a SLJIT_CMP_F32/64 operation (setting appropriate flags) followed by a
    sljit_emit_jump. However some architectures (i.e: MIPS) may employ
    special optimizations here. It is suggested to use this comparison form
    when appropriate.
     type must be between SLJIT_F_EQUAL and SLJIT_ORDERED_LESS_EQUAL
     type can be combined (or'ed) with SLJIT_REWRITABLE_JUMP
    Flags: destroy flags.
-   Note: if either operand is NaN, the behaviour is undefined for
-         types up to SLJIT_S_LESS_EQUAL. */
+   Note: when an operand is NaN the behaviour depends on the comparison type. */
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_fcmp(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 src1, sljit_sw src1w,
 	sljit_s32 src2, sljit_sw src2w);
@@ -1363,22 +1540,22 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compi
 /* Emit a C compiler (ABI) compatible function call.
    Direct form: set src to SLJIT_IMM() and srcw to the address
    Indirect form: any other valid addressing mode
-    type must be SLJIT_CALL or SLJIT_CALL_CDECL
+    type must be SLJIT_CALL or SLJIT_CALL_REG_ARG
     type can be combined (or'ed) with SLJIT_CALL_RETURN
-    arg_types is the combination of SLJIT_RET / SLJIT_ARGx (SLJIT_DEF_RET / SLJIT_DEF_ARGx) macros
+    arg_types can be specified by SLJIT_ARGSx (SLJIT_ARG_RETURN / SLJIT_ARG_VALUE) macros
 
    Flags: destroy all flags. */
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 arg_types, sljit_s32 src, sljit_sw srcw);
 
-/* Perform the operation using the conditional flags as the second argument.
-   Type must always be between SLJIT_EQUAL and SLJIT_ORDERED_LESS_EQUAL. The value
-   represented by the type is 1, if the condition represented by the type
-   is fulfilled, and 0 otherwise.
+/* Perform an operation using the conditional flags as the second argument.
+   Type must always be between SLJIT_EQUAL and SLJIT_ORDERED_LESS_EQUAL.
+   The value represented by the type is 1, if the condition represented
+   by the type is fulfilled, and 0 otherwise.
 
-   If op == SLJIT_MOV, SLJIT_MOV32:
+   When op is SLJIT_MOV or SLJIT_MOV32:
      Set dst to the value represented by the type (0 or 1).
      Flags: - (does not modify flags)
-   If op == SLJIT_OR, op == SLJIT_AND, op == SLJIT_XOR
+   When op is SLJIT_AND, SLJIT_AND32, SLJIT_OR, SLJIT_OR32, SLJIT_XOR, or SLJIT_XOR32
      Performs the binary operation using dst as the first, and the value
      represented by type as the second argument. Result is written into dst.
      Flags: Z (may destroy flags) */
@@ -1391,49 +1568,50 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co
    instruction does not support memory access.
 
    type must be between SLJIT_EQUAL and SLJIT_ORDERED_LESS_EQUAL
-   dst_reg must be a valid register and it can be combined
-      with SLJIT_32 to perform a 32 bit arithmetic operation
-   src must be register or immediate (SLJIT_IMM)
+   type can be combined (or'ed) with SLJIT_32
+   dst_reg must be a valid register
+   src must be a valid register or immediate (SLJIT_IMM)
 
    Flags: - (does not modify flags) */
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 dst_reg,
 	sljit_s32 src, sljit_sw srcw);
 
-/* The following flags are used by sljit_emit_mem() and sljit_emit_fmem(). */
+/* The following flags are used by sljit_emit_mem(), sljit_emit_mem_update(),
+   sljit_emit_fmem(), and sljit_emit_fmem_update(). */
 
 /* Memory load operation. This is the default. */
 #define SLJIT_MEM_LOAD		0x000000
 /* Memory store operation. */
 #define SLJIT_MEM_STORE		0x000200
 
-/* Load or stora data from an unaligned address. */
+/* The following flags are used by sljit_emit_mem() and sljit_emit_fmem(). */
+
+/* Load or store data from an unaligned (byte aligned) address. */
 #define SLJIT_MEM_UNALIGNED	0x000400
-/* Load or store data and update the base address with a single operation. */
-/* Base register is updated before the memory access. */
-#define SLJIT_MEM_PRE		0x000800
+/* Load or store data from a 16 bit aligned address. */
+#define SLJIT_MEM_UNALIGNED_16	0x000800
+/* Load or store data from a 32 bit aligned address. */
+#define SLJIT_MEM_UNALIGNED_32	0x001000
+
+/* The following flags are used by sljit_emit_mem_update(),
+   and sljit_emit_fmem_update(). */
+
+/* Base register is updated before the memory access (default). */
+#define SLJIT_MEM_PRE		0x000000
 /* Base register is updated after the memory access. */
-#define SLJIT_MEM_POST		0x001000
-
-/* The following flags are supported when SLJIT_MEM_UNALIGNED is specified: */
-
-/* Defines 16 bit alignment for unaligned accesses. */
-#define SLJIT_MEM_ALIGNED_16	0x010000
-/* Defines 32 bit alignment for unaligned accesses. */
-#define SLJIT_MEM_ALIGNED_32	0x020000
-
-/* The following flags are supported when SLJIT_MEM_PRE or
-   SLJIT_MEM_POST is specified: */
+#define SLJIT_MEM_POST		0x000400
 
 /* When SLJIT_MEM_SUPP is passed, no instructions are emitted.
    Instead the function returns with SLJIT_SUCCESS if the instruction
    form is supported and SLJIT_ERR_UNSUPPORTED otherwise. This flag
    allows runtime checking of available instruction forms. */
-#define SLJIT_MEM_SUPP		0x010000
+#define SLJIT_MEM_SUPP		0x000800
 
 /* The sljit_emit_mem emits instructions for various memory operations:
 
-   When SLJIT_MEM_UNALIGNED is set in type argument:
+   When SLJIT_MEM_UNALIGNED / SLJIT_MEM_UNALIGNED_16 /
+        SLJIT_MEM_UNALIGNED_32 is set in type argument:
      Emit instructions for unaligned memory loads or stores. When
      SLJIT_UNALIGNED is not defined, the only way to access unaligned
      memory data is using sljit_emit_mem. Otherwise all operations (e.g.
@@ -1448,24 +1626,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
      location specified by the mem/memw arguments, and the end address
      of this operation is the starting address of the data transfer
      between the second register and memory. The type argument must
-     be SLJIT_MOV. The SLJIT_MEM_UNALIGNED flag and its options are
-     allowed for this operation.
-
-   When SLJIT_MEM_PRE or SLJIT_MEM_POST is set in type argument:
-     Emit a single memory load or store with update instruction.
-     When the requested instruction form is not supported by the CPU,
-     it returns with SLJIT_ERR_UNSUPPORTED instead of emulating the
-     instruction. This allows specializing tight loops based on
-     the supported instruction forms (see SLJIT_MEM_SUPP flag).
-     Absolute address (SLJIT_MEM0) forms are never supported
-     and the base (first) register specified by the mem argument
-     must not be SLJIT_SP and must also be different from the
-     register specified by the reg argument.
+     be SLJIT_MOV. The SLJIT_MEM_UNALIGNED* options are allowed for
+     this operation.
 
    type must be between SLJIT_MOV and SLJIT_MOV_P and can be
-     combined with SLJIT_MEM_* flags.
+     combined (or'ed) with SLJIT_MEM_* flags
    reg is a register or register pair, which is the source or
-     destination of the operation.
+     destination of the operation
    mem must be a memory operand
 
    Flags: - (does not modify flags) */
@@ -1473,11 +1640,35 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 	sljit_s32 reg,
 	sljit_s32 mem, sljit_sw memw);
 
+/* Emit a single memory load or store with update instruction.
+   When the requested instruction form is not supported by the CPU,
+   it returns with SLJIT_ERR_UNSUPPORTED instead of emulating the
+   instruction. This allows specializing tight loops based on
+   the supported instruction forms (see SLJIT_MEM_SUPP flag).
+   Absolute address (SLJIT_MEM0) forms are never supported
+   and the base (first) register specified by the mem argument
+   must not be SLJIT_SP and must also be different from the
+   register specified by the reg argument.
+
+   type must be between SLJIT_MOV and SLJIT_MOV_P and can be
+     combined (or'ed) with SLJIT_MEM_* flags
+   reg is the source or destination register of the operation
+   mem must be a memory operand
+
+   Flags: - (does not modify flags) */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw);
+
 /* Same as sljit_emit_mem except the followings:
 
+   Loading or storing a pair of registers is not supported.
+
    type must be SLJIT_MOV_F64 or SLJIT_MOV_F32 and can be
-     combined with SLJIT_MEM_* flags.
+     combined (or'ed) with SLJIT_MEM_* flags.
    freg is the source or destination floating point register
+     of the operation
    mem must be a memory operand
 
    Flags: - (does not modify flags) */
@@ -1486,14 +1677,29 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compil
 	sljit_s32 freg,
 	sljit_s32 mem, sljit_sw memw);
 
-/* Copies the base address of SLJIT_SP + offset to dst. The offset can be
-   anything to negate the effect of relative addressing. For example if an
-   array of sljit_sw values is stored on the stack from offset 0x40, and R0
-   contains the offset of an array item plus 0x120, this item can be
-   overwritten by two SLJIT instructions:
+/* Same as sljit_emit_mem_update except for the following:
+
+   type must be SLJIT_MOV_F64 or SLJIT_MOV_F32 and can be
+     combined (or'ed) with SLJIT_MEM_* flags
+   freg is the source or destination floating point register
+     of the operation
+   mem must be a memory operand
+
+   Flags: - (does not modify flags) */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 mem, sljit_sw memw);
+
+/* Copies the base address of SLJIT_SP + offset to dst. The offset can
+   represent the starting address of a value in the local data (stack).
+   The offset is not limited by the local data limits, it can be any value.
+   For example if an array of bytes is stored on the stack from
+   offset 0x40, and R0 contains the offset of an array item plus 0x120,
+   this item can be changed by two SLJIT instructions:
 
    sljit_get_local_base(compiler, SLJIT_R1, 0, 0x40 - 0x120);
-   sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM2(SLJIT_R1, SLJIT_R0), 0, SLJIT_IMM, 0x5);
+   sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM2(SLJIT_R1, SLJIT_R0), 0, SLJIT_IMM, 0x5);
 
    Flags: - (may destroy flags) */
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset);
@@ -1521,19 +1727,72 @@ static SLJIT_INLINE sljit_uw sljit_get_const_addr(struct sljit_const *const_) {
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset);
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset);
 
+/* --------------------------------------------------------------------- */
+/*  CPU specific functions                                               */
+/* --------------------------------------------------------------------- */
+
+/* The following function is a helper function for sljit_emit_op_custom.
+   It returns with the real machine register index ( >=0 ) of any SLJIT_R,
+   SLJIT_S and SLJIT_SP registers.
+
+   Note: it returns with -1 for virtual registers (only on x86-32). */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg);
+
+/* The following function is a helper function for sljit_emit_op_custom.
+   It returns with the real machine register index ( >= 0 ) of any SLJIT_FR
+   or SLJIT_FS register.
+
+   Note: the index is always an even number on ARM-32, MIPS. */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg);
+
+/* Any instruction can be inserted into the instruction stream by
+   sljit_emit_op_custom. It has a similar purpose as inline assembly.
+   The size parameter must match the instruction size of the target
+   architecture:
+
+         x86: 0 < size <= 15. The instruction argument can be byte aligned.
+      Thumb2: if size == 2, the instruction argument must be 2 byte aligned.
+              if size == 4, the instruction argument must be 4 byte aligned.
+   Otherwise: size must be 4 and instruction argument must be 4 byte aligned. */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
+	void *instruction, sljit_u32 size);
+
+/* Flags were set by a 32 bit operation. */
+#define SLJIT_CURRENT_FLAGS_32			SLJIT_32
+
+/* Flags were set by an ADD or ADDC operation. */
+#define SLJIT_CURRENT_FLAGS_ADD			0x01
+/* Flags were set by a SUB, SUBC, or NEG operation. */
+#define SLJIT_CURRENT_FLAGS_SUB			0x02
+
+/* Flags were set by sljit_emit_op2u with SLJIT_SUB opcode.
+   Must be combined with SLJIT_CURRENT_FLAGS_SUB. */
+#define SLJIT_CURRENT_FLAGS_COMPARE		0x04
+
+/* Define the currently available CPU status flags. It is usually used after
+   an sljit_emit_label or sljit_emit_op_custom operation to define which CPU
+   status flags are available.
+
+   The current_flags must be a valid combination of SLJIT_SET_* and
+   SLJIT_CURRENT_FLAGS_* constants. */
+
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_current_flags(struct sljit_compiler *compiler,
+	sljit_s32 current_flags);
+
 /* --------------------------------------------------------------------- */
 /*  Miscellaneous utility functions                                      */
 /* --------------------------------------------------------------------- */
 
-#define SLJIT_MAJOR_VERSION	0
-#define SLJIT_MINOR_VERSION	94
-
 /* Get the human readable name of the platform. Can be useful on platforms
-   like ARM, where ARM and Thumb2 functions can be mixed, and
-   it is useful to know the type of the code generator. */
+   like ARM, where ARM and Thumb2 functions can be mixed, and it is useful
+   to know the type of the code generator. */
 SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void);
 
-/* Portable helper function to get an offset of a member. */
+/* Portable helper function to get an offset of a member.
+   Same as offsetof() macro defined in stddef.h */
 #define SLJIT_OFFSETOF(base, member) ((sljit_sw)(&((base*)0x10)->member) - 0x10)
 
 #if (defined SLJIT_UTIL_STACK && SLJIT_UTIL_STACK)
@@ -1624,60 +1883,6 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_set_function_context(void** func_ptr, struct
 SLJIT_API_FUNC_ATTRIBUTE void sljit_free_unused_memory_exec(void);
 #endif
 
-/* --------------------------------------------------------------------- */
-/*  CPU specific functions                                               */
-/* --------------------------------------------------------------------- */
-
-/* The following function is a helper function for sljit_emit_op_custom.
-   It returns with the real machine register index ( >=0 ) of any SLJIT_R,
-   SLJIT_S and SLJIT_SP registers.
-
-   Note: it returns with -1 for virtual registers (only on x86-32). */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg);
-
-/* The following function is a helper function for sljit_emit_op_custom.
-   It returns with the real machine register index of any SLJIT_FLOAT register.
-
-   Note: the index is always an even number on ARM-32, MIPS. */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg);
-
-/* Any instruction can be inserted into the instruction stream by
-   sljit_emit_op_custom. It has a similar purpose as inline assembly.
-   The size parameter must match to the instruction size of the target
-   architecture:
-
-         x86: 0 < size <= 15. The instruction argument can be byte aligned.
-      Thumb2: if size == 2, the instruction argument must be 2 byte aligned.
-              if size == 4, the instruction argument must be 4 byte aligned.
-   Otherwise: size must be 4 and instruction argument must be 4 byte aligned. */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
-	void *instruction, sljit_u32 size);
-
-/* Flags were set by a 32 bit operation. */
-#define SLJIT_CURRENT_FLAGS_32			SLJIT_32
-
-/* Flags were set by an ADD or ADDC operations. */
-#define SLJIT_CURRENT_FLAGS_ADD			0x01
-/* Flags were set by a SUB, SUBC, or NEG operation. */
-#define SLJIT_CURRENT_FLAGS_SUB			0x02
-
-/* Flags were set by sljit_emit_op2u with SLJIT_SUB opcode.
-   Must be combined with SLJIT_CURRENT_FLAGS_SUB. */
-#define SLJIT_CURRENT_FLAGS_COMPARE		0x04
-
-/* Define the currently available CPU status flags. It is usually used after
-   an sljit_emit_label or sljit_emit_op_custom operations to define which CPU
-   status flags are available.
-
-   The current_flags must be a valid combination of SLJIT_SET_* and
-   SLJIT_CURRENT_FLAGS_* constants. */
-
-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_current_flags(struct sljit_compiler *compiler,
-	sljit_s32 current_flags);
-
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeARM_32.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeARM_32.c
index a1f16f5055..8175293d2b 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeARM_32.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeARM_32.c
@@ -101,6 +101,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
 #define BKPT		0xe1200070
 #define EOR		0xe0200000
 #define LDR		0xe5100000
+#define LDR_POST	0xe4100000
 #define MOV		0xe1a00000
 #define MUL		0xe0000090
 #define MVN		0xe1e00000
@@ -108,6 +109,8 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
 #define ORR		0xe1800000
 #define PUSH		0xe92d0000
 #define POP		0xe8bd0000
+#define RBIT		0xe6ff0f30
+#define REV		0xe6bf0f30
 #define RSB		0xe0600000
 #define RSC		0xe0e00000
 #define SBC		0xe0c00000
@@ -566,6 +569,7 @@ static SLJIT_INLINE void inline_set_jump_addr(sljit_uw jump_ptr, sljit_sw execut
 
 static sljit_uw get_imm(sljit_uw imm);
 static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 reg, sljit_uw imm);
+static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw, sljit_s32 tmp_reg);
 
 static SLJIT_INLINE void inline_set_const(sljit_uw addr, sljit_sw executable_offset, sljit_uw new_constant, sljit_s32 flush_cache)
 {
@@ -957,12 +961,22 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
 #endif
 
 	case SLJIT_HAS_CLZ:
+	case SLJIT_HAS_ROT:
 	case SLJIT_HAS_CMOV:
 #if (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7)
+	case SLJIT_HAS_CTZ:
+	case SLJIT_HAS_REV:
 	case SLJIT_HAS_PREFETCH:
 #endif
+	case SLJIT_HAS_COPY_F32:
+	case SLJIT_HAS_COPY_F64:
 		return 1;
 
+#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
+	case SLJIT_HAS_CTZ:
+		return 2;
+#endif
+
 	default:
 		return 0;
 	}
@@ -1224,6 +1238,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 
 	size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 1);
 
+	/* Doubles are saved, so alignment is unaffected. */
 	if ((size & SSIZE_OF(sw)) != 0 && (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG))
 		size += SSIZE_OF(sw);
 
@@ -1236,8 +1251,11 @@ static sljit_s32 emit_add_sp(struct sljit_compiler *compiler, sljit_uw imm)
 	sljit_uw imm2 = get_imm(imm);
 
 	if (imm2 == 0) {
-		FAIL_IF(load_immediate(compiler, TMP_REG2, imm));
-		imm2 = RM(TMP_REG2);
+		imm2 = (imm & ~(sljit_uw)0x3ff) >> 10;
+		imm = (imm & 0x3ff) >> 2;
+
+		FAIL_IF(push_inst(compiler, ADD | SRC2_IMM | RD(SLJIT_SP) | RN(SLJIT_SP) | 0xb00 | imm2));
+		return push_inst(compiler, ADD | SRC2_IMM | RD(SLJIT_SP) | RN(SLJIT_SP) | 0xf00 | (imm & 0xff));
 	}
 
 	return push_inst(compiler, ADD | RD(SLJIT_SP) | RN(SLJIT_SP) | imm2);
@@ -1246,11 +1264,11 @@ static sljit_s32 emit_add_sp(struct sljit_compiler *compiler, sljit_uw imm)
 static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 frame_size)
 {
 	sljit_s32 local_size, fscratches, fsaveds, i, tmp;
-	sljit_s32 saveds_restore_start = SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options);
+	sljit_s32 restored_reg = 0;
 	sljit_s32 lr_dst = TMP_PC;
-	sljit_uw reg_list;
+	sljit_uw reg_list = 0;
 
-	SLJIT_ASSERT(reg_map[TMP_REG2] == 14);
+	SLJIT_ASSERT(reg_map[TMP_REG2] == 14 && frame_size <= 128);
 
 	local_size = compiler->local_size;
 	fscratches = compiler->fscratches;
@@ -1275,47 +1293,84 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit
 	if (frame_size < 0) {
 		lr_dst = TMP_REG2;
 		frame_size = 0;
-	} else if (frame_size > 0)
+	} else if (frame_size > 0) {
+		SLJIT_ASSERT(frame_size == 1 || (frame_size & 0x7) == 0);
 		lr_dst = 0;
+		frame_size &= ~0x7;
+	}
 
-	reg_list = 0;
 	if (lr_dst != 0)
 		reg_list |= (sljit_uw)1 << reg_map[lr_dst];
 
 	tmp = SLJIT_S0 - compiler->saveds;
-	if (saveds_restore_start != tmp) {
-		for (i = saveds_restore_start; i > tmp; i--)
+	i = SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options);
+	if (tmp < i) {
+		restored_reg = i;
+		do {
 			reg_list |= (sljit_uw)1 << reg_map[i];
-	} else
-		saveds_restore_start = 0;
+		} while (--i > tmp);
+	}
 
-	for (i = compiler->scratches; i >= SLJIT_FIRST_SAVED_REG; i--)
-		reg_list |= (sljit_uw)1 << reg_map[i];
+	i = compiler->scratches;
+	if (i >= SLJIT_FIRST_SAVED_REG) {
+		restored_reg = i;
+		do {
+			reg_list |= (sljit_uw)1 << reg_map[i];
+		} while (--i >= SLJIT_FIRST_SAVED_REG);
+	}
+
+	if (lr_dst == TMP_REG2 && reg_list == 0) {
+		restored_reg = TMP_REG2;
+		lr_dst = 0;
+	}
 
 	if (lr_dst == 0 && (reg_list & (reg_list - 1)) == 0) {
 		/* The local_size does not include the saved registers. */
-		local_size += SSIZE_OF(sw);
+		tmp = 0;
+		if (reg_list != 0) {
+			tmp = 2;
+			if (local_size <= 0xfff) {
+				if (local_size == 0) {
+					SLJIT_ASSERT(restored_reg != TMP_REG2);
+					if (frame_size == 0)
+						return push_inst(compiler, LDR_POST | RN(SLJIT_SP) | RD(restored_reg) | 0x800008);
+					if (frame_size > 2 * SSIZE_OF(sw))
+						return push_inst(compiler, LDR_POST | RN(SLJIT_SP) | RD(restored_reg) | (sljit_uw)(frame_size - (2 * SSIZE_OF(sw))));
+				}
 
-		if (reg_list != 0)
-			local_size += SSIZE_OF(sw);
+				FAIL_IF(push_inst(compiler, LDR | 0x800000 | RN(SLJIT_SP) | RD(restored_reg) | (sljit_uw)local_size));
+				tmp = 1;
+			} else if (frame_size == 0) {
+				frame_size = (restored_reg == TMP_REG2) ? SSIZE_OF(sw) : 2 * SSIZE_OF(sw);
+				tmp = 3;
+			}
+
+			/* Place for the saved register. */
+			if (restored_reg != TMP_REG2)
+				local_size += SSIZE_OF(sw);
+		}
+
+		/* Place for the lr register. */
+		local_size += SSIZE_OF(sw);
 
 		if (frame_size > local_size)
 			FAIL_IF(push_inst(compiler, SUB | RD(SLJIT_SP) | RN(SLJIT_SP) | (1 << 25) | (sljit_uw)(frame_size - local_size)));
 		else if (frame_size < local_size)
 			FAIL_IF(emit_add_sp(compiler, (sljit_uw)(local_size - frame_size)));
 
-		if (reg_list == 0)
+		if (tmp <= 1)
 			return SLJIT_SUCCESS;
 
-		if (saveds_restore_start != 0) {
-			SLJIT_ASSERT(reg_list == ((sljit_uw)1 << reg_map[saveds_restore_start]));
-			lr_dst = saveds_restore_start;
-		} else {
-			SLJIT_ASSERT(reg_list == ((sljit_uw)1 << reg_map[SLJIT_FIRST_SAVED_REG]));
-			lr_dst = SLJIT_FIRST_SAVED_REG;
+		if (tmp == 2) {
+			frame_size -= SSIZE_OF(sw);
+			if (restored_reg != TMP_REG2)
+				frame_size -= SSIZE_OF(sw);
+
+			return push_inst(compiler, LDR | 0x800000 | RN(SLJIT_SP) | RD(restored_reg) | (sljit_uw)frame_size);
 		}
 
-		return push_inst(compiler, LDR | 0x800000 | RN(SLJIT_SP) | RD(lr_dst) | (sljit_uw)(frame_size - 2 * SSIZE_OF(sw)));
+		tmp = (restored_reg == TMP_REG2) ? 0x800004 : 0x800008;
+		return push_inst(compiler, LDR_POST | RN(SLJIT_SP) | RD(restored_reg) | (sljit_uw)tmp);
 	}
 
 	if (local_size > 0)
@@ -1328,13 +1383,18 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit
 		SLJIT_ASSERT(lr_dst != 0);
 		SLJIT_ASSERT(reg_list == (sljit_uw)1 << reg_map[lr_dst]);
 
-		return push_inst(compiler, 0xe49d0004 | RD(lr_dst));
+		return push_inst(compiler, LDR_POST | RN(SLJIT_SP) | RD(lr_dst) | 0x800004);
 	}
 
 	FAIL_IF(push_inst(compiler, POP | reg_list));
+
 	if (frame_size > 0)
 		return push_inst(compiler, SUB | RD(SLJIT_SP) | RN(SLJIT_SP) | (1 << 25) | ((sljit_uw)frame_size - sizeof(sljit_sw)));
-	return SLJIT_SUCCESS;
+
+	if (lr_dst != 0)
+		return SLJIT_SUCCESS;
+
+	return push_inst(compiler, ADD | RD(SLJIT_SP) | RN(SLJIT_SP) | (1 << 25) | sizeof(sljit_sw));
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler)
@@ -1345,28 +1405,38 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler
 	return emit_stack_frame_release(compiler, 0);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+
+	if (src & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, TMP_REG1, src, srcw, TMP_REG1));
+		src = TMP_REG1;
+		srcw = 0;
+	} else if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+		FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG1) | RM(src)));
+		src = TMP_REG1;
+		srcw = 0;
+	}
+
+	FAIL_IF(emit_stack_frame_release(compiler, 1));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */
 
-#define EMIT_SHIFT_INS_AND_RETURN(opcode) \
-	SLJIT_ASSERT(!(flags & INV_IMM) && !(src2 & SRC2_IMM)); \
-	if (compiler->shift_imm != 0x20) { \
-		SLJIT_ASSERT(src1 == TMP_REG1); \
-		SLJIT_ASSERT(!(flags & ARGS_SWAPPED)); \
-		\
-		if (compiler->shift_imm != 0) \
-			return push_inst(compiler, MOV | (flags & SET_FLAGS) | \
-				RD(dst) | (compiler->shift_imm << 7) | (opcode << 5) | RM(src2)); \
-		return push_inst(compiler, MOV | (flags & SET_FLAGS) | RD(dst) | RM(src2)); \
-	} \
-	return push_inst(compiler, MOV | (flags & SET_FLAGS) | RD(dst) \
-		| RM8((flags & ARGS_SWAPPED) ? src1 : src2) | (sljit_uw)(opcode << 5) \
-		| 0x10 | RM((flags & ARGS_SWAPPED) ? src2 : src1));
-
 static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
 	sljit_uw dst, sljit_uw src1, sljit_uw src2)
 {
+	sljit_s32 is_masked;
+	sljit_uw shift_type;
+
 	switch (GET_OPCODE(op)) {
 	case SLJIT_MOV:
 		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & ARGS_SWAPPED));
@@ -1414,18 +1484,38 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		}
 		return SLJIT_SUCCESS;
 
-	case SLJIT_NOT:
-		if (src2 & SRC2_IMM)
-			return push_inst(compiler, ((flags & INV_IMM) ? MOV : MVN) | (flags & SET_FLAGS) | RD(dst) | src2);
-
-		return push_inst(compiler, MVN | (flags & SET_FLAGS) | RD(dst) | RM(src2));
-
 	case SLJIT_CLZ:
-		SLJIT_ASSERT(!(flags & INV_IMM));
-		SLJIT_ASSERT(!(src2 & SRC2_IMM));
+		SLJIT_ASSERT(!(flags & INV_IMM) && !(src2 & SRC2_IMM));
 		FAIL_IF(push_inst(compiler, CLZ | RD(dst) | RM(src2)));
 		return SLJIT_SUCCESS;
 
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(!(flags & INV_IMM) && !(src2 & SRC2_IMM));
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & ARGS_SWAPPED));
+#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
+		FAIL_IF(push_inst(compiler, RSB | SRC2_IMM | RD(TMP_REG1) | RN(src2) | 0));
+		FAIL_IF(push_inst(compiler, AND | RD(TMP_REG2) | RN(src2) | RM(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, CLZ | RD(dst) | RM(TMP_REG2)));
+		FAIL_IF(push_inst(compiler, CMP | SET_FLAGS | SRC2_IMM | RN(dst) | 32));
+		return push_inst(compiler, (EOR ^ 0xf0000000) | SRC2_IMM | RD(dst) | RN(dst) | 0x1f);
+#else /* !SLJIT_CONFIG_ARM_V5 */
+		FAIL_IF(push_inst(compiler, RBIT | RD(dst) | RM(src2)));
+		return push_inst(compiler, CLZ | RD(dst) | RM(dst));
+#endif /* SLJIT_CONFIG_ARM_V5 */
+
+	case SLJIT_REV:
+#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
+		FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG1) | (8 << 7) | (0 << 5) | RM(src2)));
+		FAIL_IF(push_inst(compiler, MOV | RD(dst) | (24 << 7) | (1 << 5) | RM(src2)));
+		FAIL_IF(push_inst(compiler, ORR | RD(dst) | RN(dst) | (16 << 7) | (0 << 5) | RM(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG1) | (16 << 7) | (1 << 5) | RM(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG1) | (8 << 7) | (3 << 5) | RM(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, ORR | RD(dst) | RN(dst) | (8 << 7) | (0 << 5) | RM(TMP_REG1)));
+		return push_inst(compiler, ORR | RD(dst) | RN(dst) | (8 << 7) | (1 << 5) | RM(TMP_REG1));
+#else /* !SLJIT_CONFIG_ARM_V5 */
+		return push_inst(compiler, REV | RD(dst) | RM(src2));
+#endif /* SLJIT_CONFIG_ARM_V5 */
+
 	case SLJIT_ADD:
 		SLJIT_ASSERT(!(flags & INV_IMM));
 
@@ -1475,21 +1565,68 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		return push_inst(compiler, ORR | (flags & SET_FLAGS) | RD(dst) | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));
 
 	case SLJIT_XOR:
-		SLJIT_ASSERT(!(flags & INV_IMM));
+		if (flags & INV_IMM) {
+			SLJIT_ASSERT(src2 == SRC2_IMM);
+			return push_inst(compiler, MVN | (flags & SET_FLAGS) | RD(dst) | RM(src1));
+		}
 		return push_inst(compiler, EOR | (flags & SET_FLAGS) | RD(dst) | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));
 
 	case SLJIT_SHL:
-		EMIT_SHIFT_INS_AND_RETURN(0);
+	case SLJIT_MSHL:
+		shift_type = 0;
+		is_masked = GET_OPCODE(op) == SLJIT_MSHL;
+		break;
 
 	case SLJIT_LSHR:
-		EMIT_SHIFT_INS_AND_RETURN(1);
+	case SLJIT_MLSHR:
+		shift_type = 1;
+		is_masked = GET_OPCODE(op) == SLJIT_MLSHR;
+		break;
 
 	case SLJIT_ASHR:
-		EMIT_SHIFT_INS_AND_RETURN(2);
+	case SLJIT_MASHR:
+		shift_type = 2;
+		is_masked = GET_OPCODE(op) == SLJIT_MASHR;
+		break;
+
+	case SLJIT_ROTL:
+		if (compiler->shift_imm == 0x20) {
+			FAIL_IF(push_inst(compiler, RSB | SRC2_IMM | RD(TMP_REG2) | RN(src2) | 0));
+			src2 = TMP_REG2;
+		} else
+			compiler->shift_imm = (sljit_uw)(-(sljit_sw)compiler->shift_imm) & 0x1f;
+		/* fallthrough */
+
+	case SLJIT_ROTR:
+		shift_type = 3;
+		is_masked = 0;
+		break;
+
+	default:
+		SLJIT_UNREACHABLE();
+		return SLJIT_SUCCESS;
 	}
 
-	SLJIT_UNREACHABLE();
-	return SLJIT_SUCCESS;
+	SLJIT_ASSERT(!(flags & ARGS_SWAPPED) && !(flags & INV_IMM) && !(src2 & SRC2_IMM));
+
+	if (compiler->shift_imm != 0x20) {
+		SLJIT_ASSERT(src1 == TMP_REG1);
+
+		if (compiler->shift_imm != 0)
+			return push_inst(compiler, MOV | (flags & SET_FLAGS) |
+				RD(dst) | (compiler->shift_imm << 7) | (shift_type << 5) | RM(src2));
+		return push_inst(compiler, MOV | (flags & SET_FLAGS) | RD(dst) | RM(src2));
+	}
+
+	SLJIT_ASSERT(src1 != TMP_REG2);
+
+	if (is_masked) {
+		FAIL_IF(push_inst(compiler, AND | RD(TMP_REG2) | RN(src2) | SRC2_IMM | 0x1f));
+		src2 = TMP_REG2;
+	}
+
+	return push_inst(compiler, MOV | (flags & SET_FLAGS) | RD(dst)
+		| RM8(src2) | (sljit_uw)(shift_type << 5) | 0x10 | RM(src1));
 }
 
 #undef EMIT_SHIFT_INS_AND_RETURN
@@ -1678,7 +1815,7 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 reg,
 #endif
 }
 
-static SLJIT_INLINE sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg,
+static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg,
 	sljit_s32 arg, sljit_sw argw, sljit_s32 tmp_reg)
 {
 	sljit_uw imm, offset_reg, tmp;
@@ -2033,10 +2170,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 	case SLJIT_MOV_S16:
 		return emit_op(compiler, SLJIT_MOV_S16, ALLOW_ANY_IMM | SIGNED | HALF_SIZE, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);
 
-	case SLJIT_NOT:
-		return emit_op(compiler, op, ALLOW_ANY_IMM, dst, dstw, TMP_REG1, 0, src, srcw);
-
 	case SLJIT_CLZ:
+	case SLJIT_CTZ:
+	case SLJIT_REV:
 		return emit_op(compiler, op, 0, dst, dstw, TMP_REG1, 0, src, srcw);
 	}
 
@@ -2048,6 +2184,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 	sljit_s32 src1, sljit_sw src1w,
 	sljit_s32 src2, sljit_sw src2w)
 {
+	sljit_s32 inp_flags;
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op2(compiler, op, 0, dst, dstw, src1, src1w, src2, src2w));
 	ADJUST_LOCAL_OFFSET(dst, dstw);
@@ -2062,9 +2200,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 		return emit_op(compiler, op, ALLOW_IMM | ALLOW_NEG_IMM, dst, dstw, src1, src1w, src2, src2w);
 
 	case SLJIT_OR:
-	case SLJIT_XOR:
 		return emit_op(compiler, op, ALLOW_IMM, dst, dstw, src1, src1w, src2, src2w);
 
+	case SLJIT_XOR:
+		inp_flags = ALLOW_IMM;
+		if (((src1 & SLJIT_IMM) && src1w == -1) || ((src2 & SLJIT_IMM) && src2w == -1)) {
+			inp_flags |= ALLOW_INV_IMM;
+		}
+		return emit_op(compiler, op, inp_flags, dst, dstw, src1, src1w, src2, src2w);
+
 	case SLJIT_MUL:
 		return emit_op(compiler, op, 0, dst, dstw, src1, src1w, src2, src2w);
 
@@ -2072,13 +2216,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 		return emit_op(compiler, op, ALLOW_ANY_IMM, dst, dstw, src1, src1w, src2, src2w);
 
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
 		if (src2 & SLJIT_IMM) {
 			compiler->shift_imm = src2w & 0x1f;
 			return emit_op(compiler, op, 0, dst, dstw, TMP_REG1, 0, src1, src1w);
-		}
-		else {
+		} else {
 			compiler->shift_imm = 0x20;
 			return emit_op(compiler, op, 0, dst, dstw, src1, src1w, src2, src2w);
 		}
@@ -2098,6 +2246,55 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil
 	return sljit_emit_op2(compiler, op, TMP_REG2, 0, src1, src1w, src2, src2w);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst_reg,
+	sljit_s32 src1_reg,
+	sljit_s32 src2_reg,
+	sljit_s32 src3, sljit_sw src3w)
+{ /* Emits dst_reg = (src1_reg shifted by src3) OR (src2_reg shifted the opposite way); src3 may be an immediate, a register or memory. */
+	sljit_s32 is_left;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, dst_reg, src1_reg, src2_reg, src3, src3w));
+
+	op = GET_OPCODE(op);
+	is_left = (op == SLJIT_SHL || op == SLJIT_MSHL);
+
+	if (src1_reg == src2_reg) { /* Both sources are the same register: delegate to a rotate. */
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_op2(compiler, is_left ? SLJIT_ROTL : SLJIT_ROTR, dst_reg, 0, src1_reg, 0, src3, src3w);
+	}
+
+	ADJUST_LOCAL_OFFSET(src3, src3w);
+
+	/* The shift type field starts at bit 5; the code below puts 0 there for a left shift and 1 for a right shift. Shift type of ROR is 3. */
+	if (src3 & SLJIT_IMM) {
+		src3w &= 0x1f; /* Only the low five bits of an immediate count are used. */
+
+		if (src3w == 0) /* A zero count emits no instruction at all. */
+			return SLJIT_SUCCESS;
+
+		FAIL_IF(push_inst(compiler, MOV | RD(dst_reg) | RM(src1_reg) | ((sljit_uw)(is_left ? 0 : 1) << 5) | ((sljit_uw)src3w << 7))); /* dst_reg = src1_reg shifted by src3w. */
+		src3w = (src3w ^ 0x1f) + 1; /* Equals 32 - src3w for src3w in 1..31. */
+		return push_inst(compiler, ORR | RD(dst_reg) | RN(dst_reg) | RM(src2_reg) | ((sljit_uw)(is_left ? 1 : 0) << 5) | ((sljit_uw)src3w << 7)); /* dst_reg |= src2_reg shifted the other way by the complementary count. */
+	}
+
+	if (src3 & SLJIT_MEM) { /* Fetch a memory shift count into TMP_REG2 first. */
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, TMP_REG2, src3, src3w, TMP_REG2));
+		src3 = TMP_REG2;
+	}
+
+	if (op == SLJIT_MSHL || op == SLJIT_MLSHR || dst_reg == src3) { /* Masked opcodes, or a count that lives in dst_reg (overwritten below), use a masked copy. */
+		FAIL_IF(push_inst(compiler, AND | SRC2_IMM | RD(TMP_REG2) | RN(src3) | 0x1f)); /* TMP_REG2 = src3 & 0x1f. */
+		src3 = TMP_REG2;
+	}
+
+	FAIL_IF(push_inst(compiler, MOV | RD(dst_reg) | RM8(src3) | ((sljit_uw)(is_left ? 0 : 1) << 5) | 0x10 | RM(src1_reg))); /* dst_reg = src1_reg shifted by the register count (0x10 presumably selects the register-shift form; confirm). */
+	FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG1) | RM(src2_reg) | ((sljit_uw)(is_left ? 1 : 0) << 5) | (1 << 7))); /* TMP_REG1 = src2_reg shifted by one in the opposite direction. */
+	FAIL_IF(push_inst(compiler, EOR | SRC2_IMM | RD(TMP_REG2) | RN(src3) | 0x1f)); /* TMP_REG2 = src3 ^ 0x1f. */
+	return push_inst(compiler, ORR | RD(dst_reg) | RN(dst_reg) | RM8(TMP_REG2) | ((sljit_uw)(is_left ? 1 : 0) << 5) | 0x10 | RM(TMP_REG1)); /* dst_reg |= TMP_REG1 shifted by TMP_REG2; with the pre-shift the total is (src3 ^ 0x1f) + 1. */
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -2132,6 +2329,46 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *comp
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_dst(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw)
+{ /* Handles SLJIT_FAST_ENTER and SLJIT_GET_RETURN_ADDRESS: the value is produced in a register and stored afterwards when dst is memory. */
+	sljit_s32 size, dst_r;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op_dst(compiler, op, dst, dstw));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	switch (op) {
+	case SLJIT_FAST_ENTER:
+		SLJIT_ASSERT(reg_map[TMP_REG2] == 14); /* TMP_REG2 must map to machine register 14 (the ARM link register). */
+
+		if (FAST_IS_REG(dst)) /* Register destination: a single move from TMP_REG2. */
+			return push_inst(compiler, MOV | RD(dst) | RM(TMP_REG2));
+		break; /* Memory destination: TMP_REG2 is stored by the common tail below. */
+	case SLJIT_GET_RETURN_ADDRESS:
+		size = GET_SAVED_REGISTERS_SIZE(compiler->scratches, compiler->saveds - SLJIT_KEPT_SAVEDS_COUNT(compiler->options), 0);
+
+		if (compiler->fsaveds > 0 || compiler->fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) {
+			/* The size of pc is not added above, so one extra word is added when the SSIZE_OF(sw) bit of size is clear (presumably alignment padding; confirm against sljit_emit_enter). */
+			if ((size & SSIZE_OF(sw)) == 0)
+				size += SSIZE_OF(sw);
+
+			size += GET_SAVED_FLOAT_REGISTERS_SIZE(compiler->fscratches, compiler->fsaveds, f64);
+		}
+
+		SLJIT_ASSERT(((compiler->local_size + size + SSIZE_OF(sw)) & 0x7) == 0); /* Locals + saved area + one more word must be a multiple of 8. */
+
+		dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2; /* Load straight into dst when it is a register, otherwise via TMP_REG2. */
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, dst_r, SLJIT_MEM1(SLJIT_SP), compiler->local_size + size, TMP_REG1)); /* Word load from SP + local_size + size. */
+		break;
+	}
+
+	if (dst & SLJIT_MEM) /* No LOAD_DATA flag here: TMP_REG2 is written to the memory destination. */
+		return emit_op_mem(compiler, WORD_SIZE, TMP_REG2, dst, dstw, TMP_REG1);
+
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
@@ -2372,23 +2609,31 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compil
 
 #undef EMIT_FPU_DATA_TRANSFER
 
-/* --------------------------------------------------------------------- */
-/*  Other instructions                                                   */
-/* --------------------------------------------------------------------- */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg) /* Emits one VMOV/VMOV2 between float register freg and reg, where reg may be a register pair. */
 {
+	sljit_s32 reg2;
+	sljit_uw inst;
+
 	CHECK_ERROR();
-	CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
+	CHECK(check_sljit_emit_fcopy(compiler, op, freg, reg));
 
-	SLJIT_ASSERT(reg_map[TMP_REG2] == 14);
+	if (reg & REG_PAIR_MASK) { /* Register pair: split it and use the two-register VMOV2 form. */
+		reg2 = REG_PAIR_SECOND(reg);
+		reg = REG_PAIR_FIRST(reg);
 
-	if (FAST_IS_REG(dst))
-		return push_inst(compiler, MOV | RD(dst) | RM(TMP_REG2));
+		inst = VMOV2 | RN(reg) | RD(reg2) | VM(freg);
+	} else { /* Single general purpose register. */
+		inst = VMOV | VN(freg) | RD(reg);
 
-	/* Memory. */
-	return emit_op_mem(compiler, WORD_SIZE, TMP_REG2, dst, dstw, TMP_REG1);
+		if (!(op & SLJIT_32)) /* Non-SLJIT_32 variant sets bit 7 (presumably selects the other 32 bit half of the double; confirm against the VMOV encoding). */
+			inst |= 1 << 7;
+	}
+
+	if (GET_OPCODE(op) == SLJIT_COPY_FROM_F64) /* Bit 20 is set only when copying from the float register. */
+		inst |= 1 << 20;
+
+	return push_inst(compiler, inst);
 }
 
 /* --------------------------------------------------------------------- */
@@ -2834,7 +3079,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 		src = TMP_REG1;
 	}
 
-	if ((type & SLJIT_CALL_RETURN) && (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0)) {
+	if ((type & SLJIT_CALL_RETURN) && (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options)))) {
 		FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG1) | RM(src)));
 		src = TMP_REG1;
 	}
@@ -2880,6 +3125,33 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 	return sljit_emit_ijump(compiler, type, src, srcw);
 }
 
+#ifdef __SOFTFP__
+
+static SLJIT_INLINE sljit_s32 emit_fmov_before_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
+{ /* Soft-float builds only (__SOFTFP__): moves the float value in src to the place it is returned from. */
+	if (compiler->options & SLJIT_ENTER_REG_ARG) { /* With SLJIT_ENTER_REG_ARG the value goes to SLJIT_RETURN_FREG. */
+		if (src == SLJIT_FR0) /* Nothing is emitted when src is SLJIT_FR0. */
+			return SLJIT_SUCCESS;
+
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_fop1(compiler, op, SLJIT_RETURN_FREG, 0, src, srcw);
+	}
+
+	if (FAST_IS_REG(src)) { /* Register source: copy it into SLJIT_R0 (32 bit) or the SLJIT_R0/SLJIT_R1 pair. */
+		if (op & SLJIT_32)
+			return push_inst(compiler, VMOV | (1 << 20) | RD(SLJIT_R0) | VN(src));
+		return push_inst(compiler, VMOV2 | (1 << 20) | RD(SLJIT_R0) | RN(SLJIT_R1) | VM(src));
+	}
+
+	SLJIT_SKIP_CHECKS(compiler); /* Non-register source: use the integer move helpers below. */
+
+	if (op & SLJIT_32) /* 32 bit value: one word move into SLJIT_R0. */
+		return sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, src, srcw);
+	return sljit_emit_mem(compiler, SLJIT_MOV, SLJIT_REG_PAIR(SLJIT_R0, SLJIT_R1), src, srcw); /* Otherwise a register pair move into SLJIT_R0/SLJIT_R1. */
+}
+
+#endif /* __SOFTFP__ */
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 type)
@@ -2930,9 +3202,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
 
-	dst_reg &= ~SLJIT_32;
-
-	cc = get_cc(compiler, type);
+	cc = get_cc(compiler, type & ~SLJIT_32);
 
 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
 		tmp = get_imm((sljit_uw)srcw);
@@ -3058,7 +3328,7 @@ static sljit_s32 sljit_emit_mem_unaligned(struct sljit_compiler *compiler, sljit
 		break;
 
 	default:
-		if (type & SLJIT_MEM_ALIGNED_32) {
+		if (type & SLJIT_MEM_UNALIGNED_32) {
 			flags = WORD_SIZE;
 			if (!(type & SLJIT_MEM_STORE))
 				flags |= LOAD_DATA;
@@ -3066,7 +3336,7 @@ static sljit_s32 sljit_emit_mem_unaligned(struct sljit_compiler *compiler, sljit
 			return emit_op_mem(compiler, flags, reg, mem, memw, TMP_REG1);
 		}
 
-		if (!(type & SLJIT_MEM_ALIGNED_16)) {
+		if (!(type & SLJIT_MEM_UNALIGNED_16)) {
 			FAIL_IF(update_mem_addr(compiler, &mem, &memw, 0xfff - 3));
 			flags = BYTE_SIZE;
 			steps = 3;
@@ -3182,46 +3452,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 	sljit_s32 mem, sljit_sw memw)
 {
 	sljit_s32 flags;
-	sljit_uw is_type1_transfer, inst;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
 
-	if (reg & REG_PAIR_MASK) {
-		ADJUST_LOCAL_OFFSET(mem, memw);
-
-#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
-		if ((type & SLJIT_MEM_UNALIGNED) && !(type & SLJIT_MEM_ALIGNED_32)) {
-			FAIL_IF(update_mem_addr(compiler, &mem, &memw, (type & SLJIT_MEM_ALIGNED_16) ? 0xfff - 6 : 0xfff - 7));
-
-			if (!(type & SLJIT_MEM_STORE) && REG_PAIR_FIRST(reg) == (mem & REG_MASK)) {
-				FAIL_IF(sljit_emit_mem_unaligned(compiler, type, REG_PAIR_SECOND(reg), SLJIT_MEM1(mem), memw + SSIZE_OF(sw)));
-				return sljit_emit_mem_unaligned(compiler, type, REG_PAIR_FIRST(reg), SLJIT_MEM1(mem), memw);
-			}
-
-			FAIL_IF(sljit_emit_mem_unaligned(compiler, type, REG_PAIR_FIRST(reg), SLJIT_MEM1(mem), memw));
-			return sljit_emit_mem_unaligned(compiler, type, REG_PAIR_SECOND(reg), SLJIT_MEM1(mem), memw + SSIZE_OF(sw));
-		}
-#endif /* SLJIT_CONFIG_ARM_V5 */
-
-		FAIL_IF(update_mem_addr(compiler, &mem, &memw, 0xfff - 4));
-
-		flags = WORD_SIZE;
-
-		if (!(type & SLJIT_MEM_STORE)) {
-			if (REG_PAIR_FIRST(reg) == (mem & REG_MASK)) {
-				FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, REG_PAIR_SECOND(reg), SLJIT_MEM1(mem), memw + SSIZE_OF(sw), TMP_REG1));
-				return emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, REG_PAIR_FIRST(reg), SLJIT_MEM1(mem), memw, TMP_REG1);
-			}
-
-			flags = WORD_SIZE | LOAD_DATA;
-		}
-
-		FAIL_IF(emit_op_mem(compiler, flags, REG_PAIR_FIRST(reg), SLJIT_MEM1(mem), memw, TMP_REG1));
-		return emit_op_mem(compiler, flags, REG_PAIR_SECOND(reg), SLJIT_MEM1(mem), memw + SSIZE_OF(sw), TMP_REG1);
-	}
-
-	if (type & SLJIT_MEM_UNALIGNED) {
+	if (!(reg & REG_PAIR_MASK)) {
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
 		ADJUST_LOCAL_OFFSET(mem, memw);
 #endif /* SLJIT_CONFIG_ARM_V5 */
@@ -3229,6 +3464,49 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
 	}
 
+	ADJUST_LOCAL_OFFSET(mem, memw);
+
+#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
+	if (type & (SLJIT_MEM_UNALIGNED | SLJIT_MEM_UNALIGNED_16)) {
+		FAIL_IF(update_mem_addr(compiler, &mem, &memw, (type & SLJIT_MEM_UNALIGNED_16) ? 0xfff - 6 : 0xfff - 7));
+
+		if (!(type & SLJIT_MEM_STORE) && REG_PAIR_FIRST(reg) == (mem & REG_MASK)) {
+			FAIL_IF(sljit_emit_mem_unaligned(compiler, type, REG_PAIR_SECOND(reg), SLJIT_MEM1(mem), memw + SSIZE_OF(sw)));
+			return sljit_emit_mem_unaligned(compiler, type, REG_PAIR_FIRST(reg), SLJIT_MEM1(mem), memw);
+		}
+
+		FAIL_IF(sljit_emit_mem_unaligned(compiler, type, REG_PAIR_FIRST(reg), SLJIT_MEM1(mem), memw));
+		return sljit_emit_mem_unaligned(compiler, type, REG_PAIR_SECOND(reg), SLJIT_MEM1(mem), memw + SSIZE_OF(sw));
+	}
+#endif /* SLJIT_CONFIG_ARM_V5 */
+
+	FAIL_IF(update_mem_addr(compiler, &mem, &memw, 0xfff - 4));
+
+	flags = WORD_SIZE;
+
+	if (!(type & SLJIT_MEM_STORE)) {
+		if (REG_PAIR_FIRST(reg) == (mem & REG_MASK)) {
+			FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, REG_PAIR_SECOND(reg), SLJIT_MEM1(mem), memw + SSIZE_OF(sw), TMP_REG1));
+			return emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, REG_PAIR_FIRST(reg), SLJIT_MEM1(mem), memw, TMP_REG1);
+		}
+
+		flags = WORD_SIZE | LOAD_DATA;
+	}
+
+	FAIL_IF(emit_op_mem(compiler, flags, REG_PAIR_FIRST(reg), SLJIT_MEM1(mem), memw, TMP_REG1));
+	return emit_op_mem(compiler, flags, REG_PAIR_SECOND(reg), SLJIT_MEM1(mem), memw + SSIZE_OF(sw), TMP_REG1);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_s32 flags;
+	sljit_uw is_type1_transfer, inst;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem_update(compiler, type, reg, mem, memw));
+
 	is_type1_transfer = 1;
 
 	switch (type & 0xff) {
@@ -3269,16 +3547,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 	if (SLJIT_UNLIKELY(mem & OFFS_REG_MASK)) {
 		if (!is_type1_transfer && memw != 0)
 			return SLJIT_ERR_UNSUPPORTED;
-	}
-	else {
+	} else {
 		if (is_type1_transfer) {
 			if (memw > 4095 || memw < -4095)
 				return SLJIT_ERR_UNSUPPORTED;
-		}
-		else {
-			if (memw > 255 || memw < -255)
-				return SLJIT_ERR_UNSUPPORTED;
-		}
+		} else if (memw > 255 || memw < -255)
+			return SLJIT_ERR_UNSUPPORTED;
 	}
 
 	if (type & SLJIT_MEM_SUPP)
@@ -3292,20 +3566,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 		if (is_type1_transfer)
 			inst |= (1 << 25);
 
-		if (type & SLJIT_MEM_PRE)
-			inst |= (1 << 21);
-		else
+		if (type & SLJIT_MEM_POST)
 			inst ^= (1 << 24);
+		else
+			inst |= (1 << 21);
 
 		return push_inst(compiler, inst);
 	}
 
 	inst = EMIT_DATA_TRANSFER(flags, 0, reg, mem & REG_MASK, 0);
 
-	if (type & SLJIT_MEM_PRE)
-		inst |= (1 << 21);
-	else
+	if (type & SLJIT_MEM_POST)
 		inst ^= (1 << 24);
+	else
+		inst |= (1 << 21);
 
 	if (is_type1_transfer) {
 		if (memw >= 0)
@@ -3336,10 +3610,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compil
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
 
-	if (type & (SLJIT_MEM_PRE | SLJIT_MEM_POST))
-		return SLJIT_ERR_UNSUPPORTED;
-
-	if (type & SLJIT_MEM_ALIGNED_32)
+	if (type & SLJIT_MEM_UNALIGNED_32)
 		return emit_fop_mem(compiler, ((type ^ SLJIT_32) & SLJIT_32) | ((type & SLJIT_MEM_STORE) ? 0 : FPU_LOAD), freg, mem, memw);
 
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
@@ -3347,23 +3618,23 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compil
 		FAIL_IF(push_inst(compiler, VMOV | (1 << 20) | VN(freg) | RD(TMP_REG2)));
 
 		if (type & SLJIT_32)
-			return sljit_emit_mem_unaligned(compiler, SLJIT_MOV | SLJIT_MEM_STORE | (type & SLJIT_MEM_ALIGNED_16), TMP_REG2, mem, memw);
+			return sljit_emit_mem_unaligned(compiler, SLJIT_MOV | SLJIT_MEM_STORE | (type & SLJIT_MEM_UNALIGNED_16), TMP_REG2, mem, memw);
 
 		max_offset = 0xfff - 7;
-		if (type & SLJIT_MEM_ALIGNED_16)
+		if (type & SLJIT_MEM_UNALIGNED_16)
 			max_offset++;
 
 		FAIL_IF(update_mem_addr(compiler, &mem, &memw, max_offset));
 		mem |= SLJIT_MEM;
 
-		FAIL_IF(sljit_emit_mem_unaligned(compiler, SLJIT_MOV | SLJIT_MEM_STORE | (type & SLJIT_MEM_ALIGNED_16), TMP_REG2, mem, memw));
+		FAIL_IF(sljit_emit_mem_unaligned(compiler, SLJIT_MOV | SLJIT_MEM_STORE | (type & SLJIT_MEM_UNALIGNED_16), TMP_REG2, mem, memw));
 
 		FAIL_IF(push_inst(compiler, VMOV | (1 << 20) | VN(freg) | 0x80 | RD(TMP_REG2)));
-		return sljit_emit_mem_unaligned(compiler, SLJIT_MOV | SLJIT_MEM_STORE | (type & SLJIT_MEM_ALIGNED_16), TMP_REG2, mem, memw + 4);
+		return sljit_emit_mem_unaligned(compiler, SLJIT_MOV | SLJIT_MEM_STORE | (type & SLJIT_MEM_UNALIGNED_16), TMP_REG2, mem, memw + 4);
 	}
 
 	max_offset = (type & SLJIT_32) ? 0xfff - 3 : 0xfff - 7;
-	if (type & SLJIT_MEM_ALIGNED_16)
+	if (type & SLJIT_MEM_UNALIGNED_16)
 		max_offset++;
 
 	FAIL_IF(update_mem_addr(compiler, &mem, &memw, max_offset));
@@ -3382,11 +3653,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compil
 
 	mem |= SLJIT_MEM;
 
-	FAIL_IF(sljit_emit_mem_unaligned(compiler, SLJIT_MOV | (type & SLJIT_MEM_ALIGNED_16), dst, mem, memw));
+	FAIL_IF(sljit_emit_mem_unaligned(compiler, SLJIT_MOV | (type & SLJIT_MEM_UNALIGNED_16), dst, mem, memw));
 	FAIL_IF(push_inst(compiler, VMOV | VN(freg) | RD(dst)));
 
 	if (!(type & SLJIT_32)) {
-		FAIL_IF(sljit_emit_mem_unaligned(compiler, SLJIT_MOV | (type & SLJIT_MEM_ALIGNED_16), dst, mem, memw + 4));
+		FAIL_IF(sljit_emit_mem_unaligned(compiler, SLJIT_MOV | (type & SLJIT_MEM_UNALIGNED_16), dst, mem, memw + 4));
 		FAIL_IF(push_inst(compiler, VMOV | VN(freg) | 0x80 | RD(dst)));
 	}
 
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeARM_64.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeARM_64.c
index 1fbdae572a..c3215742f4 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeARM_64.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeARM_64.c
@@ -86,6 +86,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
 #define CSINC 0x9a800400
 #define EOR 0xca000000
 #define EORI 0xd2000000
+#define EXTR 0x93c00000
 #define FABS 0x1e60c000
 #define FADD 0x1e602800
 #define FCMP 0x1e602000
@@ -93,11 +94,13 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
 #define FCVTZS 0x9e780000
 #define FDIV 0x1e601800
 #define FMOV 0x1e604000
+#define FMOV_R 0x9e660000
 #define FMUL 0x1e600800
 #define FNEG 0x1e614000
 #define FSUB 0x1e603800
 #define LDRI 0xf9400000
 #define LDRI_F64 0xfd400000
+#define LDRI_POST 0xf8400400
 #define LDP 0xa9400000
 #define LDP_F64 0x6d400000
 #define LDP_POST 0xa8c00000
@@ -112,7 +115,10 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
 #define ORN 0xaa200000
 #define ORR 0xaa000000
 #define ORRI 0xb2000000
+#define RBIT 0xdac00000
 #define RET 0xd65f0000
+#define REV 0xdac00c00
+#define RORV 0x9ac02c00
 #define SBC 0xda000000
 #define SBFM 0x93000000
 #define SCVTF 0x9e620000
@@ -389,8 +395,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
 #endif
 
 	case SLJIT_HAS_CLZ:
+	case SLJIT_HAS_CTZ:
+	case SLJIT_HAS_REV:
+	case SLJIT_HAS_ROT:
 	case SLJIT_HAS_CMOV:
 	case SLJIT_HAS_PREFETCH:
+	case SLJIT_HAS_COPY_F32:
+	case SLJIT_HAS_COPY_F64:
 		return 1;
 
 	default:
@@ -629,6 +640,8 @@ static sljit_s32 emit_op_imm(struct sljit_compiler *compiler, sljit_s32 flags, s
 		switch (op) {
 		case SLJIT_MUL:
 		case SLJIT_CLZ:
+		case SLJIT_CTZ:
+		case SLJIT_REV:
 		case SLJIT_ADDC:
 		case SLJIT_SUBC:
 			/* No form with immediate operand (except imm 0, which
@@ -637,10 +650,6 @@ static sljit_s32 emit_op_imm(struct sljit_compiler *compiler, sljit_s32 flags, s
 		case SLJIT_MOV:
 			SLJIT_ASSERT(!(flags & SET_FLAGS) && (flags & ARG2_IMM) && arg1 == TMP_REG1);
 			return load_immediate(compiler, dst, imm);
-		case SLJIT_NOT:
-			SLJIT_ASSERT(flags & ARG2_IMM);
-			FAIL_IF(load_immediate(compiler, dst, (flags & INT_OP) ? (~imm & 0xffffffff) : ~imm));
-			goto set_flags;
 		case SLJIT_SUB:
 			compiler->status_flags_state = SLJIT_CURRENT_FLAGS_SUB;
 			if (flags & ARG1_IMM)
@@ -687,8 +696,13 @@ static sljit_s32 emit_op_imm(struct sljit_compiler *compiler, sljit_s32 flags, s
 				break;
 			CHECK_FLAGS(3 << 29);
 			return push_inst(compiler, (ANDI ^ inv_bits) | RD(dst) | RN(reg) | inst_bits);
-		case SLJIT_OR:
 		case SLJIT_XOR:
+			if (imm == -1) {
+				FAIL_IF(push_inst(compiler, (ORN ^ inv_bits) | RD(dst) | RN(TMP_ZERO) | RM(reg)));
+				goto set_flags;
+			}
+			/* fallthrough */
+		case SLJIT_OR:
 			inst_bits = logical_imm(imm, LOGICAL_IMM_CHECK | ((flags & INT_OP) ? 16 : 32));
 			if (!inst_bits)
 				break;
@@ -699,36 +713,50 @@ static sljit_s32 emit_op_imm(struct sljit_compiler *compiler, sljit_s32 flags, s
 			FAIL_IF(push_inst(compiler, (inst_bits ^ inv_bits) | RD(dst) | RN(reg)));
 			goto set_flags;
 		case SLJIT_SHL:
+		case SLJIT_MSHL:
 			if (flags & ARG1_IMM)
 				break;
+
 			if (flags & INT_OP) {
 				imm &= 0x1f;
-				FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(dst) | RN(arg1)
-					| (((sljit_ins)-imm & 0x1f) << 16) | ((31 - (sljit_ins)imm) << 10)));
-			}
-			else {
+				inst_bits = (((sljit_ins)-imm & 0x1f) << 16) | ((31 - (sljit_ins)imm) << 10);
+			} else {
 				imm &= 0x3f;
-				FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(dst) | RN(arg1) | (1 << 22)
-					| (((sljit_ins)-imm & 0x3f) << 16) | ((63 - (sljit_ins)imm) << 10)));
+				inst_bits = ((sljit_ins)1 << 22) | (((sljit_ins)-imm & 0x3f) << 16) | ((63 - (sljit_ins)imm) << 10);
 			}
+
+			FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(dst) | RN(arg1) | inst_bits));
 			goto set_flags;
 		case SLJIT_LSHR:
+		case SLJIT_MLSHR:
 		case SLJIT_ASHR:
+		case SLJIT_MASHR:
 			if (flags & ARG1_IMM)
 				break;
-			if (op == SLJIT_ASHR)
+
+			if (op >= SLJIT_ASHR)
 				inv_bits |= 1 << 30;
+
 			if (flags & INT_OP) {
 				imm &= 0x1f;
-				FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(dst) | RN(arg1)
-					| ((sljit_ins)imm << 16) | (31 << 10)));
-			}
-			else {
+				inst_bits = ((sljit_ins)imm << 16) | (31 << 10);
+			} else {
 				imm &= 0x3f;
-				FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(dst) | RN(arg1)
-					| (1 << 22) | ((sljit_ins)imm << 16) | (63 << 10)));
+				inst_bits = ((sljit_ins)1 << 22) | ((sljit_ins)imm << 16) | (63 << 10);
 			}
+
+			FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(dst) | RN(arg1) | inst_bits));
 			goto set_flags;
+		case SLJIT_ROTL:
+		case SLJIT_ROTR:
+			if (flags & ARG1_IMM)
+				break;
+
+			if (op == SLJIT_ROTL)
+				imm = -imm;
+
+			imm &= (flags & INT_OP) ? 0x1f : 0x3f;
+			return push_inst(compiler, (EXTR ^ (inv_bits | (inv_bits >> 9))) | RD(dst) | RN(arg1) | RM(arg1) | ((sljit_ins)imm << 10));
 		default:
 			SLJIT_UNREACHABLE();
 			break;
@@ -787,13 +815,17 @@ static sljit_s32 emit_op_imm(struct sljit_compiler *compiler, sljit_s32 flags, s
 	case SLJIT_MOV_S32:
 		SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
 		return push_inst(compiler, SBFM | (1 << 22) | RD(dst) | RN(arg2) | (31 << 10));
-	case SLJIT_NOT:
-		SLJIT_ASSERT(arg1 == TMP_REG1);
-		FAIL_IF(push_inst(compiler, (ORN ^ inv_bits) | RD(dst) | RN(TMP_ZERO) | RM(arg2)));
-		break; /* Set flags. */
 	case SLJIT_CLZ:
 		SLJIT_ASSERT(arg1 == TMP_REG1);
 		return push_inst(compiler, (CLZ ^ inv_bits) | RD(dst) | RN(arg2));
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(arg1 == TMP_REG1);
+		FAIL_IF(push_inst(compiler, (RBIT ^ inv_bits) | RD(dst) | RN(arg2)));
+		return push_inst(compiler, (CLZ ^ inv_bits) | RD(dst) | RN(dst));
+	case SLJIT_REV:
+		SLJIT_ASSERT(arg1 == TMP_REG1);
+		inv_bits |= inv_bits >> 21;
+		return push_inst(compiler, (REV ^ inv_bits) | RD(dst) | RN(arg2));
 	case SLJIT_ADD:
 		compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD;
 		CHECK_FLAGS(1 << 29);
@@ -832,14 +864,23 @@ static sljit_s32 emit_op_imm(struct sljit_compiler *compiler, sljit_s32 flags, s
 		FAIL_IF(push_inst(compiler, (EOR ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2)));
 		break; /* Set flags. */
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 		FAIL_IF(push_inst(compiler, (LSLV ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2)));
 		break; /* Set flags. */
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 		FAIL_IF(push_inst(compiler, (LSRV ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2)));
 		break; /* Set flags. */
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
 		FAIL_IF(push_inst(compiler, (ASRV ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2)));
 		break; /* Set flags. */
+	case SLJIT_ROTL:
+		FAIL_IF(push_inst(compiler, (SUB ^ inv_bits) | RD(TMP_REG2) | RN(TMP_ZERO) | RM(arg2)));
+		arg2 = TMP_REG2;
+		/* fallthrough */
+	case SLJIT_ROTR:
+		return push_inst(compiler, (RORV ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2));
 	default:
 		SLJIT_UNREACHABLE();
 		return SLJIT_SUCCESS;
@@ -911,9 +952,19 @@ static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, s
 		}
 	}
 
-	if (argw <= 255 && argw >= -256)
+	if (argw <= 0xff && argw >= -0x100)
 		return push_inst(compiler, STURBI | type | RT(reg) | RN(arg) | (((sljit_ins)argw & 0x1ff) << 12));
 
+	if (argw >= 0) {
+		if (argw <= 0xfff0ff && ((argw + 0x100) & 0xfff) <= 0x1ff) {
+			FAIL_IF(push_inst(compiler, ADDI | (1 << 22) | RD(tmp_reg) | RN(arg) | (((sljit_ins)argw >> 12) << 10)));
+			return push_inst(compiler, STURBI | type | RT(reg) | RN(tmp_reg) | (((sljit_ins)argw & 0x1ff) << 12));
+		}
+	} else if (argw >= -0xfff100 && ((-argw + 0xff) & 0xfff) <= 0x1ff) {
+		FAIL_IF(push_inst(compiler, SUBI | (1 << 22) | RD(tmp_reg) | RN(arg) | (((sljit_ins)-argw >> 12) << 10)));
+		return push_inst(compiler, STURBI | type | RT(reg) | RN(tmp_reg) | (((sljit_ins)argw & 0x1ff) << 12));
+	}
+
 	FAIL_IF(load_immediate(compiler, tmp_reg, argw));
 
 	return push_inst(compiler, STRB | type | RT(reg) | RN(arg) | RM(tmp_reg));
@@ -936,7 +987,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
 	set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
 	saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - saved_arg_count, 2);
-	saved_regs_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, SSIZE_OF(f64));
+	saved_regs_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64);
 
 	local_size = (local_size + saved_regs_size + 0xf) & ~0xf;
 	compiler->local_size = local_size;
@@ -1109,25 +1160,33 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 	set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
 	saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 2);
-	saved_regs_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, SSIZE_OF(f64));
+	saved_regs_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64);
 
 	compiler->local_size = (local_size + saved_regs_size + 0xf) & ~0xf;
 	return SLJIT_SUCCESS;
 }
 
-static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
+static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 is_return_to)
 {
 	sljit_s32 local_size, prev, fprev, i, tmp;
 	sljit_ins offs;
 
 	local_size = compiler->local_size;
 
-	if (local_size > 512 && local_size <= 512 + 496) {
-		FAIL_IF(push_inst(compiler, LDP_POST | RT(TMP_FP) | RT2(TMP_LR)
-			| RN(SLJIT_SP) | ((sljit_ins)(local_size - 512) << (15 - 3))));
-		local_size = 512;
-	} else
-		FAIL_IF(push_inst(compiler, LDP | RT(TMP_FP) | RT2(TMP_LR) | RN(SLJIT_SP)));
+	if (!is_return_to) {
+		if (local_size > 512 && local_size <= 512 + 496) {
+			FAIL_IF(push_inst(compiler, LDP_POST | RT(TMP_FP) | RT2(TMP_LR)
+				| RN(SLJIT_SP) | ((sljit_ins)(local_size - 512) << (15 - 3))));
+			local_size = 512;
+		} else
+			FAIL_IF(push_inst(compiler, LDP | RT(TMP_FP) | RT2(TMP_LR) | RN(SLJIT_SP)));
+	} else {
+		if (local_size > 512 && local_size <= 512 + 248) {
+			FAIL_IF(push_inst(compiler, LDRI_POST | RT(TMP_FP) | RN(SLJIT_SP) | ((sljit_ins)(local_size - 512) << 12)));
+			local_size = 512;
+		} else
+			FAIL_IF(push_inst(compiler, LDRI | RT(TMP_FP) | RN(SLJIT_SP) | 0));
+	}
 
 	if (local_size > 512) {
 		local_size -= 512;
@@ -1203,11 +1262,34 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_return_void(compiler));
 
-	FAIL_IF(emit_stack_frame_release(compiler));
+	FAIL_IF(emit_stack_frame_release(compiler, 0));
 
 	return push_inst(compiler, RET | RN(TMP_LR));
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{ /* Releases the stack frame and then jumps to the address given by src. */
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+
+	if (src & SLJIT_MEM) { /* Memory operand: read the target into TMP_REG1 before the frame is released. */
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw, TMP_REG1));
+		src = TMP_REG1;
+		srcw = 0;
+	} else if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) { /* src is a saved register outside the kept set: copy it (presumably because the frame release restores it; confirm). */
+		FAIL_IF(push_inst(compiler, ORR | RD(TMP_REG1) | RN(TMP_ZERO) | RM(src))); /* TMP_REG1 = TMP_ZERO | src, i.e. a register move. */
+		src = TMP_REG1;
+		srcw = 0;
+	}
+
+	FAIL_IF(emit_stack_frame_release(compiler, 1)); /* 1 selects the is_return_to variant of the release sequence. */
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */
@@ -1404,6 +1486,75 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil
 	return sljit_emit_op2(compiler, op, TMP_REG1, 0, src1, src1w, src2, src2w);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst_reg,
+	sljit_s32 src1_reg,
+	sljit_s32 src2_reg,
+	sljit_s32 src3, sljit_sw src3w)
+{ /* Emits dst_reg = (src1_reg shifted by src3) OR (src2_reg shifted the opposite way); src3 may be an immediate, a register or memory. */
+	sljit_ins inv_bits, imm;
+	sljit_s32 is_left;
+	sljit_sw mask;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, dst_reg, src1_reg, src2_reg, src3, src3w));
+
+	is_left = (GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_MSHL);
+
+	if (src1_reg == src2_reg) { /* Both sources are the same register: delegate to a rotate, keeping the SLJIT_32 flag. */
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_op2(compiler, (is_left ? SLJIT_ROTL : SLJIT_ROTR) | (op & SLJIT_32), dst_reg, 0, src1_reg, 0, src3, src3w);
+	}
+
+	ADJUST_LOCAL_OFFSET(src3, src3w);
+
+	inv_bits = (op & SLJIT_32) ? W_OP : 0; /* XORed into the opcodes below when the operation is 32 bit. */
+
+	if (src3 & SLJIT_IMM) { /* Immediate count: a single EXTR is enough. */
+		mask = inv_bits ? 0x1f : 0x3f;
+		src3w &= mask;
+
+		if (src3w == 0) /* A zero count emits no instruction at all. */
+			return SLJIT_SUCCESS;
+
+		if (is_left) /* Left shift: use the complementary count, (src3w ^ mask) + 1 == mask + 1 - src3w. */
+			src3w = (src3w ^ mask) + 1;
+
+		return push_inst(compiler, (EXTR ^ (inv_bits | (inv_bits >> 9))) | RD(dst_reg)
+			| RN(is_left ? src1_reg : src2_reg) | RM(is_left ? src2_reg : src1_reg) | ((sljit_ins)src3w << 10)); /* Operand order depends on the direction; count goes at bit 10. */
+	}
+
+	if (src3 & SLJIT_MEM) { /* Fetch a memory shift count into TMP_REG2 first. */
+		FAIL_IF(emit_op_mem(compiler, inv_bits ? INT_SIZE : WORD_SIZE, TMP_REG2, src3, src3w, TMP_REG2));
+		src3 = TMP_REG2;
+	} else if (dst_reg == src3) { /* dst_reg is overwritten by the first shift, so keep a copy of the count. */
+		FAIL_IF(push_inst(compiler, ORR | RD(TMP_REG2) | RN(TMP_ZERO) | RM(src3)));
+		src3 = TMP_REG2;
+	}
+
+	FAIL_IF(push_inst(compiler, ((is_left ? LSLV : LSRV) ^ inv_bits) | RD(dst_reg) | RN(src1_reg) | RM(src3))); /* dst_reg = src1_reg shifted by src3. */
+
+	if (!(op & SLJIT_SHIFT_INTO_NON_ZERO)) { /* Flag not set: build the opposite shift as 1 + (src3 ^ mask). */
+		/* Shift src2_reg by 1 into TMP_REG1, presumably in the direction opposite to the main shift; imm holds the UBFM bit fields for that. */
+		if (is_left)
+			imm = (sljit_ins)(inv_bits ? ((1 << 16) | (31 << 10)) : ((1 << 16) | (63 << 10) | (1 << 22)));
+		else
+			imm = (sljit_ins)(inv_bits ? ((31 << 16) | (30 << 10)) : ((63 << 16) | (62 << 10) | (1 << 22)));
+
+		FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(TMP_REG1) | RN(src2_reg) | imm));
+
+		/* Set imm to the encoded mask (presumably 0x1f or 0x3f as a logical immediate), so that TMP_REG2 = src3 ^ mask. */
+		imm = (sljit_ins)(inv_bits ? (4 << 10) : ((5 << 10) | (1 << 22)));
+		FAIL_IF(push_inst(compiler, (EORI ^ inv_bits) | RD(TMP_REG2) | RN(src3) | imm));
+
+		src2_reg = TMP_REG1; /* The final shift below works on the pre-shifted copy. */
+	} else /* Flag set: TMP_REG2 = TMP_ZERO - src3, and src2_reg is used as is. */
+		FAIL_IF(push_inst(compiler, (SUB ^ inv_bits) | RD(TMP_REG2) | RN(TMP_ZERO) | RM(src3)));
+
+	FAIL_IF(push_inst(compiler, ((is_left ? LSRV : LSLV) ^ inv_bits) | RD(TMP_REG1) | RN(src2_reg) | RM(TMP_REG2))); /* TMP_REG1 = src2_reg shifted the opposite way by TMP_REG2. */
+	return push_inst(compiler, (ORR ^ inv_bits) | RD(dst_reg) | RN(dst_reg) | RM(TMP_REG1)); /* Merge both halves into dst_reg. */
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -1444,6 +1595,32 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *comp
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_dst(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw)
+{ /* Handles SLJIT_FAST_ENTER and SLJIT_GET_RETURN_ADDRESS: the value is produced in a register and stored afterwards when dst is memory. */
+	sljit_s32 dst_r = TMP_LR; /* Register holding the result; stays TMP_LR for SLJIT_FAST_ENTER. */
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op_dst(compiler, op, dst, dstw));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	switch (op) {
+	case SLJIT_FAST_ENTER:
+		if (FAST_IS_REG(dst)) /* Register destination: dst = TMP_ZERO | TMP_LR, i.e. a register move. */
+			return push_inst(compiler, ORR | RD(dst) | RN(TMP_ZERO) | RM(TMP_LR));
+		break; /* Memory destination: TMP_LR is stored by the common tail below. */
+	case SLJIT_GET_RETURN_ADDRESS:
+		dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1; /* Load straight into dst when it is a register, otherwise via TMP_REG1. */
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE, dst_r, SLJIT_MEM1(SLJIT_SP), 0x8, TMP_REG2)); /* Word access at SP + 0x8 without the STORE flag. */
+		break;
+	}
+
+	if (dst & SLJIT_MEM) /* Write the result register to the memory destination. */
+		return emit_op_mem(compiler, WORD_SIZE | STORE, dst_r, dst, dstw, TMP_REG2);
+
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
@@ -1678,21 +1855,23 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compil
 	return emit_fop_mem(compiler, mem_flags | STORE, TMP_FREG1, dst, dstw);
 }
 
-/* --------------------------------------------------------------------- */
-/*  Other instructions                                                   */
-/* --------------------------------------------------------------------- */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg)
 {
+	sljit_ins inst; /* Instruction word assembled below and emitted with a single push_inst. */
+
 	CHECK_ERROR();
-	CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
+	CHECK(check_sljit_emit_fcopy(compiler, op, freg, reg));
 
-	if (FAST_IS_REG(dst))
-		return push_inst(compiler, ORR | RD(dst) | RN(TMP_ZERO) | RM(TMP_LR));
+	if (GET_OPCODE(op) == SLJIT_COPY_TO_F64)
+		inst = FMOV_R | RN(reg) | VD(freg) | (sljit_ins)1 << 16; /* reg is the source (RN), freg the destination (VD); bit 16 is set for this direction. */
+	else
+		inst = FMOV_R | VN(freg) | RD(reg); /* Any other opcode: freg is the source (VN), reg the destination (RD). */
 
-	/* Memory. */
-	return emit_op_mem(compiler, WORD_SIZE | STORE, TMP_LR, dst, dstw, TMP_REG1);
+	if (op & SLJIT_32)
+		inst ^= ((sljit_ins)1 << 31) | ((sljit_ins)1 << 22); /* SLJIT_32: flip bits 31 and 22 (presumably selects the 32-bit form -- confirm against FMOV_R). */
+
+	return push_inst(compiler, inst);
 }
 
 /* --------------------------------------------------------------------- */
@@ -1836,7 +2015,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compile
 	CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
 
 	if (type & SLJIT_CALL_RETURN) {
-		PTR_FAIL_IF(emit_stack_frame_release(compiler));
+		PTR_FAIL_IF(emit_stack_frame_release(compiler, 0));
 		type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
 	}
 
@@ -1885,10 +2064,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compi
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
-	ADJUST_LOCAL_OFFSET(src, srcw);
 
 	if (!(src & SLJIT_IMM)) {
 		if (src & SLJIT_MEM) {
+			ADJUST_LOCAL_OFFSET(src, srcw);
 			FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw, TMP_REG1));
 			src = TMP_REG1;
 		}
@@ -1913,20 +2092,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 	SLJIT_UNUSED_ARG(arg_types);
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
-	ADJUST_LOCAL_OFFSET(src, srcw);
 
 	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
 		FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw, TMP_REG1));
 		src = TMP_REG1;
 	}
 
 	if (type & SLJIT_CALL_RETURN) {
-		if (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0) {
+		if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
 			FAIL_IF(push_inst(compiler, ORR | RD(TMP_REG1) | RN(TMP_ZERO) | RM(src)));
 			src = TMP_REG1;
 		}
 
-		FAIL_IF(emit_stack_frame_release(compiler));
+		FAIL_IF(emit_stack_frame_release(compiler, 0));
 		type = SLJIT_JUMP;
 	}
 
@@ -1986,22 +2165,21 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
 	sljit_s32 dst_reg,
 	sljit_s32 src, sljit_sw srcw)
 {
-	sljit_ins inv_bits = (dst_reg & SLJIT_32) ? W_OP : 0;
+	sljit_ins inv_bits = (type & SLJIT_32) ? W_OP : 0;
 	sljit_ins cc;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
 
 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
-		if (dst_reg & SLJIT_32)
+		if (type & SLJIT_32)
 			srcw = (sljit_s32)srcw;
 		FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
 		src = TMP_REG1;
 		srcw = 0;
 	}
 
-	cc = get_cc(compiler, type);
-	dst_reg &= ~SLJIT_32;
+	cc = get_cc(compiler, type & ~SLJIT_32);
 
 	return push_inst(compiler, (CSEL ^ inv_bits) | (cc << 12) | RD(dst_reg) | RN(dst_reg) | RM(src));
 }
@@ -2010,59 +2188,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 	sljit_s32 reg,
 	sljit_s32 mem, sljit_sw memw)
 {
-	sljit_u32 sign = 0, inst;
+	sljit_u32 inst;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
 
-	if (!(reg & REG_PAIR_MASK)) {
-		if (type & SLJIT_MEM_UNALIGNED)
-			return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
-
-		if ((mem & OFFS_REG_MASK) || (memw > 255 || memw < -256))
-			return SLJIT_ERR_UNSUPPORTED;
-
-		if (type & SLJIT_MEM_SUPP)
-			return SLJIT_SUCCESS;
-
-		switch (type & 0xff) {
-		case SLJIT_MOV:
-		case SLJIT_MOV_P:
-			inst = STURBI | (MEM_SIZE_SHIFT(WORD_SIZE) << 30) | 0x400;
-			break;
-		case SLJIT_MOV_S8:
-			sign = 1;
-			/* fallthrough */
-		case SLJIT_MOV_U8:
-			inst = STURBI | (MEM_SIZE_SHIFT(BYTE_SIZE) << 30) | 0x400;
-			break;
-		case SLJIT_MOV_S16:
-			sign = 1;
-			/* fallthrough */
-		case SLJIT_MOV_U16:
-			inst = STURBI | (MEM_SIZE_SHIFT(HALF_SIZE) << 30) | 0x400;
-			break;
-		case SLJIT_MOV_S32:
-			sign = 1;
-			/* fallthrough */
-		case SLJIT_MOV_U32:
-		case SLJIT_MOV32:
-			inst = STURBI | (MEM_SIZE_SHIFT(INT_SIZE) << 30) | 0x400;
-			break;
-		default:
-			SLJIT_UNREACHABLE();
-			inst = STURBI | (MEM_SIZE_SHIFT(WORD_SIZE) << 30) | 0x400;
-			break;
-		}
-
-		if (!(type & SLJIT_MEM_STORE))
-			inst |= sign ? 0x00800000 : 0x00400000;
-
-		if (type & SLJIT_MEM_PRE)
-			inst |= 0x800;
-
-		return push_inst(compiler, inst | RT(reg) | RN(mem & REG_MASK) | (sljit_ins)((memw & 0x1ff) << 12));
-	}
+	if (!(reg & REG_PAIR_MASK))
+		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
 
 	ADJUST_LOCAL_OFFSET(mem, memw);
 
@@ -2123,17 +2255,68 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 	return push_inst(compiler, ((type & SLJIT_MEM_STORE) ? STP : LDP) | RT(REG_PAIR_FIRST(reg)) | RT2(REG_PAIR_SECOND(reg)) | RN(mem & REG_MASK) | (sljit_ins)((memw & 0x3f8) << 12));
 }
 
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_u32 sign = 0, inst; /* sign is set to 1 by the S8/S16/S32 cases of the switch below. */
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem_update(compiler, type, reg, mem, memw));
+
+	if ((mem & OFFS_REG_MASK) || (memw > 255 || memw < -256)) /* Only a base register with an immediate offset in [-256, 255] is handled. */
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_MEM_SUPP) /* SLJIT_MEM_SUPP: report success without emitting any instruction. */
+		return SLJIT_SUCCESS;
+
+	switch (type & 0xff) { /* The low byte of type selects the access size encoded into inst. */
+	case SLJIT_MOV:
+	case SLJIT_MOV_P:
+		inst = STURBI | (MEM_SIZE_SHIFT(WORD_SIZE) << 30) | 0x400;
+		break;
+	case SLJIT_MOV_S8:
+		sign = 1;
+		/* fallthrough */
+	case SLJIT_MOV_U8:
+		inst = STURBI | (MEM_SIZE_SHIFT(BYTE_SIZE) << 30) | 0x400;
+		break;
+	case SLJIT_MOV_S16:
+		sign = 1;
+		/* fallthrough */
+	case SLJIT_MOV_U16:
+		inst = STURBI | (MEM_SIZE_SHIFT(HALF_SIZE) << 30) | 0x400;
+		break;
+	case SLJIT_MOV_S32:
+		sign = 1;
+		/* fallthrough */
+	case SLJIT_MOV_U32:
+	case SLJIT_MOV32:
+		inst = STURBI | (MEM_SIZE_SHIFT(INT_SIZE) << 30) | 0x400;
+		break;
+	default:
+		SLJIT_UNREACHABLE();
+		inst = STURBI | (MEM_SIZE_SHIFT(WORD_SIZE) << 30) | 0x400; /* Same value as the SLJIT_MOV case, assigned so inst is initialized on this path too. */
+		break;
+	}
+
+	if (!(type & SLJIT_MEM_STORE)) /* Loads only: OR in 0x00800000 for the signed types, 0x00400000 otherwise. */
+		inst |= sign ? 0x00800000 : 0x00400000;
+
+	if (!(type & SLJIT_MEM_POST)) /* Without SLJIT_MEM_POST, 0x800 is set (presumably the pre-index form -- confirm against the encoding). */
+		inst |= 0x800;
+
+	return push_inst(compiler, inst | RT(reg) | RN(mem & REG_MASK) | (sljit_ins)((memw & 0x1ff) << 12)); /* memw is encoded as a 9-bit field starting at bit 12. */
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 freg,
 	sljit_s32 mem, sljit_sw memw)
 {
 	sljit_u32 inst;
 
 	CHECK_ERROR();
-	CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
-
-	if (type & SLJIT_MEM_UNALIGNED)
-		return sljit_emit_fmem_unaligned(compiler, type, freg, mem, memw);
+	CHECK(check_sljit_emit_fmem_update(compiler, type, freg, mem, memw));
 
 	if ((mem & OFFS_REG_MASK) || (memw > 255 || memw < -256))
 		return SLJIT_ERR_UNSUPPORTED;
@@ -2149,7 +2332,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compil
 	if (!(type & SLJIT_MEM_STORE))
 		inst |= 0x00400000;
 
-	if (type & SLJIT_MEM_PRE)
+	if (!(type & SLJIT_MEM_POST))
 		inst |= 0x800;
 
 	return push_inst(compiler, inst | VT(freg) | RN(mem & REG_MASK) | (sljit_ins)((memw & 0x1ff) << 12));
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeARM_T2_32.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeARM_T2_32.c
index 55c810bb78..73dd7f99d5 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeARM_T2_32.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeARM_T2_32.c
@@ -160,6 +160,12 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
 #define POP_W		0xe8bd0000
 #define PUSH		0xb400
 #define PUSH_W		0xe92d0000
+#define REV		0xba00
+#define REV_W		0xfa90f080
+#define RBIT		0xfa90f0a0
+#define RORS		0x41c0
+#define ROR_W		0xfa60f000
+#define ROR_WI		0xea4f0030
 #define RSB_WI		0xf1c00000
 #define RSBSI		0x4240
 #define SBCI		0xf1600000
@@ -492,8 +498,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
 #endif
 
 	case SLJIT_HAS_CLZ:
+	case SLJIT_HAS_CTZ:
+	case SLJIT_HAS_REV:
+	case SLJIT_HAS_ROT:
 	case SLJIT_HAS_CMOV:
 	case SLJIT_HAS_PREFETCH:
+	case SLJIT_HAS_COPY_F32:
+	case SLJIT_HAS_COPY_F64:
 		return 1;
 
 	default:
@@ -593,7 +604,7 @@ static sljit_s32 emit_op_imm(struct sljit_compiler *compiler, sljit_s32 flags, s
 	   arg1 must be register, imm
 	   arg2 must be register, imm */
 	sljit_s32 reg;
-	sljit_uw imm, nimm;
+	sljit_uw imm, imm2;
 
 	if (SLJIT_UNLIKELY((flags & (ARG1_IMM | ARG2_IMM)) == (ARG1_IMM | ARG2_IMM))) {
 		/* Both are immediates, no temporaries are used. */
@@ -608,45 +619,41 @@ static sljit_s32 emit_op_imm(struct sljit_compiler *compiler, sljit_s32 flags, s
 
 		switch (flags & 0xffff) {
 		case SLJIT_CLZ:
+		case SLJIT_CTZ:
+		case SLJIT_REV:
 		case SLJIT_MUL:
 			/* No form with immediate operand. */
 			break;
 		case SLJIT_MOV:
 			SLJIT_ASSERT(!(flags & SET_FLAGS) && (flags & ARG2_IMM) && arg1 == TMP_REG2);
 			return load_immediate(compiler, dst, imm);
-		case SLJIT_NOT:
-			if (!(flags & SET_FLAGS))
-				return load_immediate(compiler, dst, ~imm);
-			/* Since the flags should be set, we just fallback to the register mode.
-			   Although some clever things could be done here, "NOT IMM" does not worth the efforts. */
-			break;
 		case SLJIT_ADD:
 			compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD;
-			nimm = NEGATE(imm);
+			imm2 = NEGATE(imm);
 			if (IS_2_LO_REGS(reg, dst)) {
 				if (imm <= 0x7)
 					return push_inst16(compiler, ADDSI3 | IMM3(imm) | RD3(dst) | RN3(reg));
-				if (nimm <= 0x7)
-					return push_inst16(compiler, SUBSI3 | IMM3(nimm) | RD3(dst) | RN3(reg));
+				if (imm2 <= 0x7)
+					return push_inst16(compiler, SUBSI3 | IMM3(imm2) | RD3(dst) | RN3(reg));
 				if (reg == dst) {
 					if (imm <= 0xff)
 						return push_inst16(compiler, ADDSI8 | IMM8(imm) | RDN3(dst));
-					if (nimm <= 0xff)
-						return push_inst16(compiler, SUBSI8 | IMM8(nimm) | RDN3(dst));
+					if (imm2 <= 0xff)
+						return push_inst16(compiler, SUBSI8 | IMM8(imm2) | RDN3(dst));
 				}
 			}
 			if (!(flags & SET_FLAGS)) {
 				if (imm <= 0xfff)
 					return push_inst32(compiler, ADDWI | RD4(dst) | RN4(reg) | IMM12(imm));
-				if (nimm <= 0xfff)
-					return push_inst32(compiler, SUBWI | RD4(dst) | RN4(reg) | IMM12(nimm));
+				if (imm2 <= 0xfff)
+					return push_inst32(compiler, SUBWI | RD4(dst) | RN4(reg) | IMM12(imm2));
 			}
-			nimm = get_imm(imm);
-			if (nimm != INVALID_IMM)
-				return push_inst32(compiler, ADD_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | nimm);
-			nimm = get_imm(NEGATE(imm));
-			if (nimm != INVALID_IMM)
-				return push_inst32(compiler, SUB_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | nimm);
+			imm2 = get_imm(imm);
+			if (imm2 != INVALID_IMM)
+				return push_inst32(compiler, ADD_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm2);
+			imm = get_imm(NEGATE(imm));
+			if (imm != INVALID_IMM)
+				return push_inst32(compiler, SUB_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
 			break;
 		case SLJIT_ADDC:
 			compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD;
@@ -667,39 +674,39 @@ static sljit_s32 emit_op_imm(struct sljit_compiler *compiler, sljit_s32 flags, s
 			if (flags & UNUSED_RETURN) {
 				if (imm <= 0xff && reg_map[reg] <= 7)
 					return push_inst16(compiler, CMPI | IMM8(imm) | RDN3(reg));
-				nimm = get_imm(imm);
-				if (nimm != INVALID_IMM)
-					return push_inst32(compiler, CMPI_W | RN4(reg) | nimm);
-				nimm = get_imm(NEGATE(imm));
-				if (nimm != INVALID_IMM)
-					return push_inst32(compiler, CMNI_W | RN4(reg) | nimm);
+				imm2 = get_imm(imm);
+				if (imm2 != INVALID_IMM)
+					return push_inst32(compiler, CMPI_W | RN4(reg) | imm2);
+				imm = get_imm(NEGATE(imm));
+				if (imm != INVALID_IMM)
+					return push_inst32(compiler, CMNI_W | RN4(reg) | imm);
 				break;
 			}
-			nimm = NEGATE(imm);
+			imm2 = NEGATE(imm);
 			if (IS_2_LO_REGS(reg, dst)) {
 				if (imm <= 0x7)
 					return push_inst16(compiler, SUBSI3 | IMM3(imm) | RD3(dst) | RN3(reg));
-				if (nimm <= 0x7)
-					return push_inst16(compiler, ADDSI3 | IMM3(nimm) | RD3(dst) | RN3(reg));
+				if (imm2 <= 0x7)
+					return push_inst16(compiler, ADDSI3 | IMM3(imm2) | RD3(dst) | RN3(reg));
 				if (reg == dst) {
 					if (imm <= 0xff)
 						return push_inst16(compiler, SUBSI8 | IMM8(imm) | RDN3(dst));
-					if (nimm <= 0xff)
-						return push_inst16(compiler, ADDSI8 | IMM8(nimm) | RDN3(dst));
+					if (imm2 <= 0xff)
+						return push_inst16(compiler, ADDSI8 | IMM8(imm2) | RDN3(dst));
 				}
 			}
 			if (!(flags & SET_FLAGS)) {
 				if (imm <= 0xfff)
 					return push_inst32(compiler, SUBWI | RD4(dst) | RN4(reg) | IMM12(imm));
-				if (nimm <= 0xfff)
-					return push_inst32(compiler, ADDWI | RD4(dst) | RN4(reg) | IMM12(nimm));
+				if (imm2 <= 0xfff)
+					return push_inst32(compiler, ADDWI | RD4(dst) | RN4(reg) | IMM12(imm2));
 			}
-			nimm = get_imm(imm);
-			if (nimm != INVALID_IMM)
-				return push_inst32(compiler, SUB_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | nimm);
-			nimm = get_imm(NEGATE(imm));
-			if (nimm != INVALID_IMM)
-				return push_inst32(compiler, ADD_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | nimm);
+			imm2 = get_imm(imm);
+			if (imm2 != INVALID_IMM)
+				return push_inst32(compiler, SUB_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm2);
+			imm = get_imm(NEGATE(imm));
+			if (imm != INVALID_IMM)
+				return push_inst32(compiler, ADD_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
 			break;
 		case SLJIT_SUBC:
 			compiler->status_flags_state = SLJIT_CURRENT_FLAGS_SUB;
@@ -710,32 +717,43 @@ static sljit_s32 emit_op_imm(struct sljit_compiler *compiler, sljit_s32 flags, s
 				return push_inst32(compiler, SBCI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
 			break;
 		case SLJIT_AND:
-			nimm = get_imm(imm);
-			if (nimm != INVALID_IMM)
-				return push_inst32(compiler, ((flags & UNUSED_RETURN) ? TSTI : ANDI) | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | nimm);
+			imm2 = get_imm(imm);
+			if (imm2 != INVALID_IMM)
+				return push_inst32(compiler, ((flags & UNUSED_RETURN) ? TSTI : ANDI) | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm2);
 			imm = get_imm(~imm);
 			if (imm != INVALID_IMM)
 				return push_inst32(compiler, BICI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
 			break;
 		case SLJIT_OR:
-			nimm = get_imm(imm);
-			if (nimm != INVALID_IMM)
-				return push_inst32(compiler, ORRI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | nimm);
+			imm2 = get_imm(imm);
+			if (imm2 != INVALID_IMM)
+				return push_inst32(compiler, ORRI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm2);
 			imm = get_imm(~imm);
 			if (imm != INVALID_IMM)
 				return push_inst32(compiler, ORNI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
 			break;
 		case SLJIT_XOR:
+			if (imm == (sljit_uw)-1) {
+				if (IS_2_LO_REGS(dst, reg))
+					return push_inst16(compiler, MVNS | RD3(dst) | RN3(reg));
+				return push_inst32(compiler, MVN_W | (flags & SET_FLAGS) | RD4(dst) | RM4(reg));
+			}
 			imm = get_imm(imm);
 			if (imm != INVALID_IMM)
 				return push_inst32(compiler, EORI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
 			break;
 		case SLJIT_SHL:
+		case SLJIT_MSHL:
 		case SLJIT_LSHR:
+		case SLJIT_MLSHR:
 		case SLJIT_ASHR:
+		case SLJIT_MASHR:
+		case SLJIT_ROTL:
+		case SLJIT_ROTR:
 			if (flags & ARG1_IMM)
 				break;
 			imm &= 0x1f;
+
 			if (imm == 0) {
 				if (!(flags & SET_FLAGS))
 					return push_inst16(compiler, MOV | SET_REGS44(dst, reg));
@@ -743,19 +761,28 @@ static sljit_s32 emit_op_imm(struct sljit_compiler *compiler, sljit_s32 flags, s
 					return push_inst16(compiler, MOVS | RD3(dst) | RN3(reg));
 				return push_inst32(compiler, MOV_W | SET_FLAGS | RD4(dst) | RM4(reg));
 			}
+
 			switch (flags & 0xffff) {
 			case SLJIT_SHL:
+			case SLJIT_MSHL:
 				if (IS_2_LO_REGS(dst, reg))
 					return push_inst16(compiler, LSLSI | RD3(dst) | RN3(reg) | (imm << 6));
 				return push_inst32(compiler, LSL_WI | (flags & SET_FLAGS) | RD4(dst) | RM4(reg) | IMM5(imm));
 			case SLJIT_LSHR:
+			case SLJIT_MLSHR:
 				if (IS_2_LO_REGS(dst, reg))
 					return push_inst16(compiler, LSRSI | RD3(dst) | RN3(reg) | (imm << 6));
 				return push_inst32(compiler, LSR_WI | (flags & SET_FLAGS) | RD4(dst) | RM4(reg) | IMM5(imm));
-			default: /* SLJIT_ASHR */
+			case SLJIT_ASHR:
+			case SLJIT_MASHR:
 				if (IS_2_LO_REGS(dst, reg))
 					return push_inst16(compiler, ASRSI | RD3(dst) | RN3(reg) | (imm << 6));
 				return push_inst32(compiler, ASR_WI | (flags & SET_FLAGS) | RD4(dst) | RM4(reg) | IMM5(imm));
+			case SLJIT_ROTL:
+				imm = (imm ^ 0x1f) + 1;
+				/* fallthrough */
+			default: /* SLJIT_ROTR */
+				return push_inst32(compiler, ROR_WI | RD4(dst) | RM4(reg) | IMM5(imm));
 			}
 		default:
 			SLJIT_UNREACHABLE();
@@ -807,15 +834,17 @@ static sljit_s32 emit_op_imm(struct sljit_compiler *compiler, sljit_s32 flags, s
 		if (IS_2_LO_REGS(dst, arg2))
 			return push_inst16(compiler, SXTH | RD3(dst) | RN3(arg2));
 		return push_inst32(compiler, SXTH_W | RD4(dst) | RM4(arg2));
-	case SLJIT_NOT:
-		SLJIT_ASSERT(arg1 == TMP_REG2);
-		if (IS_2_LO_REGS(dst, arg2))
-			return push_inst16(compiler, MVNS | RD3(dst) | RN3(arg2));
-		return push_inst32(compiler, MVN_W | (flags & SET_FLAGS) | RD4(dst) | RM4(arg2));
 	case SLJIT_CLZ:
 		SLJIT_ASSERT(arg1 == TMP_REG2);
-		FAIL_IF(push_inst32(compiler, CLZ | RN4(arg2) | RD4(dst) | RM4(arg2)));
-		return SLJIT_SUCCESS;
+		return push_inst32(compiler, CLZ | RN4(arg2) | RD4(dst) | RM4(arg2));
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(arg1 == TMP_REG2);
+		FAIL_IF(push_inst32(compiler, RBIT | RN4(arg2) | RD4(dst) | RM4(arg2)));
+		return push_inst32(compiler, CLZ | RN4(dst) | RD4(dst) | RM4(dst));
+	case SLJIT_REV:
+		if (IS_2_LO_REGS(dst, arg2))
+			return push_inst16(compiler, REV | RD3(dst) | RN3(arg2));
+		return push_inst32(compiler, REV_W | RN4(arg2) | RD4(dst) | RM4(arg2));
 	case SLJIT_ADD:
 		compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD;
 		if (IS_3_LO_REGS(dst, arg1, arg2))
@@ -865,18 +894,38 @@ static sljit_s32 emit_op_imm(struct sljit_compiler *compiler, sljit_s32 flags, s
 		if (dst == (sljit_s32)arg1 && IS_2_LO_REGS(dst, arg2))
 			return push_inst16(compiler, EORS | RD3(dst) | RN3(arg2));
 		return push_inst32(compiler, EOR_W | (flags & SET_FLAGS) | RD4(dst) | RN4(arg1) | RM4(arg2));
+	case SLJIT_MSHL:
+		FAIL_IF(push_inst32(compiler, ANDI | RD4(TMP_REG2) | RN4(arg2) | 0x1f));
+		arg2 = TMP_REG2;
+		/* fallthrough */
 	case SLJIT_SHL:
 		if (dst == (sljit_s32)arg1 && IS_2_LO_REGS(dst, arg2))
 			return push_inst16(compiler, LSLS | RD3(dst) | RN3(arg2));
 		return push_inst32(compiler, LSL_W | (flags & SET_FLAGS) | RD4(dst) | RN4(arg1) | RM4(arg2));
+	case SLJIT_MLSHR:
+		FAIL_IF(push_inst32(compiler, ANDI | RD4(TMP_REG2) | RN4(arg2) | 0x1f));
+		arg2 = TMP_REG2;
+		/* fallthrough */
 	case SLJIT_LSHR:
 		if (dst == (sljit_s32)arg1 && IS_2_LO_REGS(dst, arg2))
 			return push_inst16(compiler, LSRS | RD3(dst) | RN3(arg2));
 		return push_inst32(compiler, LSR_W | (flags & SET_FLAGS) | RD4(dst) | RN4(arg1) | RM4(arg2));
+	case SLJIT_MASHR:
+		FAIL_IF(push_inst32(compiler, ANDI | RD4(TMP_REG2) | RN4(arg2) | 0x1f));
+		arg2 = TMP_REG2;
+		/* fallthrough */
 	case SLJIT_ASHR:
 		if (dst == (sljit_s32)arg1 && IS_2_LO_REGS(dst, arg2))
 			return push_inst16(compiler, ASRS | RD3(dst) | RN3(arg2));
 		return push_inst32(compiler, ASR_W | (flags & SET_FLAGS) | RD4(dst) | RN4(arg1) | RM4(arg2));
+	case SLJIT_ROTL:
+		FAIL_IF(push_inst32(compiler, RSB_WI | RD4(TMP_REG2) | RN4(arg2) | 0));
+		arg2 = TMP_REG2;
+		/* fallthrough */
+	case SLJIT_ROTR:
+		if (dst == (sljit_s32)arg1 && IS_2_LO_REGS(dst, arg2))
+			return push_inst16(compiler, RORS | RD3(dst) | RN3(arg2));
+		return push_inst32(compiler, ROR_W | RD4(dst) | RN4(arg1) | RM4(arg2));
 	}
 
 	SLJIT_UNREACHABLE();
@@ -1311,6 +1360,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 
 	size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 1);
 
+	/* Doubles are saved, so alignment is unaffected. */
 	if ((size & SSIZE_OF(sw)) != 0 && (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG))
 		size += SSIZE_OF(sw);
 
@@ -1341,9 +1391,9 @@ static sljit_s32 emit_add_sp(struct sljit_compiler *compiler, sljit_uw imm)
 static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 frame_size)
 {
 	sljit_s32 local_size, fscratches, fsaveds, i, tmp;
-	sljit_s32 saveds_restore_start = SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options);
+	sljit_s32 restored_reg = 0;
 	sljit_s32 lr_dst = TMP_PC;
-	sljit_uw reg_list;
+	sljit_uw reg_list = 0;
 
 	SLJIT_ASSERT(reg_map[TMP_REG2] == 14 && frame_size <= 128);
 
@@ -1370,49 +1420,88 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit
 	if (frame_size < 0) {
 		lr_dst = TMP_REG2;
 		frame_size = 0;
-	} else if (frame_size > 0)
+	} else if (frame_size > 0) {
+		SLJIT_ASSERT(frame_size == 1 || (frame_size & 0x7) == 0);
 		lr_dst = 0;
+		frame_size &= ~0x7;
+	}
 
-	reg_list = 0;
 	tmp = SLJIT_S0 - compiler->saveds;
-	if (saveds_restore_start != tmp) {
-		for (i = saveds_restore_start; i > tmp; i--)
+	i = SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options);
+	if (tmp < i) {
+		restored_reg = i;
+		do {
 			reg_list |= (sljit_uw)1 << reg_map[i];
-	} else
-		saveds_restore_start = 0;
+		} while (--i > tmp);
+	}
 
-	for (i = compiler->scratches; i >= SLJIT_FIRST_SAVED_REG; i--)
-		reg_list |= (sljit_uw)1 << reg_map[i];
+	i = compiler->scratches;
+	if (i >= SLJIT_FIRST_SAVED_REG) {
+		restored_reg = i;
+		do {
+			reg_list |= (sljit_uw)1 << reg_map[i];
+		} while (--i >= SLJIT_FIRST_SAVED_REG);
+	}
+
+	if (lr_dst == TMP_REG2 && reg_list == 0) {
+		reg_list |= (sljit_uw)1 << reg_map[TMP_REG2];
+		restored_reg = TMP_REG2;
+		lr_dst = 0;
+	}
 
 	if (lr_dst == 0 && (reg_list & (reg_list - 1)) == 0) {
 		/* The local_size does not include the saved registers. */
+		tmp = 0;
+		if (reg_list != 0) {
+			tmp = 2;
+			if (local_size <= 0xfff) {
+				if (local_size == 0) {
+					SLJIT_ASSERT(restored_reg != TMP_REG2);
+					if (frame_size == 0)
+						return push_inst32(compiler, LDRI | RT4(restored_reg) | RN4(SLJIT_SP) | 0x308);
+					if (frame_size > 2 * SSIZE_OF(sw))
+						return push_inst32(compiler, LDRI | RT4(restored_reg) | RN4(SLJIT_SP) | 0x100 | (sljit_ins)(frame_size - (2 * SSIZE_OF(sw))));
+				}
+
+				if (reg_map[restored_reg] <= 7 && local_size <= 0x3fc)
+					FAIL_IF(push_inst16(compiler, STR_SP | 0x800 | RDN3(restored_reg) | (sljit_ins)(local_size >> 2)));
+				else
+					FAIL_IF(push_inst32(compiler, LDR | RT4(restored_reg) | RN4(SLJIT_SP) | (sljit_ins)local_size));
+				tmp = 1;
+			} else if (frame_size == 0) {
+				frame_size = (restored_reg == TMP_REG2) ? SSIZE_OF(sw) : 2 * SSIZE_OF(sw);
+				tmp = 3;
+			}
+
+			/* Place for the saved register. */
+			if (restored_reg != TMP_REG2)
+				local_size += SSIZE_OF(sw);
+		}
+
+		/* Place for the lr register. */
 		local_size += SSIZE_OF(sw);
 
-		if (reg_list != 0)
-			local_size += SSIZE_OF(sw);
-
 		if (frame_size > local_size)
-			FAIL_IF(push_inst16(compiler, SUB_SP_I | ((sljit_uw)(frame_size - local_size) >> 2)));
+			FAIL_IF(push_inst16(compiler, SUB_SP_I | ((sljit_ins)(frame_size - local_size) >> 2)));
 		else if (frame_size < local_size)
 			FAIL_IF(emit_add_sp(compiler, (sljit_uw)(local_size - frame_size)));
 
-		if (reg_list == 0)
+		if (tmp <= 1)
 			return SLJIT_SUCCESS;
 
-		if (saveds_restore_start != 0) {
-			SLJIT_ASSERT(reg_list == ((sljit_uw)1 << reg_map[saveds_restore_start]));
-			lr_dst = saveds_restore_start;
-		} else {
-			SLJIT_ASSERT(reg_list == ((sljit_uw)1 << reg_map[SLJIT_FIRST_SAVED_REG]));
-			lr_dst = SLJIT_FIRST_SAVED_REG;
+		if (tmp == 2) {
+			frame_size -= SSIZE_OF(sw);
+			if (restored_reg != TMP_REG2)
+				frame_size -= SSIZE_OF(sw);
+
+			if (reg_map[restored_reg] <= 7)
+				return push_inst16(compiler, STR_SP | 0x800 | RDN3(restored_reg) | (sljit_ins)(frame_size >> 2));
+
+			return push_inst32(compiler, LDR | RT4(restored_reg) | RN4(SLJIT_SP) | (sljit_ins)frame_size);
 		}
 
-		frame_size -= 2 * SSIZE_OF(sw);
-
-		if (reg_map[lr_dst] <= 7)
-			return push_inst16(compiler, STR_SP | 0x800 | RDN3(lr_dst) | (sljit_uw)(frame_size >> 2));
-
-		return push_inst32(compiler, LDR | RT4(lr_dst) | RN4(SLJIT_SP) | (sljit_uw)frame_size);
+		tmp = (restored_reg == TMP_REG2) ? 0x304 : 0x308;
+		return push_inst32(compiler, LDRI | RT4(restored_reg) | RN4(SLJIT_SP) | (sljit_ins)tmp);
 	}
 
 	if (local_size > 0)
@@ -1427,12 +1516,8 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit
 
 		FAIL_IF(push_inst16(compiler, POP | reg_list));
 	} else {
-		if (lr_dst != 0) {
-			if (reg_list == 0)
-				return push_inst32(compiler, 0xf85d0b04 | RT4(lr_dst));
-
+		if (lr_dst != 0)
 			reg_list |= (sljit_uw)1 << reg_map[lr_dst];
-		}
 
 		/* At least two registers must be set for POP_W instruction. */
 		SLJIT_ASSERT((reg_list & (reg_list - 1)) != 0);
@@ -1441,8 +1526,12 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit
 	}
 
 	if (frame_size > 0)
-		return push_inst16(compiler, SUB_SP_I | (((sljit_uw)frame_size - sizeof(sljit_sw)) >> 2));
-	return SLJIT_SUCCESS;
+		return push_inst16(compiler, SUB_SP_I | (((sljit_ins)frame_size - sizeof(sljit_sw)) >> 2));
+
+	if (lr_dst != 0)
+		return SLJIT_SUCCESS;
+
+	return push_inst16(compiler, ADD_SP_I | 1);
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler)
@@ -1453,6 +1542,28 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler
 	return emit_stack_frame_release(compiler, 0);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+
+	if (src & SLJIT_MEM) { /* Memory source: load the jump target into TMP_REG1 before the frame is released. */
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw, TMP_REG1));
+		src = TMP_REG1;
+		srcw = 0;
+	} else if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) { /* src lies in the register range tested here: copy it to TMP_REG1 first (presumably because the frame release reloads these registers -- confirm). */
+		FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(TMP_REG1, src)));
+		src = TMP_REG1;
+		srcw = 0;
+	}
+
+	FAIL_IF(emit_stack_frame_release(compiler, 1)); /* NOTE(review): frame_size 1 looks like a marker for "release without returning" -- confirm in emit_stack_frame_release. */
+
+	SLJIT_SKIP_CHECKS(compiler); /* Arguments were validated above; presumably suppresses the argument check of the nested call. */
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */
@@ -1709,6 +1820,63 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil
 	return sljit_emit_op2(compiler, op, TMP_REG1, 0, src1, src1w, src2, src2w);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst_reg,
+	sljit_s32 src1_reg,
+	sljit_s32 src2_reg,
+	sljit_s32 src3, sljit_sw src3w)
+{
+	sljit_s32 is_left; /* Non-zero for SLJIT_SHL / SLJIT_MSHL, zero for every other opcode. */
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, dst_reg, src1_reg, src2_reg, src3, src3w));
+
+	op = GET_OPCODE(op);
+	is_left = (op == SLJIT_SHL || op == SLJIT_MSHL);
+
+	if (src1_reg == src2_reg) { /* Both sources are the same register: delegate to sljit_emit_op2 as a rotate. */
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_op2(compiler, is_left ? SLJIT_ROTL : SLJIT_ROTR, dst_reg, 0, src1_reg, 0, src3, src3w);
+	}
+
+	ADJUST_LOCAL_OFFSET(src3, src3w);
+
+	if (src3 & SLJIT_IMM) {
+		src3w &= 0x1f; /* Only the low five bits of an immediate shift count are used. */
+
+		if (src3w == 0) /* A zero count emits no instruction at all. */
+			return SLJIT_SUCCESS;
+
+		if (IS_2_LO_REGS(dst_reg, src1_reg)) /* First step: dst_reg = src1_reg shifted by src3w (16-bit form when both registers are low). */
+			FAIL_IF(push_inst16(compiler, (is_left ? LSLSI : LSRSI) | RD3(dst_reg) | RN3(src1_reg) | ((sljit_ins)src3w << 6)));
+		else
+			FAIL_IF(push_inst32(compiler, (is_left ? LSL_WI : LSR_WI) | RD4(dst_reg) | RM4(src1_reg) | IMM5(src3w)));
+
+		src3w = (src3w ^ 0x1f) + 1; /* Equals 32 - src3w for src3w in 1..31. */
+		return push_inst32(compiler, ORR_W | RD4(dst_reg) | RN4(dst_reg) | RM4(src2_reg) | (is_left ? 0x10 : 0x0) | IMM5(src3w)); /* OR in src2_reg with the complementary count; 0x10 presumably selects the opposite shift type -- confirm against ORR_W. */
+	}
+
+	if (src3 & SLJIT_MEM) { /* Memory shift count: load it into TMP_REG2. */
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG2, src3, src3w, TMP_REG2));
+		src3 = TMP_REG2;
+	}
+
+	if (op == SLJIT_MSHL || op == SLJIT_MLSHR || dst_reg == src3) { /* Put count & 0x1f in TMP_REG2; also done when dst_reg aliases src3, since dst_reg is written below before the count is read again. */
+		FAIL_IF(push_inst32(compiler, ANDI | RD4(TMP_REG2) | RN4(src3) | 0x1f));
+		src3 = TMP_REG2;
+	}
+
+	if (dst_reg == src1_reg && IS_2_LO_REGS(dst_reg, src3)) /* dst_reg = src1_reg shifted by the register count. */
+		FAIL_IF(push_inst16(compiler, (is_left ? LSLS : LSRS) | RD3(dst_reg) | RN3(src3)));
+	else
+		FAIL_IF(push_inst32(compiler, (is_left ? LSL_W : LSR_W) | RD4(dst_reg) | RN4(src1_reg) | RM4(src3)));
+
+	FAIL_IF(push_inst32(compiler, (is_left ? LSR_WI : LSL_WI) | RD4(TMP_REG1) | RM4(src2_reg) | (1 << 6))); /* TMP_REG1 = src2_reg shifted the opposite way by a fixed immediate (presumably 1 -- confirm the (1 << 6) field). */
+	FAIL_IF(push_inst32(compiler, EORI | RD4(TMP_REG2) | RN4(src3) | 0x1f)); /* TMP_REG2 = src3 ^ 0x1f, used as the second shift count. */
+	FAIL_IF(push_inst32(compiler, (is_left ? LSR_W : LSL_W) | RD4(TMP_REG1) | RN4(TMP_REG1) | RM4(TMP_REG2)));
+	return push_inst32(compiler, ORR_W | RD4(dst_reg) | RN4(dst_reg) | RM4(TMP_REG1)); /* Merge the bits taken from src2_reg into dst_reg. */
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -1738,6 +1906,46 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *comp
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_dst(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw)
+{
+	sljit_s32 size, dst_r; /* Both are used only by the SLJIT_GET_RETURN_ADDRESS case. */
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op_dst(compiler, op, dst, dstw));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	switch (op) {
+	case SLJIT_FAST_ENTER:
+		SLJIT_ASSERT(reg_map[TMP_REG2] == 14); /* The moves and stores below assume TMP_REG2 maps to machine register 14. */
+
+		if (FAST_IS_REG(dst))
+			return push_inst16(compiler, MOV | SET_REGS44(dst, TMP_REG2)); /* Register dst: one MOV from TMP_REG2, nothing else is emitted. */
+		break;
+	case SLJIT_GET_RETURN_ADDRESS:
+		size = GET_SAVED_REGISTERS_SIZE(compiler->scratches, compiler->saveds - SLJIT_KEPT_SAVEDS_COUNT(compiler->options), 0); /* Size of the saved integer registers, kept saveds excluded. */
+
+		if (compiler->fsaveds > 0 || compiler->fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) {
+			/* The size of pc is not added above. */
+			if ((size & SSIZE_OF(sw)) == 0) /* Add one word when the SSIZE_OF(sw) bit of size is clear. */
+				size += SSIZE_OF(sw);
+
+			size += GET_SAVED_FLOAT_REGISTERS_SIZE(compiler->fscratches, compiler->fsaveds, f64);
+		}
+
+		SLJIT_ASSERT(((compiler->local_size + size + SSIZE_OF(sw)) & 0x7) == 0); /* local_size + size + one more word must be a multiple of 8. */
+
+		dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2; /* Load directly into a register dst, otherwise stage the value in TMP_REG2. */
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE, dst_r, SLJIT_MEM1(SLJIT_SP), compiler->local_size + size, TMP_REG1)); /* Word load from [SLJIT_SP + local_size + size]. */
+		break;
+	}
+
+	if (dst & SLJIT_MEM) /* Memory dst: store TMP_REG2, which holds the value in both cases (dst_r is TMP_REG2 whenever dst is not a register). */
+		return emit_op_mem(compiler, WORD_SIZE | STORE, TMP_REG2, dst, dstw, TMP_REG1);
+
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
@@ -1972,23 +2180,31 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compil
 	return emit_fop_mem(compiler, (op & SLJIT_32), TMP_FREG1, dst, dstw);
 }
 
-/* --------------------------------------------------------------------- */
-/*  Other instructions                                                   */
-/* --------------------------------------------------------------------- */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg)
 {
+	sljit_s32 reg2;
+	sljit_ins inst;
+
 	CHECK_ERROR();
-	CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
+	CHECK(check_sljit_emit_fcopy(compiler, op, freg, reg));
 
-	SLJIT_ASSERT(reg_map[TMP_REG2] == 14);
+	if (reg & REG_PAIR_MASK) {
+		reg2 = REG_PAIR_SECOND(reg);
+		reg = REG_PAIR_FIRST(reg);
 
-	if (FAST_IS_REG(dst))
-		return push_inst16(compiler, MOV | SET_REGS44(dst, TMP_REG2));
+		inst = VMOV2 | RN4(reg) | RT4(reg2) | DM4(freg);
+	} else {
+		inst = VMOV | DN4(freg) | RT4(reg);
 
-	/* Memory. */
-	return emit_op_mem(compiler, WORD_SIZE | STORE, TMP_REG2, dst, dstw, TMP_REG1);
+		if (!(op & SLJIT_32))
+			inst |= 1 << 7;
+	}
+
+	if (GET_OPCODE(op) == SLJIT_COPY_FROM_F64)
+		inst |= 1 << 20;
+
+	return push_inst32(compiler, inst);
 }
 
 /* --------------------------------------------------------------------- */
@@ -2413,7 +2629,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 		src = TMP_REG1;
 	}
 
-	if ((type & SLJIT_CALL_RETURN) && (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0)) {
+	if ((type & SLJIT_CALL_RETURN) && (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options)))) {
 		FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(TMP_REG1, src)));
 		src = TMP_REG1;
 	}
@@ -2460,6 +2676,33 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 	return sljit_emit_ijump(compiler, type, src, srcw);
 }
 
+#ifdef __SOFTFP__
+
+static SLJIT_INLINE sljit_s32 emit_fmov_before_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
+{
+	if (compiler->options & SLJIT_ENTER_REG_ARG) {
+		if (src == SLJIT_FR0)
+			return SLJIT_SUCCESS;
+
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_fop1(compiler, op, SLJIT_RETURN_FREG, 0, src, srcw);
+	}
+
+	if (FAST_IS_REG(src)) {
+		if (op & SLJIT_32)
+			return push_inst32(compiler, VMOV | (1 << 20) | DN4(src) | RT4(SLJIT_R0));
+		return push_inst32(compiler, VMOV2 | (1 << 20) | DM4(src) | RT4(SLJIT_R0) | RN4(SLJIT_R1));
+	}
+
+	SLJIT_SKIP_CHECKS(compiler);
+
+	if (op & SLJIT_32)
+		return sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, src, srcw);
+	return sljit_emit_mem(compiler, SLJIT_MOV, SLJIT_REG_PAIR(SLJIT_R0, SLJIT_R1), src, srcw);
+}
+
+#endif /* __SOFTFP__ */
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 type)
@@ -2522,9 +2765,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
 
-	dst_reg &= ~SLJIT_32;
-
-	cc = get_cc(compiler, type);
+	cc = get_cc(compiler, type & ~SLJIT_32);
 
 	if (!(src & SLJIT_IMM)) {
 		FAIL_IF(push_inst16(compiler, IT | (cc << 4) | 0x8));
@@ -2567,64 +2808,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 {
 	sljit_s32 flags;
 	sljit_uw imm, tmp;
-	sljit_ins inst;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
 
-	if (!(reg & REG_PAIR_MASK)) {
-		if (type & SLJIT_MEM_UNALIGNED)
-			return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+	if (!(reg & REG_PAIR_MASK))
+		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
 
-		if ((mem & OFFS_REG_MASK) || (memw > 255 || memw < -255))
-			return SLJIT_ERR_UNSUPPORTED;
-
-		if (type & SLJIT_MEM_SUPP)
-			return SLJIT_SUCCESS;
-
-		switch (type & 0xff) {
-		case SLJIT_MOV:
-		case SLJIT_MOV_U32:
-		case SLJIT_MOV_S32:
-		case SLJIT_MOV32:
-		case SLJIT_MOV_P:
-			flags = WORD_SIZE;
-			break;
-		case SLJIT_MOV_U8:
-			flags = BYTE_SIZE;
-			break;
-		case SLJIT_MOV_S8:
-			flags = BYTE_SIZE | SIGNED;
-			break;
-		case SLJIT_MOV_U16:
-			flags = HALF_SIZE;
-			break;
-		case SLJIT_MOV_S16:
-			flags = HALF_SIZE | SIGNED;
-			break;
-		default:
-			SLJIT_UNREACHABLE();
-			flags = WORD_SIZE;
-			break;
-		}
-
-		if (type & SLJIT_MEM_STORE)
-			flags |= STORE;
-
-		inst = sljit_mem32[flags] | 0x900;
-
-		if (type & SLJIT_MEM_PRE)
-			inst |= 0x400;
-
-		if (memw >= 0)
-			inst |= 0x200;
-		else
-			memw = -memw;
-
-		return push_inst32(compiler, inst | RT4(reg) | RN4(mem & REG_MASK) | (sljit_ins)memw);
-	}
-
-	if (type & SLJIT_MEM_UNALIGNED) {
+	if (type & (SLJIT_MEM_UNALIGNED | SLJIT_MEM_UNALIGNED_16 | SLJIT_MEM_UNALIGNED_32)) {
 		if ((mem & REG_MASK) == 0) {
 			if ((memw & 0xfff) >= (0x1000 - SSIZE_OF(sw))) {
 				imm = get_imm((sljit_uw)((memw + 0x1000) & ~0xfff));
@@ -2786,6 +2977,64 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 	return push_inst32(compiler, ((type & SLJIT_MEM_STORE) ? STRD : LDRD) | (sljit_ins)flags | RN4(mem & REG_MASK) | RT4(REG_PAIR_FIRST(reg)) | RD4(REG_PAIR_SECOND(reg)) | (sljit_ins)memw);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_s32 flags;
+	sljit_ins inst;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem_update(compiler, type, reg, mem, memw));
+
+	if ((mem & OFFS_REG_MASK) || (memw > 255 || memw < -255))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_MEM_SUPP)
+		return SLJIT_SUCCESS;
+
+	switch (type & 0xff) {
+	case SLJIT_MOV:
+	case SLJIT_MOV_U32:
+	case SLJIT_MOV_S32:
+	case SLJIT_MOV32:
+	case SLJIT_MOV_P:
+		flags = WORD_SIZE;
+		break;
+	case SLJIT_MOV_U8:
+		flags = BYTE_SIZE;
+		break;
+	case SLJIT_MOV_S8:
+		flags = BYTE_SIZE | SIGNED;
+		break;
+	case SLJIT_MOV_U16:
+		flags = HALF_SIZE;
+		break;
+	case SLJIT_MOV_S16:
+		flags = HALF_SIZE | SIGNED;
+		break;
+	default:
+		SLJIT_UNREACHABLE();
+		flags = WORD_SIZE;
+		break;
+	}
+
+	if (type & SLJIT_MEM_STORE)
+		flags |= STORE;
+
+	inst = sljit_mem32[flags] | 0x900;
+
+	if (!(type & SLJIT_MEM_POST))
+		inst |= 0x400;
+
+	if (memw >= 0)
+		inst |= 0x200;
+	else
+		memw = -memw;
+
+	return push_inst32(compiler, inst | RT4(reg) | RN4(mem & REG_MASK) | (sljit_ins)memw);
+}
+
 static sljit_s32 update_mem_addr(struct sljit_compiler *compiler, sljit_s32 *mem, sljit_sw *memw, sljit_s32 max_offset)
 {
 	sljit_s32 arg = *mem;
@@ -2854,10 +3103,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compil
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
 
-	if (type & (SLJIT_MEM_PRE | SLJIT_MEM_POST))
-		return SLJIT_ERR_UNSUPPORTED;
-
-	if (type & SLJIT_MEM_ALIGNED_32)
+	if (type & SLJIT_MEM_UNALIGNED_32)
 		return emit_fop_mem(compiler, ((type ^ SLJIT_32) & SLJIT_32) | ((type & SLJIT_MEM_STORE) ? 0 : FPU_LOAD), freg, mem, memw);
 
 	if (type & SLJIT_MEM_STORE) {
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeMIPS_32.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeMIPS_32.c
index ca9dbd0a53..1691905db7 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeMIPS_32.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeMIPS_32.c
@@ -44,6 +44,35 @@ static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_
 	return push_inst(compiler, ORI | S(dst) | T(dst) | IMM(init_value), DR(dst));
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg)
+{
+	sljit_s32 reg2;
+	sljit_ins inst;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fcopy(compiler, op, freg, reg));
+
+	if (reg & REG_PAIR_MASK) {
+		reg2 = REG_PAIR_SECOND(reg);
+		reg = REG_PAIR_FIRST(reg);
+
+		inst = T(reg2) | FS(freg) | (1 << 11);
+
+		if (op == SLJIT_COPY_TO_F64)
+			FAIL_IF(push_inst(compiler, MTC1 | inst, MOVABLE_INS));
+		else
+			FAIL_IF(push_inst(compiler, MFC1 | inst, DR(reg2)));
+	}
+
+	inst = T(reg) | FS(freg);
+
+	if (GET_OPCODE(op) == SLJIT_COPY_TO_F64)
+		return push_inst(compiler, MTC1 | inst, MOVABLE_INS);
+
+	return push_inst(compiler, MFC1 | inst, DR(reg));
+}
+
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
 {
 	sljit_ins *inst = (sljit_ins *)addr;
@@ -267,7 +296,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 
 	if ((type & 0xff) == SLJIT_CALL_REG_ARG) {
 		if (type & SLJIT_CALL_RETURN) {
-			if (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0) {
+			if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
 				FAIL_IF(push_inst(compiler, ADDU | S(src) | TA(0) | D(PIC_ADDR_REG), DR(PIC_ADDR_REG)));
 				src = PIC_ADDR_REG;
 				srcw = 0;
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeMIPS_64.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeMIPS_64.c
index 443bade802..a29fe0730d 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeMIPS_64.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeMIPS_64.c
@@ -128,6 +128,22 @@ static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_
 	return push_inst(compiler, ORI | S(dst) | T(dst) | IMM(init_value), DR(dst));
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg)
+{
+	sljit_ins inst;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fcopy(compiler, op, freg, reg));
+
+	inst = T(reg) | FS(freg);
+
+	if (GET_OPCODE(op) == SLJIT_COPY_TO_F64)
+		return push_inst(compiler, ((op & SLJIT_32) ? MTC1 : DMTC1) | inst, MOVABLE_INS);
+
+	return push_inst(compiler, ((op & SLJIT_32) ? MFC1 : DMFC1) | inst, DR(reg));
+}
+
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
 {
 	sljit_ins *inst = (sljit_ins *)addr;
@@ -282,7 +298,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 
 	if ((type & 0xff) == SLJIT_CALL_REG_ARG) {
 		if (type & SLJIT_CALL_RETURN) {
-			if (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0) {
+			if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
 				FAIL_IF(push_inst(compiler, DADDU | S(src) | TA(0) | D(PIC_ADDR_REG), DR(PIC_ADDR_REG)));
 				src = PIC_ADDR_REG;
 				srcw = 0;
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeMIPS_common.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeMIPS_common.c
index 928e111ac5..2b00d4f16d 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeMIPS_common.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeMIPS_common.c
@@ -42,6 +42,14 @@ SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
 	return "MIPS64-R6" SLJIT_CPUINFO;
 #endif /* SLJIT_CONFIG_MIPS_32 */
 
+#elif (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
+
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	return "MIPS32-R2" SLJIT_CPUINFO;
+#else /* !SLJIT_CONFIG_MIPS_32 */
+	return "MIPS64-R2" SLJIT_CPUINFO;
+#endif /* SLJIT_CONFIG_MIPS_32 */
+
 #elif (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -193,6 +201,11 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 4] = {
 #endif /* SLJIT_MIPS_REV >= 6 */
 #define DIV_S		(HI(17) | FMT_S | LO(3))
 #define DINSU		(HI(31) | LO(6))
+#define DMFC1		(HI(17) | (1 << 21) | LO(0))
+#define DMTC1		(HI(17) | (5 << 21) | LO(0))
+#define DROTR		(HI(0) | (1 << 21) | LO(58))
+#define DROTR32		(HI(0) | (1 << 21) | LO(62))
+#define DROTRV		(HI(0) | (1 << 6) | LO(22))
 #define DSLL		(HI(0) | LO(56))
 #define DSLL32		(HI(0) | LO(60))
 #define DSLLV		(HI(0) | LO(20))
@@ -220,7 +233,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 4] = {
 #define LWL		(HI(34))
 #define LWR		(HI(38))
 #define LWC1		(HI(49))
-#define MFC1		(HI(17))
+#define MFC1		(HI(17) | (0 << 21))
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
 #define MOD		(HI(0) | (3 << 6) | LO(26))
 #define MODU		(HI(0) | (3 << 6) | LO(27))
@@ -245,6 +258,8 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 4] = {
 #define NOR		(HI(0) | LO(39))
 #define OR		(HI(0) | LO(37))
 #define ORI		(HI(13))
+#define ROTR		(HI(0) | (1 << 21) | LO(2))
+#define ROTRV		(HI(0) | (1 << 6) | LO(6))
 #define SD		(HI(63))
 #define SDL		(HI(44))
 #define SDR		(HI(45))
@@ -293,12 +308,16 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 4] = {
 #define SLL_W		SLL
 #define SRA_W		SRA
 #define SUBU_W		SUBU
+#define STORE_W		SW
+#define LOAD_W		LW
 #else
 #define ADDU_W		DADDU
 #define ADDIU_W		DADDIU
 #define SLL_W		DSLL
 #define SRA_W		DSRA
 #define SUBU_W		DSUBU
+#define STORE_W		SD
+#define LOAD_W		LD
 #endif
 
 #define SIMM_MAX	(0x7fff)
@@ -713,15 +732,22 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
 #error "FIR check is not implemented for this architecture"
 #endif
 	case SLJIT_HAS_ZERO_REGISTER:
+	case SLJIT_HAS_COPY_F32:
+	case SLJIT_HAS_COPY_F64:
 		return 1;
-
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
 	case SLJIT_HAS_CLZ:
 	case SLJIT_HAS_CMOV:
 	case SLJIT_HAS_PREFETCH:
 		return 1;
-#endif /* SLJIT_MIPS_REV >= 1 */
 
+	case SLJIT_HAS_CTZ:
+		return 2;
+#endif /* SLJIT_MIPS_REV >= 1 */
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
+	case SLJIT_HAS_ROT:
+		return 1;
+#endif /* SLJIT_MIPS_REV >= 2 */
 	default:
 		return 0;
 	}
@@ -766,14 +792,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type)
 #define SLOW_SRC2	0x20000
 #define SLOW_DEST	0x40000
 
-#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-#define STACK_STORE	SW
-#define STACK_LOAD	LW
-#else
-#define STACK_STORE	SD
-#define STACK_LOAD	LD
-#endif
-
 static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg_ar, sljit_s32 arg, sljit_sw argw);
 static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 frame_size, sljit_ins *ins_ptr);
 
@@ -801,12 +819,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
 	if (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) {
 		if ((local_size & SSIZE_OF(sw)) != 0)
 			local_size += SSIZE_OF(sw);
-		local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+		local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64);
 	}
 
 	local_size = (local_size + SLJIT_LOCALS_OFFSET + 15) & ~0xf;
 #else
-	local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+	local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64);
 	local_size = (local_size + SLJIT_LOCALS_OFFSET + 31) & ~0x1f;
 #endif
 	compiler->local_size = local_size;
@@ -850,17 +868,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
 #endif
 	}
 
-	FAIL_IF(push_inst(compiler, STACK_STORE | base | TA(RETURN_ADDR_REG) | IMM(offset), MOVABLE_INS));
+	FAIL_IF(push_inst(compiler, STORE_W | base | TA(RETURN_ADDR_REG) | IMM(offset), UNMOVABLE_INS));
 
 	tmp = SLJIT_S0 - saveds;
 	for (i = SLJIT_S0 - saved_arg_count; i > tmp; i--) {
 		offset -= SSIZE_OF(sw);
-		FAIL_IF(push_inst(compiler, STACK_STORE | base | T(i) | IMM(offset), MOVABLE_INS));
+		FAIL_IF(push_inst(compiler, STORE_W | base | T(i) | IMM(offset), MOVABLE_INS));
 	}
 
 	for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
 		offset -= SSIZE_OF(sw);
-		FAIL_IF(push_inst(compiler, STACK_STORE | base | T(i) | IMM(offset), MOVABLE_INS));
+		FAIL_IF(push_inst(compiler, STORE_W | base | T(i) | IMM(offset), MOVABLE_INS));
 	}
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -997,12 +1015,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 	if (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) {
 		if ((local_size & SSIZE_OF(sw)) != 0)
 			local_size += SSIZE_OF(sw);
-		local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+		local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64);
 	}
 
 	compiler->local_size = (local_size + SLJIT_LOCALS_OFFSET + 15) & ~0xf;
 #else
-	local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+	local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64);
 	compiler->local_size = (local_size + SLJIT_LOCALS_OFFSET + 31) & ~0x1f;
 #endif
 	return SLJIT_SUCCESS;
@@ -1011,12 +1029,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 frame_size, sljit_ins *ins_ptr)
 {
 	sljit_s32 local_size, i, tmp, offset;
+	sljit_s32 load_return_addr = (frame_size == 0);
 	sljit_s32 scratches = compiler->scratches;
 	sljit_s32 saveds = compiler->saveds;
 	sljit_s32 fsaveds = compiler->fsaveds;
 	sljit_s32 fscratches = compiler->fscratches;
 	sljit_s32 kept_saveds_count = SLJIT_KEPT_SAVEDS_COUNT(compiler->options);
 
+	SLJIT_ASSERT(frame_size == 1 || (frame_size & 0xf) == 0);
+	frame_size &= ~0xf;
+
 	local_size = compiler->local_size;
 
 	tmp = GET_SAVED_REGISTERS_SIZE(scratches, saveds - kept_saveds_count, 1);
@@ -1024,10 +1046,10 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit
 	if (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) {
 		if ((tmp & SSIZE_OF(sw)) != 0)
 			tmp += SSIZE_OF(sw);
-		tmp += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+		tmp += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64);
 	}
 #else
-	tmp += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+	tmp += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64);
 #endif
 
 	if (local_size <= SIMM_MAX) {
@@ -1047,18 +1069,18 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit
 	SLJIT_ASSERT(local_size >= frame_size);
 
 	offset = local_size - SSIZE_OF(sw);
-	if (frame_size == 0)
-		FAIL_IF(push_inst(compiler, STACK_LOAD | S(SLJIT_SP) | TA(RETURN_ADDR_REG) | IMM(offset), RETURN_ADDR_REG));
+	if (load_return_addr)
+		FAIL_IF(push_inst(compiler, LOAD_W | S(SLJIT_SP) | TA(RETURN_ADDR_REG) | IMM(offset), RETURN_ADDR_REG));
 
 	tmp = SLJIT_S0 - saveds;
 	for (i = SLJIT_S0 - kept_saveds_count; i > tmp; i--) {
 		offset -= SSIZE_OF(sw);
-		FAIL_IF(push_inst(compiler, STACK_LOAD | S(SLJIT_SP) | T(i) | IMM(offset), MOVABLE_INS));
+		FAIL_IF(push_inst(compiler, LOAD_W | S(SLJIT_SP) | T(i) | IMM(offset), MOVABLE_INS));
 	}
 
 	for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
 		offset -= SSIZE_OF(sw);
-		FAIL_IF(push_inst(compiler, STACK_LOAD | S(SLJIT_SP) | T(i) | IMM(offset), MOVABLE_INS));
+		FAIL_IF(push_inst(compiler, LOAD_W | S(SLJIT_SP) | T(i) | IMM(offset), MOVABLE_INS));
 	}
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -1099,8 +1121,38 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler
 	return push_inst(compiler, ins, UNMOVABLE_INS);
 }
 
-#undef STACK_STORE
-#undef STACK_LOAD
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_ins ins;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+
+	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, DR(PIC_ADDR_REG), src, srcw));
+		src = PIC_ADDR_REG;
+		srcw = 0;
+	} else if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+		FAIL_IF(push_inst(compiler, ADDU_W | S(src) | TA(0) | D(PIC_ADDR_REG), DR(PIC_ADDR_REG)));
+		src = PIC_ADDR_REG;
+		srcw = 0;
+	}
+
+	FAIL_IF(emit_stack_frame_release(compiler, 1, &ins));
+
+	if (!(src & SLJIT_IMM)) {
+		FAIL_IF(push_inst(compiler, JR | S(src), UNMOVABLE_INS));
+		return push_inst(compiler, ins, UNMOVABLE_INS);
+	}
+
+	if (ins != NOP)
+		FAIL_IF(push_inst(compiler, ins, MOVABLE_INS));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
 
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
@@ -1302,7 +1354,7 @@ static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, s
 
 		if (SLJIT_UNLIKELY(argw)) {
 			FAIL_IF(push_inst(compiler, SLL_W | T(OFFS_REG(arg)) | DA(tmp_ar) | SH_IMM(argw), tmp_ar));
-			FAIL_IF(push_inst(compiler, ADDU_W | S(base) | TA(tmp_ar) | DA(tmp_ar), tmp_ar));
+			FAIL_IF(push_inst(compiler, ADDU_W | SA(tmp_ar) | T(base) | DA(tmp_ar), tmp_ar));
 		}
 		else
 			FAIL_IF(push_inst(compiler, ADDU_W | S(base) | T(OFFS_REG(arg)) | DA(tmp_ar), tmp_ar));
@@ -1312,7 +1364,7 @@ static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, s
 	FAIL_IF(load_immediate(compiler, tmp_ar, TO_ARGW_HI(argw)));
 
 	if (base != 0)
-		FAIL_IF(push_inst(compiler, ADDU_W | S(base) | TA(tmp_ar) | DA(tmp_ar), tmp_ar));
+		FAIL_IF(push_inst(compiler, ADDU_W | SA(tmp_ar) | T(base) | DA(tmp_ar), tmp_ar));
 
 	return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | SA(tmp_ar) | TA(reg_ar) | IMM(argw), delay_slot);
 }
@@ -1342,55 +1394,124 @@ static SLJIT_INLINE sljit_s32 emit_op_mem2(struct sljit_compiler *compiler, slji
 
 #define SELECT_OP(a, b) (b)
 
-#define EMIT_SHIFT(op_dimm, op_dimm32, op_imm, op_dv, op_v) \
-	if (flags & SRC2_IMM) { \
-		if (op & SLJIT_SET_Z) \
-			FAIL_IF(push_inst(compiler, op_imm | T(src1) | DA(EQUAL_FLAG) | SH_IMM(src2), EQUAL_FLAG)); \
-		if (!(flags & UNUSED_DEST)) \
-			FAIL_IF(push_inst(compiler, op_imm | T(src1) | D(dst) | SH_IMM(src2), DR(dst))); \
-	} \
-	else { \
-		if (op & SLJIT_SET_Z) \
-			FAIL_IF(push_inst(compiler, op_v | S(src2) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG)); \
-		if (!(flags & UNUSED_DEST)) \
-			FAIL_IF(push_inst(compiler, op_v | S(src2) | T(src1) | D(dst), DR(dst))); \
-	}
+#define EMIT_SHIFT(dimm, dimm32, imm, dv, v) \
+	op_imm = (imm); \
+	op_v = (v);
 
 #else /* !SLJIT_CONFIG_MIPS_32 */
 
 #define SELECT_OP(a, b) \
 	(!(op & SLJIT_32) ? a : b)
 
-#define EMIT_SHIFT(op_dimm, op_dimm32, op_imm, op_dv, op_v) \
-	if (flags & SRC2_IMM) { \
-		if (src2 >= 32) { \
-			SLJIT_ASSERT(!(op & SLJIT_32)); \
-			ins = op_dimm32; \
-			src2 -= 32; \
-		} \
-		else \
-			ins = (op & SLJIT_32) ? op_imm : op_dimm; \
-		if (op & SLJIT_SET_Z) \
-			FAIL_IF(push_inst(compiler, ins | T(src1) | DA(EQUAL_FLAG) | SH_IMM(src2), EQUAL_FLAG)); \
-		if (!(flags & UNUSED_DEST)) \
-			FAIL_IF(push_inst(compiler, ins | T(src1) | D(dst) | SH_IMM(src2), DR(dst))); \
-	} \
-	else { \
-		ins = (op & SLJIT_32) ? op_v : op_dv; \
-		if (op & SLJIT_SET_Z) \
-			FAIL_IF(push_inst(compiler, ins | S(src2) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG)); \
-		if (!(flags & UNUSED_DEST)) \
-			FAIL_IF(push_inst(compiler, ins | S(src2) | T(src1) | D(dst), DR(dst))); \
-	}
+#define EMIT_SHIFT(dimm, dimm32, imm, dv, v) \
+	op_dimm = (dimm); \
+	op_dimm32 = (dimm32); \
+	op_imm = (imm); \
+	op_dv = (dv); \
+	op_v = (v);
 
 #endif /* SLJIT_CONFIG_MIPS_32 */
 
+#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV < 1)
+
+static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw src)
+{
+	sljit_s32 is_clz = (GET_OPCODE(op) == SLJIT_CLZ);
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+	sljit_ins max = (op & SLJIT_32) ? 32 : 64;
+#else /* !SLJIT_CONFIG_MIPS_64 */
+	sljit_ins max = 32;
+#endif /* SLJIT_CONFIG_MIPS_64 */
+
+	/* The TMP_REG2 is the next value. */
+	if (src != TMP_REG2)
+		FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src) | TA(0) | D(TMP_REG2), DR(TMP_REG2)));
+
+	FAIL_IF(push_inst(compiler, BEQ | S(TMP_REG2) | TA(0) | IMM(is_clz ? 13 : 14), UNMOVABLE_INS));
+	/* The OTHER_FLAG is the counter. Delay slot. */
+	FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | SA(0) | TA(OTHER_FLAG) | IMM(max), OTHER_FLAG));
+
+	if (!is_clz) {
+		FAIL_IF(push_inst(compiler, ANDI | S(TMP_REG2) | T(TMP_REG1) | IMM(1), DR(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, BNE | S(TMP_REG1) | TA(0) | IMM(11), UNMOVABLE_INS));
+	} else
+		FAIL_IF(push_inst(compiler, BLTZ | S(TMP_REG2) | TA(0) | IMM(11), UNMOVABLE_INS));
+
+	/* Delay slot. */
+	FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | SA(0) | TA(OTHER_FLAG) | IMM(0), OTHER_FLAG));
+
+	/* The TMP_REG1 is the next shift. */
+	FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | SA(0) | T(TMP_REG1) | IMM(max), DR(TMP_REG1)));
+
+	FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(TMP_REG2) | TA(0) | DA(EQUAL_FLAG), EQUAL_FLAG));
+	FAIL_IF(push_inst(compiler, SELECT_OP(DSRL, SRL) | T(TMP_REG1) | D(TMP_REG1) | SH_IMM(1), DR(TMP_REG1)));
+
+	FAIL_IF(push_inst(compiler, (is_clz ? SELECT_OP(DSRLV, SRLV) : SELECT_OP(DSLLV, SLLV)) | S(TMP_REG1) | TA(EQUAL_FLAG) | D(TMP_REG2), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, BNE | S(TMP_REG2) | TA(0) | IMM(-4), UNMOVABLE_INS));
+	/* Delay slot. */
+	FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+
+	FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(TMP_REG1) | T(TMP_REG2) | IMM(-1), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, (is_clz ? SELECT_OP(DSRLV, SRLV) : SELECT_OP(DSLLV, SLLV)) | S(TMP_REG2) | TA(EQUAL_FLAG) | D(TMP_REG2), DR(TMP_REG2)));
+
+	FAIL_IF(push_inst(compiler, BEQ | S(TMP_REG2) | TA(0) | IMM(-7), UNMOVABLE_INS));
+	/* Delay slot. */
+	FAIL_IF(push_inst(compiler, OR | SA(OTHER_FLAG) | T(TMP_REG1) | DA(OTHER_FLAG), OTHER_FLAG));
+
+	return push_inst(compiler, SELECT_OP(DADDU, ADDU) | SA(OTHER_FLAG) | TA(0) | D(dst), DR(dst));
+}
+
+#endif /* SLJIT_MIPS_REV < 1 */
+
+static sljit_s32 emit_rev(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw src)
+{
+	SLJIT_UNUSED_ARG(op);
+
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+	if (!(op & SLJIT_32)) {
+		FAIL_IF(push_inst(compiler, DSRL32 | T(src) | D(TMP_REG1) | SH_IMM(0), DR(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, ORI | SA(0) | TA(OTHER_FLAG) | 0xffff, OTHER_FLAG));
+		FAIL_IF(push_inst(compiler, DSLL32 | T(src) | D(dst) | SH_IMM(0), DR(dst)));
+		FAIL_IF(push_inst(compiler, DSLL32 | TA(OTHER_FLAG) | DA(OTHER_FLAG) | SH_IMM(0), OTHER_FLAG));
+		FAIL_IF(push_inst(compiler, OR | S(dst) | T(TMP_REG1) | D(dst), DR(dst)));
+		FAIL_IF(push_inst(compiler, ORI | SA(OTHER_FLAG) | TA(OTHER_FLAG) | 0xffff, OTHER_FLAG));
+
+		FAIL_IF(push_inst(compiler, DSRL | T(dst) | D(TMP_REG1) | SH_IMM(16), DR(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, AND | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst)));
+		FAIL_IF(push_inst(compiler, AND | S(TMP_REG1) | TA(OTHER_FLAG) | D(TMP_REG1), DR(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, DSLL | TA(OTHER_FLAG) | DA(EQUAL_FLAG) | SH_IMM(8), EQUAL_FLAG));
+		FAIL_IF(push_inst(compiler, DSLL | T(dst) | D(dst) | SH_IMM(16), DR(dst)));
+		FAIL_IF(push_inst(compiler, XOR | SA(OTHER_FLAG) | TA(EQUAL_FLAG) | DA(OTHER_FLAG), OTHER_FLAG));
+		FAIL_IF(push_inst(compiler, OR | S(dst) | T(TMP_REG1) | D(dst), DR(dst)));
+
+		FAIL_IF(push_inst(compiler, DSRL | T(dst) | D(TMP_REG1) | SH_IMM(8), DR(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, AND | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst)));
+		FAIL_IF(push_inst(compiler, AND | S(TMP_REG1) | TA(OTHER_FLAG) | D(TMP_REG1), DR(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, DSLL | T(dst) | D(dst) | SH_IMM(8), DR(dst)));
+		return push_inst(compiler, OR | S(dst) | T(TMP_REG1) | D(dst), DR(dst));
+	}
+#endif /* SLJIT_CONFIG_MIPS_64 */
+
+	FAIL_IF(push_inst(compiler, SRL | T(src) | D(TMP_REG1) | SH_IMM(16), DR(TMP_REG1)));
+	FAIL_IF(push_inst(compiler, LUI | TA(OTHER_FLAG) | 0xff, OTHER_FLAG));
+	FAIL_IF(push_inst(compiler, SLL | T(src) | D(dst) | SH_IMM(16), DR(dst)));
+	FAIL_IF(push_inst(compiler, ORI | SA(OTHER_FLAG) | TA(OTHER_FLAG) | 0xff, OTHER_FLAG));
+	FAIL_IF(push_inst(compiler, OR | S(dst) | T(TMP_REG1) | D(dst), DR(dst)));
+
+	FAIL_IF(push_inst(compiler, SRL | T(dst) | D(TMP_REG1) | SH_IMM(8), DR(TMP_REG1)));
+	FAIL_IF(push_inst(compiler, AND | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst)));
+	FAIL_IF(push_inst(compiler, AND | S(TMP_REG1) | TA(OTHER_FLAG) | D(TMP_REG1), DR(TMP_REG1)));
+	FAIL_IF(push_inst(compiler, SLL | T(dst) | D(dst) | SH_IMM(8), DR(dst)));
+	return push_inst(compiler, OR | S(dst) | T(TMP_REG1) | D(dst), DR(dst));
+}
+
 static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
 	sljit_s32 dst, sljit_s32 src1, sljit_sw src2)
 {
 	sljit_s32 is_overflow, is_carry, carry_src_ar, is_handled;
+	sljit_ins op_imm, op_v;
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
-	sljit_ins ins;
+	sljit_ins ins, op_dimm, op_dimm32, op_dv;
 #endif
 
 	switch (GET_OPCODE(op)) {
@@ -1481,43 +1602,41 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		return SLJIT_SUCCESS;
 #endif /* SLJIT_CONFIG_MIPS_64 */
 
-	case SLJIT_NOT:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if (op & SLJIT_SET_Z)
-			FAIL_IF(push_inst(compiler, NOR | S(src2) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-		if (!(flags & UNUSED_DEST))
-			FAIL_IF(push_inst(compiler, NOR | S(src2) | T(src2) | D(dst), DR(dst)));
-		return SLJIT_SUCCESS;
-
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
 	case SLJIT_CLZ:
 		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
-		if (op & SLJIT_SET_Z)
-			FAIL_IF(push_inst(compiler, SELECT_OP(DCLZ, CLZ) | S(src2) | TA(EQUAL_FLAG) | DA(EQUAL_FLAG), EQUAL_FLAG));
-		if (!(flags & UNUSED_DEST))
-			FAIL_IF(push_inst(compiler, SELECT_OP(DCLZ, CLZ) | S(src2) | T(dst) | D(dst), DR(dst)));
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+		return push_inst(compiler, SELECT_OP(DCLZ, CLZ) | S(src2) | D(dst), DR(dst));
+#else /* SLJIT_MIPS_REV < 6 */
+		return push_inst(compiler, SELECT_OP(DCLZ, CLZ) | S(src2) | T(dst) | D(dst), DR(dst));
+#endif /* SLJIT_MIPS_REV >= 6 */
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | SA(0) | T(src2) | D(TMP_REG1), DR(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, AND | S(src2) | T(TMP_REG1) | D(dst), DR(dst)));
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+		FAIL_IF(push_inst(compiler, SELECT_OP(DCLZ, CLZ) | S(dst) | D(dst), DR(dst)));
+#else /* SLJIT_MIPS_REV < 6 */
+		FAIL_IF(push_inst(compiler, SELECT_OP(DCLZ, CLZ) | S(dst) | T(dst) | D(dst), DR(dst)));
+#endif /* SLJIT_MIPS_REV >= 6 */
+		FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(dst) | T(TMP_REG1) | IMM(SELECT_OP(-64, -32)), DR(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, SELECT_OP(DSRL32, SRL) | T(TMP_REG1) | D(TMP_REG1) | SH_IMM(SELECT_OP(26, 27)), DR(TMP_REG1)));
+		return push_inst(compiler, XOR | S(dst) | T(TMP_REG1) | D(dst), DR(dst));
 #else /* SLJIT_MIPS_REV < 1 */
-		/* Nearly all instructions are unmovable in the following sequence. */
-		FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src2) | TA(0) | D(TMP_REG1), DR(TMP_REG1)));
-		/* Check zero. */
-		FAIL_IF(push_inst(compiler, BEQ | S(TMP_REG1) | TA(0) | IMM(5), UNMOVABLE_INS));
-#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-		FAIL_IF(push_inst(compiler, ORI | SA(0) | T(dst) | IMM(32), UNMOVABLE_INS));
-#else /* !SLJIT_CONFIG_MIPS_32 */
-		FAIL_IF(push_inst(compiler, ORI | SA(0) | T(dst) | IMM((op & SLJIT_32) ? 32 : 64), UNMOVABLE_INS));
-#endif /* SLJIT_CONFIG_MIPS_32 */
-		FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | SA(0) | T(dst) | IMM(-1), DR(dst)));
-		/* Loop for searching the highest bit. */
-		FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(dst) | T(dst) | IMM(1), DR(dst)));
-		FAIL_IF(push_inst(compiler, BGEZ | S(TMP_REG1) | IMM(-2), UNMOVABLE_INS));
-		FAIL_IF(push_inst(compiler, SELECT_OP(DSLL, SLL) | T(TMP_REG1) | D(TMP_REG1) | SH_IMM(1), UNMOVABLE_INS));
+	case SLJIT_CLZ:
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		return emit_clz_ctz(compiler, op, dst, src2);
 #endif /* SLJIT_MIPS_REV >= 1 */
-		return SLJIT_SUCCESS;
+
+	case SLJIT_REV:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		return emit_rev(compiler, op, dst, src2);
 
 	case SLJIT_ADD:
 		/* Overflow computation (both add and sub): overflow = src1_sign ^ src2_sign ^ result_sign ^ carry_flag */
 		is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
-		carry_src_ar = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+		carry_src_ar = GET_FLAG_TYPE(op) == SLJIT_CARRY;
 
 		if (flags & SRC2_IMM) {
 			if (is_overflow) {
@@ -1573,7 +1692,7 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		return push_inst(compiler, XOR | S(TMP_REG1) | TA(OTHER_FLAG) | DA(OTHER_FLAG), OTHER_FLAG);
 
 	case SLJIT_ADDC:
-		carry_src_ar = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+		carry_src_ar = GET_FLAG_TYPE(op) == SLJIT_CARRY;
 
 		if (flags & SRC2_IMM) {
 			FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | T(dst) | IMM(src2), DR(dst)));
@@ -1620,11 +1739,11 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		is_handled = 0;
 
 		if (flags & SRC2_IMM) {
-			if (GET_FLAG_TYPE(op) == SLJIT_LESS || GET_FLAG_TYPE(op) == SLJIT_GREATER_EQUAL) {
+			if (GET_FLAG_TYPE(op) == SLJIT_LESS) {
 				FAIL_IF(push_inst(compiler, SLTIU | S(src1) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
 				is_handled = 1;
 			}
-			else if (GET_FLAG_TYPE(op) == SLJIT_SIG_LESS || GET_FLAG_TYPE(op) == SLJIT_SIG_GREATER_EQUAL) {
+			else if (GET_FLAG_TYPE(op) == SLJIT_SIG_LESS) {
 				FAIL_IF(push_inst(compiler, SLTI | S(src1) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
 				is_handled = 1;
 			}
@@ -1641,19 +1760,15 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 
 			switch (GET_FLAG_TYPE(op)) {
 			case SLJIT_LESS:
-			case SLJIT_GREATER_EQUAL:
 				FAIL_IF(push_inst(compiler, SLTU | S(src1) | T(src2) | DA(OTHER_FLAG), OTHER_FLAG));
 				break;
 			case SLJIT_GREATER:
-			case SLJIT_LESS_EQUAL:
 				FAIL_IF(push_inst(compiler, SLTU | S(src2) | T(src1) | DA(OTHER_FLAG), OTHER_FLAG));
 				break;
 			case SLJIT_SIG_LESS:
-			case SLJIT_SIG_GREATER_EQUAL:
 				FAIL_IF(push_inst(compiler, SLT | S(src1) | T(src2) | DA(OTHER_FLAG), OTHER_FLAG));
 				break;
 			case SLJIT_SIG_GREATER:
-			case SLJIT_SIG_LESS_EQUAL:
 				FAIL_IF(push_inst(compiler, SLT | S(src2) | T(src1) | DA(OTHER_FLAG), OTHER_FLAG));
 				break;
 			}
@@ -1676,7 +1791,7 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		}
 
 		is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
-		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+		is_carry = GET_FLAG_TYPE(op) == SLJIT_CARRY;
 
 		if (flags & SRC2_IMM) {
 			if (is_overflow) {
@@ -1725,7 +1840,7 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 			flags &= ~SRC2_IMM;
 		}
 
-		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+		is_carry = GET_FLAG_TYPE(op) == SLJIT_CARRY;
 
 		if (flags & SRC2_IMM) {
 			if (is_carry)
@@ -1791,24 +1906,155 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		return SLJIT_SUCCESS;
 
 	case SLJIT_XOR:
+		if (!(flags & LOGICAL_OP)) {
+			SLJIT_ASSERT((flags & SRC2_IMM) && src2 == -1);
+			if (op & SLJIT_SET_Z)
+				FAIL_IF(push_inst(compiler, NOR | S(src1) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
+			if (!(flags & UNUSED_DEST))
+				FAIL_IF(push_inst(compiler, NOR | S(src1) | T(src1) | D(dst), DR(dst)));
+			return SLJIT_SUCCESS;
+		}
 		EMIT_LOGICAL(XORI, XOR);
 		return SLJIT_SUCCESS;
 
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 		EMIT_SHIFT(DSLL, DSLL32, SLL, DSLLV, SLLV);
-		return SLJIT_SUCCESS;
+		break;
 
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 		EMIT_SHIFT(DSRL, DSRL32, SRL, DSRLV, SRLV);
-		return SLJIT_SUCCESS;
+		break;
 
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
 		EMIT_SHIFT(DSRA, DSRA32, SRA, DSRAV, SRAV);
+		break;
+
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
+	case SLJIT_ROTL:
+		if ((flags & SRC2_IMM) || src2 == 0) {
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+			src2 = -src2 & 0x1f;
+#else /* !SLJIT_CONFIG_MIPS_32 */
+			src2 = -src2 & ((op & SLJIT_32) ? 0x1f : 0x3f);
+#endif /* SLJIT_CONFIG_MIPS_32 */
+		} else {
+			FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | SA(0) | T(src2) | D(TMP_REG2), DR(TMP_REG2)));
+			src2 = TMP_REG2;
+		}
+		/* fallthrough */
+
+	case SLJIT_ROTR:
+		EMIT_SHIFT(DROTR, DROTR32, ROTR, DROTRV, ROTRV);
+		break;
+#else /* SLJIT_MIPS_REV < 2 */
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
+		if (flags & SRC2_IMM) {
+			SLJIT_ASSERT(src2 != 0);
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+			if (!(op & SLJIT_32)) {
+				if (GET_OPCODE(op) == SLJIT_ROTL)
+					op_imm = ((src2 < 32) ? DSLL : DSLL32);
+				else
+					op_imm = ((src2 < 32) ? DSRL : DSRL32);
+
+				FAIL_IF(push_inst(compiler, op_imm | T(src1) | DA(OTHER_FLAG) | (((sljit_ins)src2 & 0x1f) << 6), OTHER_FLAG));
+
+				src2 = 64 - src2;
+				if (GET_OPCODE(op) == SLJIT_ROTL)
+					op_imm = ((src2 < 32) ? DSRL : DSRL32);
+				else
+					op_imm = ((src2 < 32) ? DSLL : DSLL32);
+
+				FAIL_IF(push_inst(compiler, op_imm | T(src1) | D(dst) | (((sljit_ins)src2 & 0x1f) << 6), DR(dst)));
+				return push_inst(compiler, OR | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst));
+			}
+#endif /* SLJIT_CONFIG_MIPS_64 */
+
+			op_imm = (GET_OPCODE(op) == SLJIT_ROTL) ? SLL : SRL;
+			FAIL_IF(push_inst(compiler, op_imm | T(src1) | DA(OTHER_FLAG) | ((sljit_ins)src2 << 6), OTHER_FLAG));
+
+			src2 = 32 - src2;
+			op_imm = (GET_OPCODE(op) == SLJIT_ROTL) ? SRL : SLL;
+			FAIL_IF(push_inst(compiler, op_imm | T(src1) | D(dst) | (((sljit_ins)src2 & 0x1f) << 6), DR(dst)));
+			return push_inst(compiler, OR | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst));
+		}
+
+		if (src2 == 0) {
+			if (dst != src1)
+				return push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src1) | TA(0) | D(dst), DR(dst));
+			return SLJIT_SUCCESS;
+		}
+
+		FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | SA(0) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
+
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+		if (!(op & SLJIT_32)) {
+			op_v = (GET_OPCODE(op) == SLJIT_ROTL) ? DSLLV : DSRLV;
+			FAIL_IF(push_inst(compiler, op_v | S(src2) | T(src1) | DA(OTHER_FLAG), OTHER_FLAG));
+			op_v = (GET_OPCODE(op) == SLJIT_ROTL) ? DSRLV : DSLLV;
+			FAIL_IF(push_inst(compiler, op_v | SA(EQUAL_FLAG) | T(src1) | D(dst), DR(dst)));
+			return push_inst(compiler, OR | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst));
+		}
+#endif /* SLJIT_CONFIG_MIPS_64 */
+
+		op_v = (GET_OPCODE(op) == SLJIT_ROTL) ? SLLV : SRLV;
+		FAIL_IF(push_inst(compiler, op_v | S(src2) | T(src1) | DA(OTHER_FLAG), OTHER_FLAG));
+		op_v = (GET_OPCODE(op) == SLJIT_ROTL) ? SRLV : SLLV;
+		FAIL_IF(push_inst(compiler, op_v | SA(EQUAL_FLAG) | T(src1) | D(dst), DR(dst)));
+		return push_inst(compiler, OR | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst));
+#endif /* SLJIT_MIPS_REV >= 2 */
+
+	default:
+		SLJIT_UNREACHABLE();
 		return SLJIT_SUCCESS;
 	}
 
-	SLJIT_UNREACHABLE();
-	return SLJIT_SUCCESS;
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	if ((flags & SRC2_IMM) || src2 == 0) {
+		if (op & SLJIT_SET_Z)
+			FAIL_IF(push_inst(compiler, op_imm | T(src1) | DA(EQUAL_FLAG) | SH_IMM(src2), EQUAL_FLAG));
+
+		if (flags & UNUSED_DEST)
+			return SLJIT_SUCCESS;
+		return push_inst(compiler, op_imm | T(src1) | D(dst) | SH_IMM(src2), DR(dst));
+	}
+
+	if (op & SLJIT_SET_Z)
+		FAIL_IF(push_inst(compiler, op_v | S(src2) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
+
+	if (flags & UNUSED_DEST)
+		return SLJIT_SUCCESS;
+	return push_inst(compiler, op_v | S(src2) | T(src1) | D(dst), DR(dst));
+#else /* !SLJIT_CONFIG_MIPS_32 */
+	if ((flags & SRC2_IMM) || src2 == 0) {
+		if (src2 >= 32) {
+			SLJIT_ASSERT(!(op & SLJIT_32));
+			ins = op_dimm32;
+			src2 -= 32;
+		}
+		else
+			ins = (op & SLJIT_32) ? op_imm : op_dimm;
+
+		if (op & SLJIT_SET_Z)
+			FAIL_IF(push_inst(compiler, ins | T(src1) | DA(EQUAL_FLAG) | SH_IMM(src2), EQUAL_FLAG));
+
+		if (flags & UNUSED_DEST)
+			return SLJIT_SUCCESS;
+		return push_inst(compiler, ins | T(src1) | D(dst) | SH_IMM(src2), DR(dst));
+	}
+
+	ins = (op & SLJIT_32) ? op_v : op_dv;
+	if (op & SLJIT_SET_Z)
+		FAIL_IF(push_inst(compiler, ins | S(src2) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
+
+	if (flags & UNUSED_DEST)
+		return SLJIT_SUCCESS;
+	return push_inst(compiler, ins | S(src2) | T(src1) | D(dst), DR(dst));
+#endif /* SLJIT_CONFIG_MIPS_32 */
 }
 
 #define CHECK_IMM(flags, srcw) \
@@ -2098,10 +2344,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 	case SLJIT_MOV_S16:
 		return emit_op(compiler, op, HALF_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);
 
-	case SLJIT_NOT:
-		return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, src, srcw);
-
 	case SLJIT_CLZ:
+	case SLJIT_CTZ:
+	case SLJIT_REV:
 		return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, src, srcw);
 	}
 
@@ -2147,14 +2392,23 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 		compiler->status_flags_state = 0;
 		return emit_op(compiler, op, flags | CUMULATIVE_OP, dst, dstw, src1, src1w, src2, src2w);
 
+	case SLJIT_XOR:
+		if (((src1 & SLJIT_IMM) && src1w == -1) || ((src2 & SLJIT_IMM) && src2w == -1)) {
+			return emit_op(compiler, op, flags | CUMULATIVE_OP | IMM_OP, dst, dstw, src1, src1w, src2, src2w);
+		}
+		/* fallthrough */
 	case SLJIT_AND:
 	case SLJIT_OR:
-	case SLJIT_XOR:
 		return emit_op(compiler, op, flags | CUMULATIVE_OP | LOGICAL_OP | IMM_OP, dst, dstw, src1, src1w, src2, src2w);
 
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
 		if (src2 & SLJIT_IMM)
 			src2w &= 0x1f;
@@ -2184,6 +2438,97 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil
 	return sljit_emit_op2(compiler, op, TMP_REG2, 0, src1, src1w, src2, src2w);
 }
 
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64) /* Helpers for sljit_emit_shift_into: SELECT_OP3 builds an immediate-shift opcode with the amount placed at bit 6, SELECT_OP2 picks between two opcodes. */
+#define SELECT_OP3(op, src2w, D, D32, W) (((op & SLJIT_32) ? (W) : ((src2w) < 32) ? (D) : (D32)) | (((sljit_ins)src2w & 0x1f) << 6)) /* W when op has SLJIT_32, else D for amounts below 32 and D32 otherwise; only the low five bits of the amount are encoded. */
+#define SELECT_OP2(op, D, W) ((op & SLJIT_32) ? (W) : (D)) /* W when op has SLJIT_32, D otherwise. */
+#else /* !SLJIT_CONFIG_MIPS_64 */
+#define SELECT_OP3(op, src2w, D, D32, W) ((W) | ((sljit_ins)(src2w) << 6)) /* Always W; the amount is not masked here, so the caller must pass a value that fits the field. */
+#define SELECT_OP2(op, D, W) (W) /* Always W. */
+#endif /* SLJIT_CONFIG_MIPS_64 */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst_reg,
+	sljit_s32 src1_reg,
+	sljit_s32 src2_reg,
+	sljit_s32 src3, sljit_sw src3w)
+{ /* Emits dst_reg = (src1_reg shifted by src3) OR (src2_reg shifted the opposite way by bit_length - src3); op selects the direction and width. Returns SLJIT_SUCCESS or the error propagated by FAIL_IF. */
+	sljit_s32 is_left;
+	sljit_ins ins1, ins2, ins3;
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+	sljit_s32 inp_flags = ((op & SLJIT_32) ? INT_DATA : WORD_DATA) | LOAD_DATA; /* Load flags used when src3 is a memory operand. */
+	sljit_sw bit_length = (op & SLJIT_32) ? 32 : 64;
+#else /* !SLJIT_CONFIG_MIPS_64 */
+	sljit_s32 inp_flags = WORD_DATA | LOAD_DATA; /* Load flags used when src3 is a memory operand. */
+	sljit_sw bit_length = 32;
+#endif /* SLJIT_CONFIG_MIPS_64 */
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, dst_reg, src1_reg, src2_reg, src3, src3w));
+
+	is_left = (GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_MSHL); /* Every other opcode is handled as a right shift. */
+
+	if (src1_reg == src2_reg) { /* Both halves come from the same register: this is a rotate, so delegate to sljit_emit_op2. */
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_op2(compiler, (is_left ? SLJIT_ROTL : SLJIT_ROTR) | (op & SLJIT_32), dst_reg, 0, src1_reg, 0, src3, src3w);
+	}
+
+	ADJUST_LOCAL_OFFSET(src3, src3w);
+
+	if (src3 & SLJIT_IMM) { /* Constant amount: both shifts use immediate encodings. */
+		src3w &= bit_length - 1;
+
+		if (src3w == 0) /* NOTE(review): nothing is emitted for a zero amount, including no move into dst_reg when it differs from src1_reg - confirm this is intended. */
+			return SLJIT_SUCCESS;
+
+		if (is_left) {
+			ins1 = SELECT_OP3(op, src3w, DSLL, DSLL32, SLL);
+			src3w = bit_length - src3w; /* Complementary amount for the bits coming from src2_reg. */
+			ins2 = SELECT_OP3(op, src3w, DSRL, DSRL32, SRL);
+		} else {
+			ins1 = SELECT_OP3(op, src3w, DSRL, DSRL32, SRL);
+			src3w = bit_length - src3w; /* Complementary amount for the bits coming from src2_reg. */
+			ins2 = SELECT_OP3(op, src3w, DSLL, DSLL32, SLL);
+		}
+
+		FAIL_IF(push_inst(compiler, ins1 | T(src1_reg) | D(dst_reg), DR(dst_reg)));
+		FAIL_IF(push_inst(compiler, ins2 | T(src2_reg) | D(TMP_REG1), DR(TMP_REG1)));
+		return push_inst(compiler, OR | S(dst_reg) | T(TMP_REG1) | D(dst_reg), DR(dst_reg));
+	}
+
+	if (src3 & SLJIT_MEM) { /* Amount in memory: load it into TMP_REG2 first. */
+		FAIL_IF(emit_op_mem(compiler, inp_flags, DR(TMP_REG2), src3, src3w));
+		src3 = TMP_REG2;
+	} else if (dst_reg == src3) { /* dst_reg is overwritten below before the amount is read again, so keep a copy in TMP_REG2. */
+		FAIL_IF(push_inst(compiler, SELECT_OP2(op, DADDU, ADDU) | S(src3) | TA(0) | D(TMP_REG2), DR(TMP_REG2)));
+		src3 = TMP_REG2;
+	}
+
+	if (is_left) { /* ins1: fixed one-bit pre-shift of src2_reg, ins2: variable shift of src1_reg, ins3: variable opposite shift of src2_reg. */
+		ins1 = SELECT_OP2(op, DSRL, SRL);
+		ins2 = SELECT_OP2(op, DSLLV, SLLV);
+		ins3 = SELECT_OP2(op, DSRLV, SRLV);
+	} else {
+		ins1 = SELECT_OP2(op, DSLL, SLL);
+		ins2 = SELECT_OP2(op, DSRLV, SRLV);
+		ins3 = SELECT_OP2(op, DSLLV, SLLV);
+	}
+
+	FAIL_IF(push_inst(compiler, ins2 | S(src3) | T(src1_reg) | D(dst_reg), DR(dst_reg)));
+
+	if (!(op & SLJIT_SHIFT_INTO_NON_ZERO)) { /* Flag not set (presumably the amount may be zero): shift src2_reg by one, then by src3 ^ (bit_length - 1), which totals bit_length - src3 for in-range amounts. */
+		FAIL_IF(push_inst(compiler, ins1 | T(src2_reg) | D(TMP_REG1) | (1 << 6), DR(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, XORI | S(src3) | T(TMP_REG2) | ((sljit_ins)bit_length - 1), DR(TMP_REG2)));
+		src2_reg = TMP_REG1;
+	} else /* Flag set: use 0 - src3 as the opposite amount. NOTE(review): relies on the variable shift using only the low bits of the register - confirm. */
+		FAIL_IF(push_inst(compiler, SELECT_OP2(op, DSUBU, SUBU) | SA(0) | T(src3) | D(TMP_REG2), DR(TMP_REG2)));
+
+	FAIL_IF(push_inst(compiler, ins3 | S(TMP_REG2) | T(src2_reg) | D(TMP_REG1), DR(TMP_REG1)));
+	return push_inst(compiler, OR | S(dst_reg) | T(TMP_REG1) | D(dst_reg), DR(dst_reg)); /* Merge the two halves. */
+}
+
+#undef SELECT_OP3
+#undef SELECT_OP2
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -2216,6 +2561,36 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *comp
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_dst(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw)
+{ /* SLJIT_FAST_ENTER copies RETURN_ADDR_REG into dst; SLJIT_GET_RETURN_ADDRESS loads the word at SLJIT_SP + local_size - sizeof(sljit_sw) into dst. dst may be a register or a memory operand. */
+	sljit_s32 dst_ar = RETURN_ADDR_REG; /* Register stored to memory below unless a case replaces it. */
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op_dst(compiler, op, dst, dstw));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	switch (op) {
+	case SLJIT_FAST_ENTER:
+		if (FAST_IS_REG(dst)) /* Register destination: one move, emitted as UNMOVABLE_INS. */
+			return push_inst(compiler, ADDU_W | SA(RETURN_ADDR_REG) | TA(0) | D(dst), UNMOVABLE_INS);
+		break; /* Memory destination: handled by the common store below. */
+	case SLJIT_GET_RETURN_ADDRESS:
+		dst_ar = DR(FAST_IS_REG(dst) ? dst : TMP_REG2); /* Load straight into dst when it is a register, otherwise via TMP_REG2. */
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, dst_ar, SLJIT_MEM1(SLJIT_SP), compiler->local_size - SSIZE_OF(sw)));
+		break;
+	}
+
+	if (dst & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA, dst_ar, dst, dstw));
+
+		if (op == SLJIT_FAST_ENTER) /* NOTE(review): presumably stops later instructions from being scheduled around this store - confirm. */
+			compiler->delay_slot = UNMOVABLE_INS;
+	}
+
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
@@ -2340,36 +2715,30 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compile
 	switch (GET_FLAG_TYPE(op)) {
 	case SLJIT_F_EQUAL:
 	case SLJIT_ORDERED_EQUAL:
-	case SLJIT_UNORDERED_OR_NOT_EQUAL:
 		inst = C_EQ_S;
 		break;
 	case SLJIT_F_NOT_EQUAL:
 	case SLJIT_UNORDERED_OR_EQUAL:
-	case SLJIT_ORDERED_NOT_EQUAL:
 		inst = C_UEQ_S;
 		break;
 	case SLJIT_F_LESS:
 	case SLJIT_ORDERED_LESS:
-	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
 		inst = C_OLT_S;
 		break;
 	case SLJIT_F_GREATER_EQUAL:
 	case SLJIT_UNORDERED_OR_LESS:
-	case SLJIT_ORDERED_GREATER_EQUAL:
 		inst = C_ULT_S;
 		break;
 	case SLJIT_F_GREATER:
 	case SLJIT_ORDERED_GREATER:
-	case SLJIT_UNORDERED_OR_LESS_EQUAL:
 		inst = C_ULE_S;
 		break;
 	case SLJIT_F_LESS_EQUAL:
 	case SLJIT_UNORDERED_OR_GREATER:
-	case SLJIT_ORDERED_LESS_EQUAL:
 		inst = C_OLE_S;
 		break;
 	default:
-		SLJIT_ASSERT(GET_FLAG_TYPE(op) == SLJIT_UNORDERED || GET_FLAG_TYPE(op) == SLJIT_ORDERED);
+		SLJIT_ASSERT(GET_FLAG_TYPE(op) == SLJIT_UNORDERED);
 		inst = C_UN_S;
 		break;
 	}
@@ -2507,25 +2876,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compil
 #undef FLOAT_DATA
 #undef FMT
 
-/* --------------------------------------------------------------------- */
-/*  Other instructions                                                   */
-/* --------------------------------------------------------------------- */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
-
-	if (FAST_IS_REG(dst))
-		return push_inst(compiler, ADDU_W | SA(RETURN_ADDR_REG) | TA(0) | D(dst), UNMOVABLE_INS);
-
-	/* Memory. */
-	FAIL_IF(emit_op_mem(compiler, WORD_DATA, RETURN_ADDR_REG, dst, dstw));
-	compiler->delay_slot = UNMOVABLE_INS;
-	return SLJIT_SUCCESS;
-}
-
 /* --------------------------------------------------------------------- */
 /*  Conditional instructions                                             */
 /* --------------------------------------------------------------------- */
@@ -2849,7 +3199,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compi
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
-	ADJUST_LOCAL_OFFSET(src, srcw);
 
 	if (src & SLJIT_IMM) {
 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
@@ -2861,8 +3210,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compi
 			jump->flags |= IS_MOVABLE;
 
 		src = TMP_REG2;
-	}
-	else if (src & SLJIT_MEM) {
+	} else if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
 		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, DR(TMP_REG2), src, srcw));
 		src = TMP_REG2;
 	}
@@ -2882,6 +3231,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compi
 		compiler->size += 6;
 #endif
 	}
+
 	FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
 	return SLJIT_SUCCESS;
 }
@@ -3000,7 +3350,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
 
 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
-		if (dst_reg & SLJIT_32)
+		if (type & SLJIT_32)
 			srcw = (sljit_s32)srcw;
 #endif
 		FAIL_IF(load_immediate(compiler, DR(TMP_REG1), srcw));
@@ -3008,9 +3358,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
 		srcw = 0;
 	}
 
-	dst_reg &= ~SLJIT_32;
-
-	switch (type) {
+	switch (type & ~SLJIT_32) {
 	case SLJIT_EQUAL:
 		ins = MOVZ | TA(EQUAL_FLAG);
 		break;
@@ -3068,8 +3416,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
 #endif /* SLJIT_MIPS_REV >= 1 */
 }
 
-#if !(defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
-
 static sljit_s32 update_mem_addr(struct sljit_compiler *compiler, sljit_s32 *mem, sljit_sw *memw, sljit_s16 max_offset)
 {
 	sljit_s32 arg = *mem;
@@ -3080,7 +3426,7 @@ static sljit_s32 update_mem_addr(struct sljit_compiler *compiler, sljit_s32 *mem
 
 		if (SLJIT_UNLIKELY(argw)) {
 			FAIL_IF(push_inst(compiler, SLL_W | T(OFFS_REG(arg)) | D(TMP_REG1) | SH_IMM(argw), DR(TMP_REG1)));
-			FAIL_IF(push_inst(compiler, ADDU_W | S(arg & REG_MASK) | T(TMP_REG1) | D(TMP_REG1), DR(TMP_REG1)));
+			FAIL_IF(push_inst(compiler, ADDU_W | S(TMP_REG1) | T(arg & REG_MASK) | D(TMP_REG1), DR(TMP_REG1)));
 		} else
 			FAIL_IF(push_inst(compiler, ADDU_W | S(arg & REG_MASK) | T(OFFS_REG(arg)) | D(TMP_REG1), DR(TMP_REG1)));
 
@@ -3108,7 +3454,7 @@ static sljit_s32 update_mem_addr(struct sljit_compiler *compiler, sljit_s32 *mem
 	if ((arg & REG_MASK) == 0)
 		return SLJIT_SUCCESS;
 
-	return push_inst(compiler, ADDU_W | S(arg & REG_MASK) | T(TMP_REG1) | D(TMP_REG1), DR(TMP_REG1));
+	return push_inst(compiler, ADDU_W | S(TMP_REG1) | T(arg & REG_MASK) | D(TMP_REG1), DR(TMP_REG1));
 }
 
 #if (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN)
@@ -3123,18 +3469,70 @@ static sljit_s32 update_mem_addr(struct sljit_compiler *compiler, sljit_s32 *mem
 #define MEMF64_FS_SECOND(freg) FS(freg)
 #endif /* SLJIT_LITTLE_ENDIAN */
 
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) /* MEM_CHECK_UNALIGNED(type) is non-zero when type carries any of the unaligned-access flags listed for the target. */
+#define MEM_CHECK_UNALIGNED(type) ((type) & (SLJIT_MEM_UNALIGNED | SLJIT_MEM_UNALIGNED_16))
+#else /* !SLJIT_CONFIG_MIPS_32 */
+#define MEM_CHECK_UNALIGNED(type) ((type) & (SLJIT_MEM_UNALIGNED | SLJIT_MEM_UNALIGNED_16 | SLJIT_MEM_UNALIGNED_32)) /* Outside MIPS32, SLJIT_MEM_UNALIGNED_32 is tested as well. */
+#endif /* SLJIT_CONFIG_MIPS_32 */
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 reg,
 	sljit_s32 mem, sljit_sw memw)
 {
 	sljit_s32 op = type & 0xff;
 	sljit_s32 flags = 0;
+	sljit_ins ins;
+#if !(defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+	sljit_ins ins_right;
+#endif /* !(SLJIT_MIPS_REV >= 6) */
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
 
-	if (type & (SLJIT_MEM_PRE | SLJIT_MEM_POST))
-		return SLJIT_ERR_UNSUPPORTED;
+	if (reg & REG_PAIR_MASK) {
+		ADJUST_LOCAL_OFFSET(mem, memw);
+
+#if !(defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+		if (MEM_CHECK_UNALIGNED(type)) {
+			FAIL_IF(update_mem_addr(compiler, &mem, &memw, SIMM_MAX - (2 * SSIZE_OF(sw) - 1)));
+
+			if (!(type & SLJIT_MEM_STORE) && (mem == REG_PAIR_FIRST(reg) || mem == REG_PAIR_SECOND(reg))) {
+				FAIL_IF(push_inst(compiler, ADDU_W | S(mem) | TA(0) | D(TMP_REG1), DR(TMP_REG1)));
+				mem = TMP_REG1;
+			}
+
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+			ins = ((type & SLJIT_MEM_STORE) ? SWL : LWL) | S(mem);
+			ins_right = ((type & SLJIT_MEM_STORE) ? SWR : LWR) | S(mem);
+#else /* !SLJIT_CONFIG_MIPS_32 */
+			ins = ((type & SLJIT_MEM_STORE) ? SDL : LDL) | S(mem);
+			ins_right = ((type & SLJIT_MEM_STORE) ? SDR : LDR) | S(mem);
+#endif /* SLJIT_CONFIG_MIPS_32 */
+
+			FAIL_IF(push_inst(compiler, ins | T(REG_PAIR_FIRST(reg)) | IMM(memw), DR(REG_PAIR_FIRST(reg))));
+			FAIL_IF(push_inst(compiler, ins_right | T(REG_PAIR_FIRST(reg)) | IMM(memw + (SSIZE_OF(sw) - 1)), DR(REG_PAIR_FIRST(reg))));
+			FAIL_IF(push_inst(compiler, ins | T(REG_PAIR_SECOND(reg)) | IMM(memw + SSIZE_OF(sw)), DR(REG_PAIR_SECOND(reg))));
+			return push_inst(compiler, ins_right | T(REG_PAIR_SECOND(reg)) | IMM((memw + 2 * SSIZE_OF(sw) - 1)), DR(REG_PAIR_SECOND(reg)));
+		}
+#endif /* !(SLJIT_MIPS_REV >= 6) */
+
+		FAIL_IF(update_mem_addr(compiler, &mem, &memw, SIMM_MAX - SSIZE_OF(sw)));
+
+		ins = ((type & SLJIT_MEM_STORE) ? STORE_W : LOAD_W) | S(mem);
+
+		if (!(type & SLJIT_MEM_STORE) && mem == REG_PAIR_FIRST(reg)) {
+			FAIL_IF(push_inst(compiler, ins | T(REG_PAIR_SECOND(reg)) | IMM(memw + SSIZE_OF(sw)), DR(REG_PAIR_SECOND(reg))));
+			return push_inst(compiler, ins | T(REG_PAIR_FIRST(reg)) | IMM(memw), DR(REG_PAIR_FIRST(reg)));
+		}
+
+		FAIL_IF(push_inst(compiler, ins | T(REG_PAIR_FIRST(reg)) | IMM(memw), DR(REG_PAIR_FIRST(reg))));
+		return push_inst(compiler, ins | T(REG_PAIR_SECOND(reg)) | IMM(memw + SSIZE_OF(sw)), DR(REG_PAIR_SECOND(reg)));
+	}
+
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+	return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+#else /* !(SLJIT_MIPS_REV >= 6) */
+	ADJUST_LOCAL_OFFSET(mem, memw);
 
 	switch (op) {
 	case SLJIT_MOV_U8:
@@ -3172,7 +3570,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 	case SLJIT_MOV:
 	case SLJIT_MOV_P:
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-		if (type & SLJIT_MEM_ALIGNED_32) {
+		if (type & SLJIT_MEM_UNALIGNED_32) {
 			flags = WORD_DATA;
 			if (!(type & SLJIT_MEM_STORE))
 				flags |= LOAD_DATA;
@@ -3189,7 +3587,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 		}
 
 		if (mem == reg) {
-			FAIL_IF(push_inst(compiler, DADDU | S(mem) | TA(0) | D(TMP_REG1), DR(TMP_REG1)));
+			FAIL_IF(push_inst(compiler, ADDU_W | S(mem) | TA(0) | D(TMP_REG1), DR(TMP_REG1)));
 			mem = TMP_REG1;
 		}
 
@@ -3206,37 +3604,32 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 		return push_inst(compiler, SWR | S(mem) | T(reg) | IMM(memw + 3), MOVABLE_INS);
 	}
 
+	if (mem == reg) {
+		FAIL_IF(push_inst(compiler, ADDU_W | S(mem) | TA(0) | D(TMP_REG1), DR(TMP_REG1)));
+		mem = TMP_REG1;
+	}
+
+	FAIL_IF(push_inst(compiler, LWL | S(mem) | T(reg) | IMM(memw), DR(reg)));
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-	if (mem == reg) {
-		FAIL_IF(push_inst(compiler, ADDU | S(mem) | TA(0) | D(TMP_REG1), DR(TMP_REG1)));
-		mem = TMP_REG1;
-	}
-
-	FAIL_IF(push_inst(compiler, LWL | S(mem) | T(reg) | IMM(memw), DR(reg)));
 	return push_inst(compiler, LWR | S(mem) | T(reg) | IMM(memw + 3), DR(reg));
-
 #else /* !SLJIT_CONFIG_MIPS_32 */
-	if (mem == reg) {
-		FAIL_IF(push_inst(compiler, DADDU | S(mem) | TA(0) | D(TMP_REG1), DR(TMP_REG1)));
-		mem = TMP_REG1;
-	}
-
-	FAIL_IF(push_inst(compiler, LWL | S(mem) | T(reg) | IMM(memw), DR(reg)));
 	FAIL_IF(push_inst(compiler, LWR | S(mem) | T(reg) | IMM(memw + 3), DR(reg)));
 
-	if (op == SLJIT_MOV_U32) {
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
-		return push_inst(compiler, DINSU | T(reg) | SA(0) | (31 << 11) | (0 << 11), DR(reg));
-#else  /* SLJIT_MIPS_REV < 1 */
-		FAIL_IF(push_inst(compiler, DSLL32 | T(reg) | D(reg) | SH_IMM(0), DR(reg)));
-		return push_inst(compiler, DSRL32 | T(reg) | D(reg) | SH_IMM(0), DR(reg));
-#endif /* SLJIT_MIPS_REV >= 2 */
-	}
+	if (op != SLJIT_MOV_U32)
+		return SLJIT_SUCCESS;
 
-	return SLJIT_SUCCESS;
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
+	return push_inst(compiler, DINSU | T(reg) | SA(0) | (31 << 11) | (0 << 11), DR(reg));
+#else  /* SLJIT_MIPS_REV < 2 */
+	FAIL_IF(push_inst(compiler, DSLL32 | T(reg) | D(reg) | SH_IMM(0), DR(reg)));
+	return push_inst(compiler, DSRL32 | T(reg) | D(reg) | SH_IMM(0), DR(reg));
+#endif /* SLJIT_MIPS_REV >= 2 */
 #endif /* SLJIT_CONFIG_MIPS_32 */
+#endif /* SLJIT_MIPS_REV >= 6 */
 }
 
+#if !(defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 freg,
 	sljit_s32 mem, sljit_sw memw)
@@ -3244,9 +3637,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compil
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
 
-	if (type & (SLJIT_MEM_PRE | SLJIT_MEM_POST))
-		return SLJIT_ERR_UNSUPPORTED;
-
 	FAIL_IF(update_mem_addr(compiler, &mem, &memw, SIMM_MAX - (type & SLJIT_32) ? 3 : 7));
 	SLJIT_ASSERT(FAST_IS_REG(mem) && mem != TMP_REG2);
 
@@ -3318,12 +3708,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compil
 	return SLJIT_SUCCESS;
 }
 
+#endif /* !SLJIT_MIPS_REV || SLJIT_MIPS_REV < 6 */
+
 #undef MEM16_IMM_FIRST
 #undef MEM16_IMM_SECOND
 #undef MEMF64_FS_FIRST
 #undef MEMF64_FS_SECOND
-
-#endif /* !SLJIT_MIPS_REV || SLJIT_MIPS_REV < 6 */
+#undef MEM_CHECK_UNALIGNED
 
 #undef TO_ARGW_HI
 
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativePPC_32.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativePPC_32.c
index 1eb518a6da..25cfcb9072 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativePPC_32.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativePPC_32.c
@@ -38,12 +38,15 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 reg,
 	return (imm & 0xffff) ? push_inst(compiler, ORI | S(reg) | A(reg) | IMM(imm)) : SLJIT_SUCCESS;
 }
 
+/* Simplified mnemonics: clrlwi. */
 #define INS_CLEAR_LEFT(dst, src, from) \
-	(RLWINM | S(src) | A(dst) | ((from) << 6) | (31 << 1))
+	(RLWINM | S(src) | A(dst) | RLWI_MBE(from, 31))
 
 static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
 	sljit_s32 dst, sljit_s32 src1, sljit_s32 src2)
 {
+	sljit_u32 imm;
+
 	switch (op) {
 	case SLJIT_MOV:
 	case SLJIT_MOV_U32:
@@ -82,14 +85,20 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		}
 		return SLJIT_SUCCESS;
 
-	case SLJIT_NOT:
-		SLJIT_ASSERT(src1 == TMP_REG1);
-		return push_inst(compiler, NOR | RC(flags) | S(src2) | A(dst) | B(src2));
-
 	case SLJIT_CLZ:
 		SLJIT_ASSERT(src1 == TMP_REG1);
 		return push_inst(compiler, CNTLZW | S(src2) | A(dst));
 
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(src1 == TMP_REG1);
+		FAIL_IF(push_inst(compiler, NEG | D(TMP_REG1) | A(src2)));
+		FAIL_IF(push_inst(compiler, AND | S(src2) | A(dst) | B(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, CNTLZW | S(dst) | A(dst)));
+		FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG1) | A(dst) | IMM(-32)));
+		/* The highest bits are set, if dst < 32, zero otherwise. */
+		FAIL_IF(push_inst(compiler, SRWI(27) | S(TMP_REG1) | A(TMP_REG1)));
+		return push_inst(compiler, XOR | S(dst) | A(dst) | B(TMP_REG1));
+
 	case SLJIT_ADD:
 		if (flags & ALT_FORM1) {
 			/* Setting XER SO is not enough, CR SO is also needed. */
@@ -103,12 +112,14 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 			if (flags & ALT_FORM3)
 				return push_inst(compiler, ADDIS | D(dst) | A(src1) | compiler->imm);
 
+			imm = compiler->imm;
+
 			if (flags & ALT_FORM4) {
-				FAIL_IF(push_inst(compiler, ADDIS | D(dst) | A(src1) | (((compiler->imm >> 16) & 0xffff) + ((compiler->imm >> 15) & 0x1))));
+				FAIL_IF(push_inst(compiler, ADDIS | D(dst) | A(src1) | (((imm >> 16) & 0xffff) + ((imm >> 15) & 0x1))));
 				src1 = dst;
 			}
 
-			return push_inst(compiler, ADDI | D(dst) | A(src1) | (compiler->imm & 0xffff));
+			return push_inst(compiler, ADDI | D(dst) | A(src1) | (imm & 0xffff));
 		}
 		if (flags & ALT_FORM3) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
@@ -208,8 +219,10 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		}
 		if (flags & ALT_FORM3) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
-			FAIL_IF(push_inst(compiler, ORI | S(src1) | A(dst) | IMM(compiler->imm)));
-			return push_inst(compiler, ORIS | S(dst) | A(dst) | IMM(compiler->imm >> 16));
+			imm = compiler->imm;
+
+			FAIL_IF(push_inst(compiler, ORI | S(src1) | A(dst) | IMM(imm)));
+			return push_inst(compiler, ORIS | S(dst) | A(dst) | IMM(imm >> 16));
 		}
 		return push_inst(compiler, OR | RC(flags) | S(src1) | A(dst) | B(src2));
 
@@ -224,34 +237,82 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		}
 		if (flags & ALT_FORM3) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
-			FAIL_IF(push_inst(compiler, XORI | S(src1) | A(dst) | IMM(compiler->imm)));
-			return push_inst(compiler, XORIS | S(dst) | A(dst) | IMM(compiler->imm >> 16));
+			imm = compiler->imm;
+
+			FAIL_IF(push_inst(compiler, XORI | S(src1) | A(dst) | IMM(imm)));
+			return push_inst(compiler, XORIS | S(dst) | A(dst) | IMM(imm >> 16));
+		}
+		if (flags & ALT_FORM4) {
+			SLJIT_ASSERT(src1 == TMP_REG1);
+			return push_inst(compiler, NOR | RC(flags) | S(src2) | A(dst) | B(src2));
 		}
 		return push_inst(compiler, XOR | RC(flags) | S(src1) | A(dst) | B(src2));
 
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 		if (flags & ALT_FORM1) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
-			compiler->imm &= 0x1f;
-			return push_inst(compiler, RLWINM | RC(flags) | S(src1) | A(dst) | (compiler->imm << 11) | ((31 - compiler->imm) << 1));
+			imm = compiler->imm & 0x1f;
+			return push_inst(compiler, SLWI(imm) | RC(flags) | S(src1) | A(dst));
 		}
+
+		if (op == SLJIT_MSHL) {
+			FAIL_IF(push_inst(compiler, ANDI | S(src2) | A(TMP_REG2) | 0x1f));
+			src2 = TMP_REG2;
+		}
+
 		return push_inst(compiler, SLW | RC(flags) | S(src1) | A(dst) | B(src2));
 
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 		if (flags & ALT_FORM1) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
-			compiler->imm &= 0x1f;
-			return push_inst(compiler, RLWINM | RC(flags) | S(src1) | A(dst) | (((32 - compiler->imm) & 0x1f) << 11) | (compiler->imm << 6) | (31 << 1));
+			imm = compiler->imm & 0x1f;
+			/* Since imm can be 0, SRWI() cannot be used. */
+			return push_inst(compiler, RLWINM | RC(flags) | S(src1) | A(dst) | RLWI_SH((32 - imm) & 0x1f) | RLWI_MBE(imm, 31));
 		}
+
+		if (op == SLJIT_MLSHR) {
+			FAIL_IF(push_inst(compiler, ANDI | S(src2) | A(TMP_REG2) | 0x1f));
+			src2 = TMP_REG2;
+		}
+
 		return push_inst(compiler, SRW | RC(flags) | S(src1) | A(dst) | B(src2));
 
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
 		if (flags & ALT_FORM1) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
-			compiler->imm &= 0x1f;
-			return push_inst(compiler, SRAWI | RC(flags) | S(src1) | A(dst) | (compiler->imm << 11));
+			imm = compiler->imm & 0x1f;
+			return push_inst(compiler, SRAWI | RC(flags) | S(src1) | A(dst) | (imm << 11));
 		}
+
+		if (op == SLJIT_MASHR) {
+			FAIL_IF(push_inst(compiler, ANDI | S(src2) | A(TMP_REG2) | 0x1f));
+			src2 = TMP_REG2;
+		}
+
 		return push_inst(compiler, SRAW | RC(flags) | S(src1) | A(dst) | B(src2));
+
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
+		if (flags & ALT_FORM1) {
+			SLJIT_ASSERT(src2 == TMP_REG2);
+			imm = compiler->imm;
+
+			if (op == SLJIT_ROTR)
+				imm = (sljit_u32)(-(sljit_s32)imm);
+
+			imm &= 0x1f;
+			return push_inst(compiler, RLWINM | S(src1) | A(dst) | RLWI_SH(imm) | RLWI_MBE(0, 31));
+		}
+
+		if (op == SLJIT_ROTR) {
+			FAIL_IF(push_inst(compiler, SUBFIC | D(TMP_REG2) | A(src2) | 0));
+			src2 = TMP_REG2;
+		}
+
+		return push_inst(compiler, RLWNM | S(src1) | A(dst) | B(src2) | RLWI_MBE(0, 31));
 	}
 
 	SLJIT_UNREACHABLE();
@@ -264,6 +325,48 @@ static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_
 	return push_inst(compiler, ORI | S(reg) | A(reg) | IMM(init_value));
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg)
+{
+	sljit_s32 reg2 = 0;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fcopy(compiler, op, freg, reg));
+
+	if (op & SLJIT_32) {
+		if (op == SLJIT_COPY32_TO_F32) {
+			FAIL_IF(push_inst(compiler, STW | S(reg) | A(SLJIT_SP) | TMP_MEM_OFFSET));
+			return push_inst(compiler, LFS | FS(freg) | A(SLJIT_SP) | TMP_MEM_OFFSET);
+		}
+
+		FAIL_IF(push_inst(compiler, STFS | FS(freg) | A(SLJIT_SP) | TMP_MEM_OFFSET));
+		return push_inst(compiler, LWZ | S(reg) | A(SLJIT_SP) | TMP_MEM_OFFSET);
+	}
+
+	if (reg & REG_PAIR_MASK) {
+		reg2 = REG_PAIR_SECOND(reg);
+		reg = REG_PAIR_FIRST(reg);
+	}
+
+	if (op == SLJIT_COPY_TO_F64) {
+		FAIL_IF(push_inst(compiler, STW | S(reg) | A(SLJIT_SP) | TMP_MEM_OFFSET_HI));
+
+		if (reg2 != 0)
+			FAIL_IF(push_inst(compiler, STW | S(reg2) | A(SLJIT_SP) | TMP_MEM_OFFSET_LOW));
+		else
+			FAIL_IF(push_inst(compiler, STFD | FS(freg) | A(SLJIT_SP) | TMP_MEM_OFFSET_LOW));
+
+		return push_inst(compiler, LFD | FS(freg) | A(SLJIT_SP) | TMP_MEM_OFFSET);
+	}
+
+	FAIL_IF(push_inst(compiler, STFD | FS(freg) | A(SLJIT_SP) | TMP_MEM_OFFSET));
+
+	if (reg2 != 0)
+		FAIL_IF(push_inst(compiler, LWZ | S(reg2) | A(SLJIT_SP) | TMP_MEM_OFFSET_LOW));
+
+	return push_inst(compiler, LWZ | S(reg) | A(SLJIT_SP) | TMP_MEM_OFFSET_HI);
+}
+
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
 {
 	sljit_ins *inst = (sljit_ins *)addr;
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativePPC_64.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativePPC_64.c
index 61491011c4..8d774cf57a 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativePPC_64.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativePPC_64.c
@@ -35,8 +35,9 @@
 #error "Must implement count leading zeroes"
 #endif
 
-#define PUSH_RLDICR(reg, shift) \
-	push_inst(compiler, RLDI(reg, reg, 63 - shift, shift, 1))
+/* Computes SLDI(63 - shift). */
+#define PUSH_SLDI_NEG(reg, shift) \
+	push_inst(compiler, RLDICR | S(reg) | A(reg) | RLDI_SH(63 - shift) | RLDI_ME(shift))
 
 static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 reg, sljit_sw imm)
 {
@@ -66,14 +67,14 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 reg,
 	if ((tmp & ~0xffff000000000000ul) == 0) {
 		FAIL_IF(push_inst(compiler, ADDI | D(reg) | A(0) | (sljit_ins)(tmp >> 48)));
 		shift += 15;
-		return PUSH_RLDICR(reg, shift);
+		return PUSH_SLDI_NEG(reg, shift);
 	}
 
 	if ((tmp & ~0xffffffff00000000ul) == 0) {
 		FAIL_IF(push_inst(compiler, ADDIS | D(reg) | A(0) | (sljit_ins)(tmp >> 48)));
 		FAIL_IF(push_inst(compiler, ORI | S(reg) | A(reg) | IMM(tmp >> 32)));
 		shift += 31;
-		return PUSH_RLDICR(reg, shift);
+		return PUSH_SLDI_NEG(reg, shift);
 	}
 
 	/* Cut out the 16 bit from immediate. */
@@ -82,13 +83,13 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 reg,
 
 	if (tmp2 <= 0xffff) {
 		FAIL_IF(push_inst(compiler, ADDI | D(reg) | A(0) | (sljit_ins)(tmp >> 48)));
-		FAIL_IF(PUSH_RLDICR(reg, shift));
+		FAIL_IF(PUSH_SLDI_NEG(reg, shift));
 		return push_inst(compiler, ORI | S(reg) | A(reg) | (sljit_ins)tmp2);
 	}
 
 	if (tmp2 <= 0xffffffff) {
 		FAIL_IF(push_inst(compiler, ADDI | D(reg) | A(0) | IMM(tmp >> 48)));
-		FAIL_IF(PUSH_RLDICR(reg, shift));
+		FAIL_IF(PUSH_SLDI_NEG(reg, shift));
 		FAIL_IF(push_inst(compiler, ORIS | S(reg) | A(reg) | (sljit_ins)(tmp2 >> 16)));
 		return (imm & 0xffff) ? push_inst(compiler, ORI | S(reg) | A(reg) | IMM(tmp2)) : SLJIT_SUCCESS;
 	}
@@ -100,22 +101,23 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 reg,
 		FAIL_IF(push_inst(compiler, ADDI | D(reg) | A(0) | (sljit_ins)(tmp >> 48)));
 		shift2 += 15;
 		shift += (63 - shift2);
-		FAIL_IF(PUSH_RLDICR(reg, shift));
+		FAIL_IF(PUSH_SLDI_NEG(reg, shift));
 		FAIL_IF(push_inst(compiler, ORI | S(reg) | A(reg) | (sljit_ins)(tmp2 >> 48)));
-		return PUSH_RLDICR(reg, shift2);
+		return PUSH_SLDI_NEG(reg, shift2);
 	}
 
 	/* The general version. */
 	FAIL_IF(push_inst(compiler, ADDIS | D(reg) | A(0) | (sljit_ins)((sljit_uw)imm >> 48)));
 	FAIL_IF(push_inst(compiler, ORI | S(reg) | A(reg) | IMM(imm >> 32)));
-	FAIL_IF(PUSH_RLDICR(reg, 31));
+	FAIL_IF(PUSH_SLDI_NEG(reg, 31));
 	FAIL_IF(push_inst(compiler, ORIS | S(reg) | A(reg) | IMM(imm >> 16)));
 	return push_inst(compiler, ORI | S(reg) | A(reg) | IMM(imm));
 }
 
-/* Simplified mnemonics: clrldi. */
-#define INS_CLEAR_LEFT(dst, src, from) \
-	(RLDICL | S(src) | A(dst) | ((from) << 6) | (1 << 5))
+#undef PUSH_SLDI_NEG
+
+#define CLRLDI(dst, src, n) \
+	(RLDICL | S(src) | A(dst) | RLDI_SH(0) | RLDI_MB(n))
 
 /* Sign extension for integer operations. */
 #define UN_EXTS() \
@@ -145,6 +147,8 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 reg,
 static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
 	sljit_s32 dst, sljit_s32 src1, sljit_s32 src2)
 {
+	sljit_u32 imm;
+
 	switch (op) {
 	case SLJIT_MOV:
 	case SLJIT_MOV_P:
@@ -159,7 +163,7 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
 			if (op == SLJIT_MOV_S32)
 				return push_inst(compiler, EXTSW | S(src2) | A(dst));
-			return push_inst(compiler, INS_CLEAR_LEFT(dst, src2, 0));
+			return push_inst(compiler, CLRLDI(dst, src2, 32));
 		}
 		else {
 			SLJIT_ASSERT(dst == src2);
@@ -172,7 +176,7 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
 			if (op == SLJIT_MOV_S8)
 				return push_inst(compiler, EXTSB | S(src2) | A(dst));
-			return push_inst(compiler, INS_CLEAR_LEFT(dst, src2, 24));
+			return push_inst(compiler, CLRLDI(dst, src2, 56));
 		}
 		else if ((flags & REG_DEST) && op == SLJIT_MOV_S8)
 			return push_inst(compiler, EXTSB | S(src2) | A(dst));
@@ -187,36 +191,39 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
 			if (op == SLJIT_MOV_S16)
 				return push_inst(compiler, EXTSH | S(src2) | A(dst));
-			return push_inst(compiler, INS_CLEAR_LEFT(dst, src2, 16));
+			return push_inst(compiler, CLRLDI(dst, src2, 48));
 		}
 		else {
 			SLJIT_ASSERT(dst == src2);
 		}
 		return SLJIT_SUCCESS;
 
-	case SLJIT_NOT:
-		SLJIT_ASSERT(src1 == TMP_REG1);
-		UN_EXTS();
-		return push_inst(compiler, NOR | RC(flags) | S(src2) | A(dst) | B(src2));
-
 	case SLJIT_CLZ:
 		SLJIT_ASSERT(src1 == TMP_REG1);
-		if (flags & ALT_FORM1)
-			return push_inst(compiler, CNTLZW | S(src2) | A(dst));
-		return push_inst(compiler, CNTLZD | S(src2) | A(dst));
+		return push_inst(compiler, ((flags & ALT_FORM1) ? CNTLZW : CNTLZD) | S(src2) | A(dst));
+
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(src1 == TMP_REG1);
+		FAIL_IF(push_inst(compiler, NEG | D(TMP_REG1) | A(src2)));
+		FAIL_IF(push_inst(compiler, AND | S(src2) | A(dst) | B(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, ((flags & ALT_FORM1) ? CNTLZW : CNTLZD) | S(dst) | A(dst)));
+		FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG1) | A(dst) | IMM((flags & ALT_FORM1) ? -32 : -64)));
+		/* The highest bits are set, if dst < bit width, zero otherwise. */
+		FAIL_IF(push_inst(compiler, ((flags & ALT_FORM1) ? SRWI(27) : SRDI(58)) | S(TMP_REG1) | A(TMP_REG1)));
+		return push_inst(compiler, XOR | S(dst) | A(dst) | B(TMP_REG1));
 
 	case SLJIT_ADD:
 		if (flags & ALT_FORM1) {
 			if (flags & ALT_SIGN_EXT) {
-				FAIL_IF(push_inst(compiler, RLDI(TMP_REG1, src1, 32, 31, 1)));
+				FAIL_IF(push_inst(compiler, SLDI(32) | S(src1) | A(TMP_REG1)));
 				src1 = TMP_REG1;
-				FAIL_IF(push_inst(compiler, RLDI(TMP_REG2, src2, 32, 31, 1)));
+				FAIL_IF(push_inst(compiler, SLDI(32) | S(src2) | A(TMP_REG2)));
 				src2 = TMP_REG2;
 			}
 			/* Setting XER SO is not enough, CR SO is also needed. */
 			FAIL_IF(push_inst(compiler, ADD | OE(ALT_SET_FLAGS) | RC(ALT_SET_FLAGS) | D(dst) | A(src1) | B(src2)));
 			if (flags & ALT_SIGN_EXT)
-				return push_inst(compiler, RLDI(dst, dst, 32, 32, 0));
+				return push_inst(compiler, SRDI(32) | S(dst) | A(dst));
 			return SLJIT_SUCCESS;
 		}
 
@@ -227,12 +234,14 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 			if (flags & ALT_FORM3)
 				return push_inst(compiler, ADDIS | D(dst) | A(src1) | compiler->imm);
 
+			imm = compiler->imm;
+
 			if (flags & ALT_FORM4) {
-				FAIL_IF(push_inst(compiler, ADDIS | D(dst) | A(src1) | (((compiler->imm >> 16) & 0xffff) + ((compiler->imm >> 15) & 0x1))));
+				FAIL_IF(push_inst(compiler, ADDIS | D(dst) | A(src1) | (((imm >> 16) & 0xffff) + ((imm >> 15) & 0x1))));
 				src1 = dst;
 			}
 
-			return push_inst(compiler, ADDI | D(dst) | A(src1) | (compiler->imm & 0xffff));
+			return push_inst(compiler, ADDI | D(dst) | A(src1) | (imm & 0xffff));
 		}
 		if (flags & ALT_FORM3) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
@@ -287,11 +296,11 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		if (flags & ALT_FORM3) {
 			if (flags & ALT_SIGN_EXT) {
 				if (src1 != TMP_ZERO) {
-					FAIL_IF(push_inst(compiler, RLDI(TMP_REG1, src1, 32, 31, 1)));
+					FAIL_IF(push_inst(compiler, SLDI(32) | S(src1) | A(TMP_REG1)));
 					src1 = TMP_REG1;
 				}
 				if (src2 != TMP_ZERO) {
-					FAIL_IF(push_inst(compiler, RLDI(TMP_REG2, src2, 32, 31, 1)));
+					FAIL_IF(push_inst(compiler, SLDI(32) | S(src2) | A(TMP_REG2)));
 					src2 = TMP_REG2;
 				}
 			}
@@ -303,7 +312,7 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 				FAIL_IF(push_inst(compiler, NEG | OE(ALT_SET_FLAGS) | RC(ALT_SET_FLAGS) | D(dst) | A(src2)));
 
 			if (flags & ALT_SIGN_EXT)
-				return push_inst(compiler, RLDI(dst, dst, 32, 32, 0));
+				return push_inst(compiler, SRDI(32) | S(dst) | A(dst));
 			return SLJIT_SUCCESS;
 		}
 
@@ -362,8 +371,10 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		}
 		if (flags & ALT_FORM3) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
-			FAIL_IF(push_inst(compiler, ORI | S(src1) | A(dst) | IMM(compiler->imm)));
-			return push_inst(compiler, ORIS | S(dst) | A(dst) | IMM(compiler->imm >> 16));
+			imm = compiler->imm;
+
+			FAIL_IF(push_inst(compiler, ORI | S(src1) | A(dst) | IMM(imm)));
+			return push_inst(compiler, ORIS | S(dst) | A(dst) | IMM(imm >> 16));
 		}
 		return push_inst(compiler, OR | RC(flags) | S(src1) | A(dst) | B(src2));
 
@@ -378,46 +389,110 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		}
 		if (flags & ALT_FORM3) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
-			FAIL_IF(push_inst(compiler, XORI | S(src1) | A(dst) | IMM(compiler->imm)));
-			return push_inst(compiler, XORIS | S(dst) | A(dst) | IMM(compiler->imm >> 16));
+			imm = compiler->imm;
+
+			FAIL_IF(push_inst(compiler, XORI | S(src1) | A(dst) | IMM(imm)));
+			return push_inst(compiler, XORIS | S(dst) | A(dst) | IMM(imm >> 16));
+		}
+		if (flags & ALT_FORM4) {
+			SLJIT_ASSERT(src1 == TMP_REG1);
+			UN_EXTS();
+			return push_inst(compiler, NOR | RC(flags) | S(src2) | A(dst) | B(src2));
 		}
 		return push_inst(compiler, XOR | RC(flags) | S(src1) | A(dst) | B(src2));
 
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 		if (flags & ALT_FORM1) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
+			imm = compiler->imm;
+
 			if (flags & ALT_FORM2) {
-				compiler->imm &= 0x1f;
-				return push_inst(compiler, RLWINM | RC(flags) | S(src1) | A(dst) | (compiler->imm << 11) | ((31 - compiler->imm) << 1));
+				imm &= 0x1f;
+				return push_inst(compiler, SLWI(imm) | RC(flags) | S(src1) | A(dst));
 			}
-			compiler->imm &= 0x3f;
-			return push_inst(compiler, RLDI(dst, src1, compiler->imm, 63 - compiler->imm, 1) | RC(flags));
+
+			imm &= 0x3f;
+			return push_inst(compiler, SLDI(imm) | RC(flags) | S(src1) | A(dst));
 		}
+
+		if (op == SLJIT_MSHL) {
+			FAIL_IF(push_inst(compiler, ANDI | S(src2) | A(TMP_REG2) | ((flags & ALT_FORM2) ? 0x1f : 0x3f)));
+			src2 = TMP_REG2;
+		}
+
 		return push_inst(compiler, ((flags & ALT_FORM2) ? SLW : SLD) | RC(flags) | S(src1) | A(dst) | B(src2));
 
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 		if (flags & ALT_FORM1) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
+			imm = compiler->imm;
+
 			if (flags & ALT_FORM2) {
-				compiler->imm &= 0x1f;
-				return push_inst(compiler, RLWINM | RC(flags) | S(src1) | A(dst) | (((32 - compiler->imm) & 0x1f) << 11) | (compiler->imm << 6) | (31 << 1));
+				imm &= 0x1f;
+				/* Since imm can be 0, SRWI() cannot be used. */
+				return push_inst(compiler, RLWINM | RC(flags) | S(src1) | A(dst) | RLWI_SH((32 - imm) & 0x1f) | RLWI_MBE(imm, 31));
 			}
-			compiler->imm &= 0x3f;
-			return push_inst(compiler, RLDI(dst, src1, 64 - compiler->imm, compiler->imm, 0) | RC(flags));
+
+			imm &= 0x3f;
+			/* Since imm can be 0, SRDI() cannot be used. */
+			return push_inst(compiler, RLDICL | RC(flags) | S(src1) | A(dst) | RLDI_SH((64 - imm) & 0x3f) | RLDI_MB(imm));
 		}
+
+		if (op == SLJIT_MLSHR) {
+			FAIL_IF(push_inst(compiler, ANDI | S(src2) | A(TMP_REG2) | ((flags & ALT_FORM2) ? 0x1f : 0x3f)));
+			src2 = TMP_REG2;
+		}
+
 		return push_inst(compiler, ((flags & ALT_FORM2) ? SRW : SRD) | RC(flags) | S(src1) | A(dst) | B(src2));
 
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
 		if (flags & ALT_FORM1) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
+			imm = compiler->imm;
+
 			if (flags & ALT_FORM2) {
-				compiler->imm &= 0x1f;
-				return push_inst(compiler, SRAWI | RC(flags) | S(src1) | A(dst) | (compiler->imm << 11));
+				imm &= 0x1f;
+				return push_inst(compiler, SRAWI | RC(flags) | S(src1) | A(dst) | (imm << 11));
 			}
-			compiler->imm &= 0x3f;
-			return push_inst(compiler, SRADI | RC(flags) | S(src1) | A(dst) | ((compiler->imm & 0x1f) << 11) | ((compiler->imm & 0x20) >> 4));
+
+			imm &= 0x3f;
+			return push_inst(compiler, SRADI | RC(flags) | S(src1) | A(dst) | RLDI_SH(imm));
 		}
+
+		if (op == SLJIT_MASHR) {
+			FAIL_IF(push_inst(compiler, ANDI | S(src2) | A(TMP_REG2) | ((flags & ALT_FORM2) ? 0x1f : 0x3f)));
+			src2 = TMP_REG2;
+		}
+
 		return push_inst(compiler, ((flags & ALT_FORM2) ? SRAW : SRAD) | RC(flags) | S(src1) | A(dst) | B(src2));
+
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
+		if (flags & ALT_FORM1) {
+			SLJIT_ASSERT(src2 == TMP_REG2);
+			imm = compiler->imm;
+
+			if (op == SLJIT_ROTR)
+				imm = (sljit_u32)(-(sljit_s32)imm);
+
+			if (flags & ALT_FORM2) {
+				imm &= 0x1f;
+				return push_inst(compiler, RLWINM | S(src1) | A(dst) | RLWI_SH(imm) | RLWI_MBE(0, 31));
+			}
+
+			imm &= 0x3f;
+			return push_inst(compiler, RLDICL | S(src1) | A(dst) | RLDI_SH(imm));
+		}
+
+		if (op == SLJIT_ROTR) {
+			FAIL_IF(push_inst(compiler, SUBFIC | D(TMP_REG2) | A(src2) | 0));
+			src2 = TMP_REG2;
+		}
+
+		return push_inst(compiler, ((flags & ALT_FORM2) ? (RLWNM | RLWI_MBE(0, 31)) : (RLDCL | RLDI_MB(0))) | S(src1) | A(dst) | B(src2));
 	}
 
 	SLJIT_UNREACHABLE();
@@ -483,11 +558,26 @@ static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_
 {
 	FAIL_IF(push_inst(compiler, ADDIS | D(reg) | A(0) | IMM(init_value >> 48)));
 	FAIL_IF(push_inst(compiler, ORI | S(reg) | A(reg) | IMM(init_value >> 32)));
-	FAIL_IF(PUSH_RLDICR(reg, 31));
+	FAIL_IF(push_inst(compiler, SLDI(32) | S(reg) | A(reg)));
 	FAIL_IF(push_inst(compiler, ORIS | S(reg) | A(reg) | IMM(init_value >> 16)));
 	return push_inst(compiler, ORI | S(reg) | A(reg) | IMM(init_value));
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fcopy(compiler, op, freg, reg));
+
+	if (GET_OPCODE(op) == SLJIT_COPY_TO_F64) {
+		FAIL_IF(push_inst(compiler, ((op & SLJIT_32) ? STW : STD) | S(reg) | A(SLJIT_SP) | TMP_MEM_OFFSET));
+		return push_inst(compiler, ((op & SLJIT_32) ? LFS : LFD) | FS(freg) | A(SLJIT_SP) | TMP_MEM_OFFSET);
+	}
+
+	FAIL_IF(push_inst(compiler, ((op & SLJIT_32) ? STFS : STFD) | FS(freg) | A(SLJIT_SP) | TMP_MEM_OFFSET));
+	return push_inst(compiler, ((op & SLJIT_32) ? LWZ : LD) | S(reg) | A(SLJIT_SP) | TMP_MEM_OFFSET);
+}
+
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
 {
 	sljit_ins *inst = (sljit_ins*)addr;
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativePPC_common.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativePPC_common.c
index 719e772bc4..81ba7d36b0 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativePPC_common.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativePPC_common.c
@@ -183,6 +183,8 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
 #define FSUBS		(HI(59) | LO(20))
 #define LD		(HI(58) | 0)
 #define LFD		(HI(50))
+#define LFS		(HI(48))
+#define LWBRX		(HI(31) | LO(534))
 #define LWZ		(HI(32))
 #define MFCR		(HI(31) | LO(19))
 #define MFLR		(HI(31) | LO(339) | 0x80000)
@@ -203,8 +205,13 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
 #define OR		(HI(31) | LO(444))
 #define ORI		(HI(24))
 #define ORIS		(HI(25))
-#define RLDICL		(HI(30))
+#define RLDCL		(HI(30) | LO(8))
+#define RLDICL		(HI(30) | LO(0 << 1))
+#define RLDICR		(HI(30) | LO(1 << 1))
+#define RLDIMI		(HI(30) | LO(3 << 1))
+#define RLWIMI		(HI(20))
 #define RLWINM		(HI(21))
+#define RLWNM		(HI(23))
 #define SLD		(HI(31) | LO(27))
 #define SLW		(HI(31) | LO(24))
 #define SRAD		(HI(31) | LO(794))
@@ -218,7 +225,9 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
 #define STDUX		(HI(31) | LO(181))
 #define STFD		(HI(54))
 #define STFIWX		(HI(31) | LO(983))
+#define STFS		(HI(52))
 #define STW		(HI(36))
+#define STWBRX		(HI(31) | LO(662))
 #define STWU		(HI(37))
 #define STWUX		(HI(31) | LO(183))
 #define SUBF		(HI(31) | LO(40))
@@ -233,9 +242,34 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
 #define SIMM_MIN	(-0x8000)
 #define UIMM_MAX	(0xffff)
 
-#define RLDI(dst, src, sh, mb, type) \
-	(HI(30) | S(src) | A(dst) | ((sljit_ins)(type) << 2) | (((sljit_ins)(sh) & 0x1f) << 11) \
-	| (((sljit_ins)(sh) & 0x20) >> 4) | (((sljit_ins)(mb) & 0x1f) << 6) | ((sljit_ins)(mb) & 0x20))
+/* Shift helpers. */
+#define RLWI_SH(sh) ((sljit_ins)(sh) << 11)
+#define RLWI_MBE(mb, me) (((sljit_ins)(mb) << 6) | ((sljit_ins)(me) << 1))
+#define RLDI_SH(sh) ((((sljit_ins)(sh) & 0x1f) << 11) | (((sljit_ins)(sh) & 0x20) >> 4))
+#define RLDI_MB(mb) ((((sljit_ins)(mb) & 0x1f) << 6) | ((sljit_ins)(mb) & 0x20))
+#define RLDI_ME(me) RLDI_MB(me)
+
+#define SLWI(shift) (RLWINM | RLWI_SH(shift) | RLWI_MBE(0, 31 - (shift)))
+#define SLDI(shift) (RLDICR | RLDI_SH(shift) | RLDI_ME(63 - (shift)))
+/* shift > 0 */
+#define SRWI(shift) (RLWINM | RLWI_SH(32 - (shift)) | RLWI_MBE((shift), 31))
+#define SRDI(shift) (RLDICL | RLDI_SH(64 - (shift)) | RLDI_MB(shift))
+
+#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
+#define SLWI_W(shift) SLWI(shift)
+#define TMP_MEM_OFFSET (2 * sizeof(sljit_sw))
+#else /* !SLJIT_CONFIG_PPC_32 */
+#define SLWI_W(shift) SLDI(shift)
+#define TMP_MEM_OFFSET (6 * sizeof(sljit_sw))
+#endif /* SLJIT_CONFIG_PPC_32 */
+
+#if (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN)
+#define TMP_MEM_OFFSET_LOW TMP_MEM_OFFSET
+#define TMP_MEM_OFFSET_HI (TMP_MEM_OFFSET + sizeof(sljit_s32))
+#else /* !SLJIT_LITTLE_ENDIAN */
+#define TMP_MEM_OFFSET_LOW (TMP_MEM_OFFSET + sizeof(sljit_s32))
+#define TMP_MEM_OFFSET_HI TMP_MEM_OFFSET
+#endif /* SLJIT_LITTLE_ENDIAN */
 
 #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL)
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_function_context(void** func_ptr, struct sljit_function_context* context, sljit_uw addr, void* func)
@@ -371,7 +405,7 @@ static SLJIT_INLINE void put_label_set(struct sljit_put_label *put_label)
 			inst++;
 		}
 
-		inst[1] = RLDI(reg, reg, 32, 31, 1);
+		inst[1] = SLDI(32) | S(reg) | A(reg);
 		inst[2] = ORIS | S(reg) | A(reg) | IMM((addr >> 16) & 0xffff);
 		inst += 2;
 	}
@@ -379,7 +413,7 @@ static SLJIT_INLINE void put_label_set(struct sljit_put_label *put_label)
 	inst[1] = ORI | S(reg) | A(reg) | IMM(addr & 0xffff);
 }
 
-#endif
+#endif /* SLJIT_CONFIG_PPC_64 */
 
 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
 {
@@ -641,9 +675,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
 	/* A saved register is set to a zero value. */
 	case SLJIT_HAS_ZERO_REGISTER:
 	case SLJIT_HAS_CLZ:
+	case SLJIT_HAS_ROT:
 	case SLJIT_HAS_PREFETCH:
 		return 1;
 
+	case SLJIT_HAS_CTZ:
+	case SLJIT_HAS_REV:
+		return 2;
+
 	default:
 		return 0;
 	}
@@ -713,13 +752,16 @@ ALT_FORM5		0x010000 */
 #endif
 
 #if (defined SLJIT_PPC_STACK_FRAME_V2 && SLJIT_PPC_STACK_FRAME_V2)
-#define LR_SAVE_OFFSET		2 * SSIZE_OF(sw)
+#define LR_SAVE_OFFSET		(2 * SSIZE_OF(sw))
 #else
 #define LR_SAVE_OFFSET		SSIZE_OF(sw)
 #endif
 
 #define STACK_MAX_DISTANCE	(0x8000 - SSIZE_OF(sw) - LR_SAVE_OFFSET)
 
+static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 inp_flags, sljit_s32 reg,
+	sljit_s32 arg, sljit_sw argw, sljit_s32 tmp_reg);
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
 	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
@@ -736,7 +778,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
 	set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
 	local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - saved_arg_count, 0)
-		+ GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+		+ GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64);
 
 	if (!(options & SLJIT_ENTER_REG_ARG))
 		local_size += SSIZE_OF(sw);
@@ -846,7 +888,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 	set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
 	local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 0)
-		+ GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+		+ GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64);
 
 	if (!(options & SLJIT_ENTER_REG_ARG))
 		local_size += SSIZE_OF(sw);
@@ -855,7 +897,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 	return SLJIT_SUCCESS;
 }
 
-static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
+static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 is_return_to)
 {
 	sljit_s32 i, tmp, base, offset;
 	sljit_s32 local_size = compiler->local_size;
@@ -873,7 +915,8 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
 	}
 
 	offset = local_size;
-	FAIL_IF(push_inst(compiler, STACK_LOAD | S(0) | A(base) | IMM(offset + LR_SAVE_OFFSET)));
+	if (!is_return_to)
+		FAIL_IF(push_inst(compiler, STACK_LOAD | S(0) | A(base) | IMM(offset + LR_SAVE_OFFSET)));
 
 	tmp = SLJIT_FS0 - compiler->fsaveds;
 	for (i = SLJIT_FS0; i > tmp; i--) {
@@ -902,7 +945,8 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
 		FAIL_IF(push_inst(compiler, STACK_LOAD | S(i) | A(base) | IMM(offset)));
 	}
 
-	push_inst(compiler, MTLR | S(0));
+	if (!is_return_to)
+		push_inst(compiler, MTLR | S(0));
 
 	if (local_size > 0)
 		return push_inst(compiler, ADDI | D(SLJIT_SP) | A(base) | IMM(local_size));
@@ -911,17 +955,40 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
 	return push_inst(compiler, OR | S(base) | A(SLJIT_SP) | B(base));
 }
 
+#undef STACK_STORE
+#undef STACK_LOAD
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler)
 {
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_return_void(compiler));
 
-	FAIL_IF(emit_stack_frame_release(compiler));
+	FAIL_IF(emit_stack_frame_release(compiler, 0));
 	return push_inst(compiler, BLR);
 }
 
-#undef STACK_STORE
-#undef STACK_LOAD
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+
+	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_CALL_REG, src, srcw, TMP_CALL_REG));
+		src = TMP_CALL_REG;
+		srcw = 0;
+	} else if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+		FAIL_IF(push_inst(compiler, OR | S(src) | A(TMP_CALL_REG) | B(src)));
+		src = TMP_CALL_REG;
+		srcw = 0;
+	}
+
+	FAIL_IF(emit_stack_frame_release(compiler, 1));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
 
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
@@ -1087,7 +1154,6 @@ static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 inp_flag
 {
 	sljit_ins inst;
 	sljit_s32 offs_reg;
-	sljit_sw high_short;
 
 	/* Should work when (arg & REG_MASK) == 0. */
 	SLJIT_ASSERT(A(0) == 0);
@@ -1098,11 +1164,7 @@ static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 inp_flag
 		offs_reg = OFFS_REG(arg);
 
 		if (argw != 0) {
-#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
-			FAIL_IF(push_inst(compiler, RLWINM | S(OFFS_REG(arg)) | A(tmp_reg) | ((sljit_ins)argw << 11) | ((31 - (sljit_ins)argw) << 1)));
-#else
-			FAIL_IF(push_inst(compiler, RLDI(tmp_reg, OFFS_REG(arg), argw, 63 - argw, 1)));
-#endif
+			FAIL_IF(push_inst(compiler, SLWI_W(argw) | S(OFFS_REG(arg)) | A(tmp_reg)));
 			offs_reg = tmp_reg;
 		}
 
@@ -1110,7 +1172,7 @@ static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 inp_flag
 
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 		SLJIT_ASSERT(!(inst & INT_ALIGNED));
-#endif
+#endif /* SLJIT_CONFIG_PPC_64 */
 
 		return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | B(offs_reg));
 	}
@@ -1125,36 +1187,24 @@ static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 inp_flag
 		inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
 		return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg) | B(tmp_reg));
 	}
-#endif
+#endif /* SLJIT_CONFIG_PPC_64 */
 
 	if (argw <= SIMM_MAX && argw >= SIMM_MIN)
 		return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg) | IMM(argw));
 
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 	if (argw <= 0x7fff7fffl && argw >= -0x80000000l) {
-#endif
-
-		high_short = (sljit_s32)(argw + ((argw & 0x8000) << 1)) & ~0xffff;
-
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-		SLJIT_ASSERT(high_short && high_short <= 0x7fffffffl && high_short >= -0x80000000l);
-#else
-		SLJIT_ASSERT(high_short);
-#endif
-
-		FAIL_IF(push_inst(compiler, ADDIS | D(tmp_reg) | A(arg) | IMM(high_short >> 16)));
+#endif /* SLJIT_CONFIG_PPC_64 */
+		FAIL_IF(push_inst(compiler, ADDIS | D(tmp_reg) | A(arg) | IMM((argw + 0x8000) >> 16)));
 		return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(tmp_reg) | IMM(argw));
-
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 	}
 
-	/* The rest is PPC-64 only. */
-
 	FAIL_IF(load_immediate(compiler, tmp_reg, argw));
 
 	inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
 	return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg) | B(tmp_reg));
-#endif
+#endif /* SLJIT_CONFIG_PPC_64 */
 }
 
 static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 input_flags,
@@ -1277,29 +1327,94 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile
 	return SLJIT_SUCCESS;
 }
 
-static sljit_s32 emit_prefetch(struct sljit_compiler *compiler,
-        sljit_s32 src, sljit_sw srcw)
+static sljit_s32 emit_rev(struct sljit_compiler *compiler, sljit_s32 op_flags,
+	sljit_s32 dst, sljit_sw dstw,
+	sljit_s32 src, sljit_sw srcw)
 {
-	if (!(src & OFFS_REG_MASK)) {
-		if (srcw == 0 && (src & REG_MASK))
-			return push_inst(compiler, DCBT | A(0) | B(src & REG_MASK));
+	sljit_s32 mem, offs_reg, inp_flags;
+	sljit_sw memw;
+	SLJIT_UNUSED_ARG(op_flags);
 
-		FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
-		/* Works with SLJIT_MEM0() case as well. */
-		return push_inst(compiler, DCBT | A(src & REG_MASK) | B(TMP_REG1));
+	if (!((dst | src) & SLJIT_MEM)) {
+		/* Both are registers. */
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+		if (!(op_flags & SLJIT_32)) {
+			FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG2) | A(0) | IMM(TMP_MEM_OFFSET_HI)));
+			FAIL_IF(push_inst(compiler, RLDICL | S(src) | A(TMP_REG1) | RLDI_SH(32) | RLDI_MB(32)));
+			FAIL_IF(push_inst(compiler, STWBRX | S(src) | A(SLJIT_SP) | B(TMP_REG2)));
+			FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG2) | A(0) | IMM(TMP_MEM_OFFSET_LOW)));
+			FAIL_IF(push_inst(compiler, STWBRX | S(TMP_REG1) | A(SLJIT_SP) | B(TMP_REG2)));
+			return push_inst(compiler, LD | D(dst) | A(SLJIT_SP) | TMP_MEM_OFFSET);
+		}
+#endif /* SLJIT_CONFIG_PPC_64 */
+
+		FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG2) | A(0) | IMM(TMP_MEM_OFFSET)));
+		FAIL_IF(push_inst(compiler, STWBRX | S(src) | A(SLJIT_SP) | B(TMP_REG2)));
+		return push_inst(compiler, LWZ | D(dst) | A(SLJIT_SP) | TMP_MEM_OFFSET);
 	}
 
-	srcw &= 0x3;
+	mem = src;
+	memw = srcw;
 
-	if (srcw == 0)
-		return push_inst(compiler, DCBT | A(src & REG_MASK) | B(OFFS_REG(src)));
+	if (dst & SLJIT_MEM) {
+		mem = dst;
+		memw = dstw;
 
-#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
-	FAIL_IF(push_inst(compiler, RLWINM | S(OFFS_REG(src)) | A(TMP_REG1) | ((sljit_ins)srcw << 11) | ((31 - (sljit_ins)srcw) << 1)));
-#else
-	FAIL_IF(push_inst(compiler, RLDI(TMP_REG1, OFFS_REG(src), srcw, 63 - srcw, 1)));
-#endif
-	return push_inst(compiler, DCBT | A(src & REG_MASK) | B(TMP_REG1));
+		if (src & SLJIT_MEM) {
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+			inp_flags = ((op_flags & SLJIT_32) ? INT_DATA : WORD_DATA) | LOAD_DATA;
+#else /* !SLJIT_CONFIG_PPC_64 */
+			inp_flags = WORD_DATA | LOAD_DATA;
+#endif /* SLJIT_CONFIG_PPC_64 */
+			FAIL_IF(emit_op_mem(compiler, inp_flags, TMP_REG1, src, srcw, TMP_REG2));
+			src = TMP_REG1;
+		}
+	}
+
+	if (SLJIT_UNLIKELY(mem & OFFS_REG_MASK)) {
+		offs_reg = OFFS_REG(mem);
+		mem &= REG_MASK;
+		memw &= 0x3;
+
+		if (memw != 0) {
+			FAIL_IF(push_inst(compiler, SLWI_W(memw) | S(offs_reg) | A(TMP_REG2)));
+			offs_reg = TMP_REG2;
+		}
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+	} else if (memw > 0x7fff7fffl || memw < -0x80000000l) {
+		FAIL_IF(load_immediate(compiler, TMP_REG2, memw));
+		offs_reg = TMP_REG2;
+		mem &= REG_MASK;
+#endif /* SLJIT_CONFIG_PPC_64 */
+	} else {
+		FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG2) | A(mem & REG_MASK) | IMM(memw)));
+		if (memw > SIMM_MAX || memw < SIMM_MIN)
+			FAIL_IF(push_inst(compiler, ADDIS | D(TMP_REG2) | A(TMP_REG2) | IMM((memw + 0x8000) >> 16)));
+
+		mem = 0;
+		offs_reg = TMP_REG2;
+	}
+
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+	if (!(op_flags & SLJIT_32)) {
+		if (dst & SLJIT_MEM) {
+			FAIL_IF(push_inst(compiler, STWBRX | S(src) | A(mem) | B(offs_reg)));
+			FAIL_IF(push_inst(compiler, RLDICL | S(src) | A(TMP_REG1) | RLDI_SH(32) | RLDI_MB(32)));
+			FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG2) | A(offs_reg) | IMM(SSIZE_OF(s32))));
+			return push_inst(compiler, STWBRX | S(TMP_REG1) | A(mem) | B(TMP_REG2));
+		}
+
+		FAIL_IF(push_inst(compiler, LWBRX | S(dst) | A(mem) | B(offs_reg)));
+		FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG2) | A(offs_reg) | IMM(SSIZE_OF(s32))));
+		FAIL_IF(push_inst(compiler, LWBRX | S(TMP_REG1) | A(mem) | B(TMP_REG2)));
+		return push_inst(compiler, RLDIMI | S(TMP_REG1) | A(dst) | RLDI_SH(32) | RLDI_MB(0));
+	}
+#endif /* SLJIT_CONFIG_PPC_64 */
+
+	if (dst & SLJIT_MEM)
+		return push_inst(compiler, STWBRX | S(src) | A(mem) | B(offs_reg));
+
+	return push_inst(compiler, LWBRX | S(dst) | A(mem) | B(offs_reg));
 }
 
 #define EMIT_MOV(type, type_flags, type_cast) \
@@ -1322,14 +1437,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 	if (GET_FLAG_TYPE(op_flags) == SLJIT_OVERFLOW)
 		FAIL_IF(push_inst(compiler, MTXER | S(TMP_ZERO)));
 
-	if (op < SLJIT_NOT && FAST_IS_REG(src) && src == dst) {
+	if (op <= SLJIT_MOV_P && FAST_IS_REG(src) && src == dst) {
 		if (!TYPE_CAST_NEEDED(op))
 			return SLJIT_SUCCESS;
 	}
 
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 	if (op_flags & SLJIT_32) {
-		if (op < SLJIT_NOT) {
+		if (op <= SLJIT_MOV_P) {
 			if (src & SLJIT_MEM) {
 				if (op == SLJIT_MOV_S32)
 					op = SLJIT_MOV_U32;
@@ -1379,15 +1494,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 	case SLJIT_MOV_S16:
 		return EMIT_MOV(SLJIT_MOV_S16, HALF_DATA | SIGNED_DATA, (sljit_s16));
 
-	case SLJIT_NOT:
-		return emit_op(compiler, SLJIT_NOT, flags, dst, dstw, TMP_REG1, 0, src, srcw);
-
 	case SLJIT_CLZ:
+	case SLJIT_CTZ:
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-		return emit_op(compiler, SLJIT_CLZ, flags | (!(op_flags & SLJIT_32) ? 0 : ALT_FORM1), dst, dstw, TMP_REG1, 0, src, srcw);
+		return emit_op(compiler, op, flags | (!(op_flags & SLJIT_32) ? 0 : ALT_FORM1), dst, dstw, TMP_REG1, 0, src, srcw);
 #else
-		return emit_op(compiler, SLJIT_CLZ, flags, dst, dstw, TMP_REG1, 0, src, srcw);
+		return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, src, srcw);
 #endif
+	case SLJIT_REV:
+		return emit_rev(compiler, op_flags, dst, dstw, src, srcw);
 	}
 
 	return SLJIT_SUCCESS;
@@ -1533,7 +1648,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 				return emit_op(compiler, SLJIT_ADD, flags | ALT_FORM3, dst, dstw, src2, src2w, TMP_REG2, 0);
 			}
 		}
-		return emit_op(compiler, SLJIT_ADD, flags | ((GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY)) ? ALT_FORM5 : 0), dst, dstw, src1, src1w, src2, src2w);
+		return emit_op(compiler, SLJIT_ADD, flags | ((GET_FLAG_TYPE(op) == SLJIT_CARRY) ? ALT_FORM5 : 0), dst, dstw, src1, src1w, src2, src2w);
 
 	case SLJIT_ADDC:
 		compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD;
@@ -1600,7 +1715,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 		}
 
 		/* We know ALT_SIGN_EXT is set if it is an SLJIT_32 on 64 bit systems. */
-		return emit_op(compiler, SLJIT_SUB, flags | ((GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY)) ? ALT_FORM5 : 0), dst, dstw, src1, src1w, src2, src2w);
+		return emit_op(compiler, SLJIT_SUB, flags | ((GET_FLAG_TYPE(op) == SLJIT_CARRY) ? ALT_FORM5 : 0), dst, dstw, src1, src1w, src2, src2w);
 
 	case SLJIT_SUBC:
 		compiler->status_flags_state = SLJIT_CURRENT_FLAGS_SUB;
@@ -1625,9 +1740,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 			FAIL_IF(push_inst(compiler, MTXER | S(TMP_ZERO)));
 		return emit_op(compiler, SLJIT_MUL, flags, dst, dstw, src1, src1w, src2, src2w);
 
+	case SLJIT_XOR:
+		if ((src2 & SLJIT_IMM) && src2w == -1) {
+			return emit_op(compiler, GET_OPCODE(op), flags | ALT_FORM4, dst, dstw, TMP_REG1, 0, src1, src1w);
+		}
+		if ((src1 & SLJIT_IMM) && src1w == -1) {
+			return emit_op(compiler, GET_OPCODE(op), flags | ALT_FORM4, dst, dstw, TMP_REG1, 0, src2, src2w);
+		}
+		/* fallthrough */
 	case SLJIT_AND:
 	case SLJIT_OR:
-	case SLJIT_XOR:
 		/* Commutative unsigned operations. */
 		if (!HAS_FLAGS(op) || GET_OPCODE(op) == SLJIT_AND) {
 			if (TEST_UL_IMM(src2, src2w)) {
@@ -1661,8 +1783,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 		return emit_op(compiler, GET_OPCODE(op), flags, dst, dstw, src1, src1w, src2, src2w);
 
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 		if (op & SLJIT_32)
 			flags |= ALT_FORM2;
@@ -1692,6 +1819,115 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil
 #undef TEST_SUB_FORM2
 #undef TEST_SUB_FORM3
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst_reg,
+	sljit_s32 src1_reg,
+	sljit_s32 src2_reg,
+	sljit_s32 src3, sljit_sw src3w)
+{
+	sljit_s32 is_right;
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+	sljit_s32 inp_flags = ((op & SLJIT_32) ? INT_DATA : WORD_DATA) | LOAD_DATA;
+	sljit_sw bit_length = (op & SLJIT_32) ? 32 : 64;
+#else /* !SLJIT_CONFIG_PPC_64 */
+	sljit_s32 inp_flags = WORD_DATA | LOAD_DATA;
+	sljit_sw bit_length = 32;
+#endif /* SLJIT_CONFIG_PPC_64 */
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, dst_reg, src1_reg, src2_reg, src3, src3w));
+
+	is_right = (GET_OPCODE(op) == SLJIT_LSHR || GET_OPCODE(op) == SLJIT_MLSHR);
+
+	if (src1_reg == src2_reg) {
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_op2(compiler, (is_right ? SLJIT_ROTR : SLJIT_ROTL) | (op & SLJIT_32), dst_reg, 0, src1_reg, 0, src3, src3w);
+	}
+
+	ADJUST_LOCAL_OFFSET(src3, src3w);
+
+	if (src3 & SLJIT_IMM) {
+		src3w &= bit_length - 1;
+
+		if (src3w == 0)
+			return SLJIT_SUCCESS;
+
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+		if (!(op & SLJIT_32)) {
+			if (is_right) {
+				FAIL_IF(push_inst(compiler, SRDI(src3w) | S(src1_reg) | A(dst_reg)));
+				return push_inst(compiler, RLDIMI | S(src2_reg) | A(dst_reg) | RLDI_SH(64 - src3w) | RLDI_MB(0));
+			}
+
+			FAIL_IF(push_inst(compiler, SLDI(src3w) | S(src1_reg) | A(dst_reg)));
+			/* Computes SRDI(64 - src3w). */
+			FAIL_IF(push_inst(compiler, RLDICL | S(src2_reg) | A(TMP_REG1) | RLDI_SH(src3w) | RLDI_MB(64 - src3w)));
+			return push_inst(compiler, OR | S(dst_reg) | A(dst_reg) | B(TMP_REG1));
+		}
+#endif /* SLJIT_CONFIG_PPC_64 */
+
+		if (is_right) {
+			FAIL_IF(push_inst(compiler, SRWI(src3w) | S(src1_reg) | A(dst_reg)));
+			return push_inst(compiler, RLWIMI | S(src2_reg) | A(dst_reg) | RLWI_SH(32 - src3w) | RLWI_MBE(0, src3w - 1));
+		}
+
+		FAIL_IF(push_inst(compiler, SLWI(src3w) | S(src1_reg) | A(dst_reg)));
+		return push_inst(compiler, RLWIMI | S(src2_reg) | A(dst_reg) | RLWI_SH(src3w) | RLWI_MBE(32 - src3w, 31));
+	}
+
+	if (src3 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, inp_flags, TMP_REG2, src3, src3w, TMP_REG2));
+		src3 = TMP_REG2;
+	}
+
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+	if (!(op & SLJIT_32)) {
+		if (GET_OPCODE(op) == SLJIT_MSHL || GET_OPCODE(op) == SLJIT_MLSHR || dst_reg == src3) {
+			FAIL_IF(push_inst(compiler, ANDI | S(src3) | A(TMP_REG2) | 0x3f));
+			src3 = TMP_REG2;
+		}
+
+		FAIL_IF(push_inst(compiler, (is_right ? SRD : SLD) | S(src1_reg) | A(dst_reg) | B(src3)));
+		FAIL_IF(push_inst(compiler, (is_right ? SLDI(1) : SRDI(1)) | S(src2_reg) | A(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, XORI | S(src3) | A(TMP_REG2) | 0x3f));
+		FAIL_IF(push_inst(compiler, (is_right ? SLD : SRD) | S(TMP_REG1) | A(TMP_REG1) | B(TMP_REG2)));
+		return push_inst(compiler, OR | S(dst_reg) | A(dst_reg) | B(TMP_REG1));
+	}
+#endif /* SLJIT_CONFIG_PPC_64 */
+
+	if (GET_OPCODE(op) == SLJIT_MSHL || GET_OPCODE(op) == SLJIT_MLSHR || dst_reg == src3) {
+		FAIL_IF(push_inst(compiler, ANDI | S(src3) | A(TMP_REG2) | 0x1f));
+		src3 = TMP_REG2;
+	}
+
+	FAIL_IF(push_inst(compiler, (is_right ? SRW : SLW) | S(src1_reg) | A(dst_reg) | B(src3)));
+	FAIL_IF(push_inst(compiler, (is_right ? SLWI(1) : SRWI(1)) | S(src2_reg) | A(TMP_REG1)));
+	FAIL_IF(push_inst(compiler, XORI | S(src3) | A(TMP_REG2) | 0x1f));
+	FAIL_IF(push_inst(compiler, (is_right ? SLW : SRW) | S(TMP_REG1) | A(TMP_REG1) | B(TMP_REG2)));
+	return push_inst(compiler, OR | S(dst_reg) | A(dst_reg) | B(TMP_REG1));
+}
+
+static sljit_s32 emit_prefetch(struct sljit_compiler *compiler,
+        sljit_s32 src, sljit_sw srcw)
+{
+	if (!(src & OFFS_REG_MASK)) {
+		if (srcw == 0 && (src & REG_MASK))
+			return push_inst(compiler, DCBT | A(0) | B(src & REG_MASK));
+
+		FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
+		/* Works with SLJIT_MEM0() case as well. */
+		return push_inst(compiler, DCBT | A(src & REG_MASK) | B(TMP_REG1));
+	}
+
+	srcw &= 0x3;
+
+	if (srcw == 0)
+		return push_inst(compiler, DCBT | A(src & REG_MASK) | B(OFFS_REG(src)));
+
+	FAIL_IF(push_inst(compiler, SLWI_W(srcw) | S(OFFS_REG(src)) | A(TMP_REG1)));
+	return push_inst(compiler, DCBT | A(src & REG_MASK) | B(TMP_REG1));
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -1704,7 +1940,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *comp
 		if (FAST_IS_REG(src))
 			FAIL_IF(push_inst(compiler, MTLR | S(src)));
 		else {
-			FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_REG2, 0, TMP_REG1, 0, src, srcw));
+			FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG2, src, srcw, TMP_REG2));
 			FAIL_IF(push_inst(compiler, MTLR | S(TMP_REG2)));
 		}
 
@@ -1721,6 +1957,34 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *comp
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_dst(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw)
+{
+	sljit_s32 dst_r;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op_dst(compiler, op, dst, dstw));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	switch (op) {
+	case SLJIT_FAST_ENTER:
+		if (FAST_IS_REG(dst))
+			return push_inst(compiler, MFLR | D(dst));
+
+		FAIL_IF(push_inst(compiler, MFLR | D(TMP_REG1)));
+		break;
+	case SLJIT_GET_RETURN_ADDRESS:
+		dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, dst_r, SLJIT_MEM1(SLJIT_SP), compiler->local_size + LR_SAVE_OFFSET, TMP_REG2));
+		break;
+	}
+
+	if (dst & SLJIT_MEM)
+		return emit_op_mem(compiler, WORD_DATA, TMP_REG1, dst, dstw, TMP_REG2);
+
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
@@ -1749,21 +2013,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *c
 #define FLOAT_DATA(op) (DOUBLE_DATA | ((op & SLJIT_32) >> 6))
 #define SELECT_FOP(op, single, double) ((sljit_ins)((op & SLJIT_32) ? single : double))
 
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-#define FLOAT_TMP_MEM_OFFSET (6 * sizeof(sljit_sw))
-#else
-#define FLOAT_TMP_MEM_OFFSET (2 * sizeof(sljit_sw))
-
-#if (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN)
-#define FLOAT_TMP_MEM_OFFSET_LOW (2 * sizeof(sljit_sw))
-#define FLOAT_TMP_MEM_OFFSET_HI (3 * sizeof(sljit_sw))
-#else
-#define FLOAT_TMP_MEM_OFFSET_LOW (3 * sizeof(sljit_sw))
-#define FLOAT_TMP_MEM_OFFSET_HI (2 * sizeof(sljit_sw))
-#endif
-
-#endif /* SLJIT_CONFIG_PPC_64 */
-
 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src, sljit_sw srcw)
@@ -1780,19 +2029,19 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_comp
 
 	if (op == SLJIT_CONV_SW_FROM_F64) {
 		if (FAST_IS_REG(dst)) {
-			FAIL_IF(emit_op_mem(compiler, DOUBLE_DATA, TMP_FREG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, TMP_REG1));
-			return emit_op_mem(compiler, WORD_DATA | LOAD_DATA, dst, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, TMP_REG1);
+			FAIL_IF(push_inst(compiler, STFD | FS(TMP_FREG1) | A(SLJIT_SP) | TMP_MEM_OFFSET));
+			return push_inst(compiler, LD | S(dst) | A(SLJIT_SP) | TMP_MEM_OFFSET);
 		}
 		return emit_op_mem(compiler, DOUBLE_DATA, TMP_FREG1, dst, dstw, TMP_REG1);
 	}
-#else
+#else /* !SLJIT_CONFIG_PPC_64 */
 	FAIL_IF(push_inst(compiler, FCTIWZ | FD(TMP_FREG1) | FB(src)));
-#endif
+#endif /* SLJIT_CONFIG_PPC_64 */
 
 	if (FAST_IS_REG(dst)) {
-		FAIL_IF(load_immediate(compiler, TMP_REG1, FLOAT_TMP_MEM_OFFSET));
+		FAIL_IF(load_immediate(compiler, TMP_REG1, TMP_MEM_OFFSET));
 		FAIL_IF(push_inst(compiler, STFIWX | FS(TMP_FREG1) | A(SLJIT_SP) | B(TMP_REG1)));
-		return emit_op_mem(compiler, INT_DATA | LOAD_DATA, dst, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, TMP_REG1);
+		return push_inst(compiler, LWZ | S(dst) | A(SLJIT_SP) | TMP_MEM_OFFSET);
 	}
 
 	SLJIT_ASSERT(dst & SLJIT_MEM);
@@ -1800,22 +2049,16 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_comp
 	if (dst & OFFS_REG_MASK) {
 		dstw &= 0x3;
 		if (dstw) {
-#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
-			FAIL_IF(push_inst(compiler, RLWINM | S(OFFS_REG(dst)) | A(TMP_REG1) | ((sljit_ins)dstw << 11) | ((31 - (sljit_ins)dstw) << 1)));
-#else
-			FAIL_IF(push_inst(compiler, RLDI(TMP_REG1, OFFS_REG(dst), dstw, 63 - dstw, 1)));
-#endif
+			FAIL_IF(push_inst(compiler, SLWI_W(dstw) | S(OFFS_REG(dst)) | A(TMP_REG1)));
 			dstw = TMP_REG1;
-		}
-		else
+		} else
 			dstw = OFFS_REG(dst);
 	}
 	else {
 		if ((dst & REG_MASK) && !dstw) {
 			dstw = dst & REG_MASK;
 			dst = 0;
-		}
-		else {
+		} else {
 			/* This works regardless we have SLJIT_MEM1 or SLJIT_MEM0. */
 			FAIL_IF(load_immediate(compiler, TMP_REG1, dstw));
 			dstw = TMP_REG1;
@@ -1839,8 +2082,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_comp
 
 		FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
 		src = TMP_REG1;
-	}
-	else if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32) {
+	} else if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32) {
 		if (FAST_IS_REG(src))
 			FAIL_IF(push_inst(compiler, EXTSW | S(src) | A(TMP_REG1)));
 		else
@@ -1849,21 +2091,21 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_comp
 	}
 
 	if (FAST_IS_REG(src)) {
-		FAIL_IF(emit_op_mem(compiler, WORD_DATA, src, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, TMP_REG1));
-		FAIL_IF(emit_op_mem(compiler, DOUBLE_DATA | LOAD_DATA, TMP_FREG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, TMP_REG1));
-	}
-	else
+		FAIL_IF(push_inst(compiler, STD | S(src) | A(SLJIT_SP) | TMP_MEM_OFFSET));
+		FAIL_IF(push_inst(compiler, LFD | FS(TMP_FREG1) | A(SLJIT_SP) | TMP_MEM_OFFSET));
+	} else
 		FAIL_IF(emit_op_mem(compiler, DOUBLE_DATA | LOAD_DATA, TMP_FREG1, src, srcw, TMP_REG1));
 
 	FAIL_IF(push_inst(compiler, FCFID | FD(dst_r) | FB(TMP_FREG1)));
 
 	if (dst & SLJIT_MEM)
 		return emit_op_mem(compiler, FLOAT_DATA(op), TMP_FREG1, dst, dstw, TMP_REG1);
+
 	if (op & SLJIT_32)
 		return push_inst(compiler, FRSP | FD(dst_r) | FB(dst_r));
 	return SLJIT_SUCCESS;
 
-#else
+#else /* !SLJIT_CONFIG_PPC_64 */
 
 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
 	sljit_s32 invert_sign = 1;
@@ -1872,8 +2114,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_comp
 		FAIL_IF(load_immediate(compiler, TMP_REG1, srcw ^ (sljit_sw)0x80000000));
 		src = TMP_REG1;
 		invert_sign = 0;
-	}
-	else if (!FAST_IS_REG(src)) {
+	} else if (!FAST_IS_REG(src)) {
 		FAIL_IF(emit_op_mem(compiler, WORD_DATA | SIGNED_DATA | LOAD_DATA, TMP_REG1, src, srcw, TMP_REG1));
 		src = TMP_REG1;
 	}
@@ -1882,16 +2123,16 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_comp
 	   The double precision format has exactly 53 bit precision, so the lower 32 bit represents
 	   the lower 32 bit of such value. The result of xor 2^31 is the same as adding 0x80000000
 	   to the input, which shifts it into the 0 - 0xffffffff range. To get the converted floating
-	   point value, we need to substract 2^53 + 2^31 from the constructed value. */
+	   point value, we need to subtract 2^53 + 2^31 from the constructed value. */
 	FAIL_IF(push_inst(compiler, ADDIS | D(TMP_REG2) | A(0) | 0x4330));
 	if (invert_sign)
 		FAIL_IF(push_inst(compiler, XORIS | S(src) | A(TMP_REG1) | 0x8000));
-	FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG2, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET_HI, TMP_REG1));
-	FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET_LOW, TMP_REG2));
+	FAIL_IF(push_inst(compiler, STW | S(TMP_REG2) | A(SLJIT_SP) | TMP_MEM_OFFSET_HI));
+	FAIL_IF(push_inst(compiler, STW | S(TMP_REG1) | A(SLJIT_SP) | TMP_MEM_OFFSET_LOW));
 	FAIL_IF(push_inst(compiler, ADDIS | D(TMP_REG1) | A(0) | 0x8000));
-	FAIL_IF(emit_op_mem(compiler, DOUBLE_DATA | LOAD_DATA, TMP_FREG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, TMP_REG1));
-	FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET_LOW, TMP_REG2));
-	FAIL_IF(emit_op_mem(compiler, DOUBLE_DATA | LOAD_DATA, TMP_FREG2, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, TMP_REG1));
+	FAIL_IF(push_inst(compiler, LFD | FS(TMP_FREG1) | A(SLJIT_SP) | TMP_MEM_OFFSET));
+	FAIL_IF(push_inst(compiler, STW | S(TMP_REG1) | A(SLJIT_SP) | TMP_MEM_OFFSET_LOW));
+	FAIL_IF(push_inst(compiler, LFD | FS(TMP_FREG2) | A(SLJIT_SP) | TMP_MEM_OFFSET));
 
 	FAIL_IF(push_inst(compiler, FSUB | FD(dst_r) | FA(TMP_FREG1) | FB(TMP_FREG2)));
 
@@ -1901,7 +2142,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_comp
 		return push_inst(compiler, FRSP | FD(dst_r) | FB(dst_r));
 	return SLJIT_SUCCESS;
 
-#endif
+#endif /* SLJIT_CONFIG_PPC_64 */
 }
 
 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
@@ -1922,13 +2163,10 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compile
 
 	switch (GET_FLAG_TYPE(op)) {
 	case SLJIT_UNORDERED_OR_EQUAL:
-	case SLJIT_ORDERED_NOT_EQUAL:
 		return push_inst(compiler, CROR | ((4 + 2) << 21) | ((4 + 2) << 16) | ((4 + 3) << 11));
 	case SLJIT_UNORDERED_OR_LESS:
-	case SLJIT_ORDERED_GREATER_EQUAL:
 		return push_inst(compiler, CROR | ((4 + 0) << 21) | ((4 + 0) << 16) | ((4 + 3) << 11));
 	case SLJIT_UNORDERED_OR_GREATER:
-	case SLJIT_ORDERED_LESS_EQUAL:
 		return push_inst(compiler, CROR | ((4 + 1) << 21) | ((4 + 1) << 16) | ((4 + 3) << 11));
 	}
 
@@ -2036,24 +2274,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compil
 
 #undef SELECT_FOP
 
-/* --------------------------------------------------------------------- */
-/*  Other instructions                                                   */
-/* --------------------------------------------------------------------- */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
-
-	if (FAST_IS_REG(dst))
-		return push_inst(compiler, MFLR | D(dst));
-
-	/* Memory. */
-	FAIL_IF(push_inst(compiler, MFLR | D(TMP_REG2)));
-	return emit_op(compiler, SLJIT_MOV, WORD_DATA, dst, dstw, TMP_REG1, 0, TMP_REG2, 0);
-}
-
 /* --------------------------------------------------------------------- */
 /*  Conditional instructions                                             */
 /* --------------------------------------------------------------------- */
@@ -2204,7 +2424,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compile
 #endif
 
 	if (type & SLJIT_CALL_RETURN) {
-		PTR_FAIL_IF(emit_stack_frame_release(compiler));
+		PTR_FAIL_IF(emit_stack_frame_release(compiler, 0));
 		type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
 	}
 
@@ -2219,7 +2439,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compi
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
-	ADJUST_LOCAL_OFFSET(src, srcw);
 
 	if (FAST_IS_REG(src)) {
 #if (defined SLJIT_PASS_ENTRY_ADDR_TO_CALL && SLJIT_PASS_ENTRY_ADDR_TO_CALL)
@@ -2246,9 +2465,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compi
 
 		FAIL_IF(emit_const(compiler, TMP_CALL_REG, 0));
 		src_r = TMP_CALL_REG;
-	}
-	else {
-		FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_CALL_REG, 0, TMP_REG1, 0, src, srcw));
+	} else {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_CALL_REG, src, srcw, TMP_CALL_REG));
 		src_r = TMP_CALL_REG;
 	}
 
@@ -2267,17 +2486,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 
 	if (src & SLJIT_MEM) {
 		ADJUST_LOCAL_OFFSET(src, srcw);
-		FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_CALL_REG, 0, TMP_REG1, 0, src, srcw));
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_CALL_REG, src, srcw, TMP_CALL_REG));
 		src = TMP_CALL_REG;
 	}
 
 	if (type & SLJIT_CALL_RETURN) {
-		if (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0) {
+		if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
 			FAIL_IF(push_inst(compiler, OR | S(src) | A(TMP_CALL_REG) | B(src)));
 			src = TMP_CALL_REG;
 		}
 
-		FAIL_IF(emit_stack_frame_release(compiler));
+		FAIL_IF(emit_stack_frame_release(compiler, 0));
 		type = SLJIT_JUMP;
 	}
 
@@ -2425,7 +2644,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co
 	}
 
 	FAIL_IF(push_inst(compiler, (from_xer ? MFXER : MFCR) | D(reg)));
-	FAIL_IF(push_inst(compiler, RLWINM | S(reg) | A(reg) | ((1 + bit) << 11) | (31 << 6) | (31 << 1)));
+	/* Simplified mnemonics: extrwi. */
+	FAIL_IF(push_inst(compiler, RLWINM | S(reg) | A(reg) | RLWI_SH(1 + bit) | RLWI_MBE(31, 31)));
 
 	if (invert)
 		FAIL_IF(push_inst(compiler, XORI | S(reg) | A(reg) | 0x1));
@@ -2453,18 +2673,94 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
 	return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);;
 }
 
+#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
+
+#define EMIT_MEM_LOAD_IMM(inst, mem, memw) \
+	((sljit_s16)(memw) > SIMM_MAX - SSIZE_OF(sw))
+
+#else /* !SLJIT_CONFIG_PPC_32 */
+
+#define EMIT_MEM_LOAD_IMM(inst, mem, memw) \
+	((((inst) & INT_ALIGNED) && ((memw) & 0x3) != 0) \
+		|| ((sljit_s16)(memw) > SIMM_MAX - SSIZE_OF(sw)) \
+		|| ((memw) > 0x7fff7fffl || (memw) < -0x80000000l)) \
+
+#endif /* SLJIT_CONFIG_PPC_32 */
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 reg,
 	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_ins inst;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+
+	if (!(reg & REG_PAIR_MASK))
+		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+
+	ADJUST_LOCAL_OFFSET(mem, memw);
+
+	inst = data_transfer_insts[WORD_DATA | ((type & SLJIT_MEM_STORE) ? 0 : LOAD_DATA)];
+
+	if (SLJIT_UNLIKELY(mem & OFFS_REG_MASK)) {
+		memw &= 0x3;
+
+		if (memw != 0) {
+			FAIL_IF(push_inst(compiler, SLWI_W(memw) | S(OFFS_REG(mem)) | A(TMP_REG1)));
+			FAIL_IF(push_inst(compiler, ADD | D(TMP_REG1) | A(TMP_REG1) | B(mem & REG_MASK)));
+		} else
+			FAIL_IF(push_inst(compiler, ADD | D(TMP_REG1) | A(mem & REG_MASK) | B(OFFS_REG(mem))));
+
+		mem = TMP_REG1;
+		memw = 0;
+	} else {
+		if (EMIT_MEM_LOAD_IMM(inst, mem, memw)) {
+			if ((mem & REG_MASK) != 0) {
+				SLJIT_SKIP_CHECKS(compiler);
+				FAIL_IF(sljit_emit_op2(compiler, SLJIT_ADD, TMP_REG1, 0, mem & REG_MASK, 0, SLJIT_IMM, memw));
+			} else
+				FAIL_IF(load_immediate(compiler, TMP_REG1, memw));
+
+			memw = 0;
+			mem = TMP_REG1;
+		} else if (memw > SIMM_MAX || memw < SIMM_MIN) {
+			FAIL_IF(push_inst(compiler, ADDIS | D(TMP_REG1) | A(mem & REG_MASK) | IMM((memw + 0x8000) >> 16)));
+
+			memw &= 0xffff;
+			mem = TMP_REG1;
+		} else {
+			memw &= 0xffff;
+			mem &= REG_MASK;
+		}
+	}
+
+	SLJIT_ASSERT((memw >= 0 && memw <= SIMM_MAX - SSIZE_OF(sw)) || (memw >= 0x8000 && memw <= 0xffff));
+
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+	inst &= (sljit_ins)~INT_ALIGNED;
+#endif /* SLJIT_CONFIG_PPC_64 */
+
+	if (!(type & SLJIT_MEM_STORE) && mem == REG_PAIR_FIRST(reg)) {
+		FAIL_IF(push_inst(compiler, inst | D(REG_PAIR_SECOND(reg)) | A(mem) | IMM(memw + SSIZE_OF(sw))));
+		return push_inst(compiler, inst | D(REG_PAIR_FIRST(reg)) | A(mem) | IMM(memw));
+	}
+
+	FAIL_IF(push_inst(compiler, inst | D(REG_PAIR_FIRST(reg)) | A(mem) | IMM(memw)));
+	return push_inst(compiler, inst | D(REG_PAIR_SECOND(reg)) | A(mem) | IMM(memw + SSIZE_OF(sw)));
+}
+
+#undef EMIT_MEM_LOAD_IMM
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
 {
 	sljit_s32 mem_flags;
 	sljit_ins inst;
 
 	CHECK_ERROR();
-	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
-
-	if (type & SLJIT_MEM_UNALIGNED)
-		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+	CHECK(check_sljit_emit_mem_update(compiler, type, reg, mem, memw));
 
 	if (type & SLJIT_MEM_POST)
 		return SLJIT_ERR_UNSUPPORTED;
@@ -2552,7 +2848,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 	return SLJIT_SUCCESS;
 }
 
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 freg,
 	sljit_s32 mem, sljit_sw memw)
 {
@@ -2560,10 +2856,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compil
 	sljit_ins inst;
 
 	CHECK_ERROR();
-	CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
-
-	if (type & SLJIT_MEM_UNALIGNED)
-		return sljit_emit_fmem_unaligned(compiler, type, freg, mem, memw);
+	CHECK(check_sljit_emit_fmem_update(compiler, type, freg, mem, memw));
 
 	if (type & SLJIT_MEM_POST)
 		return SLJIT_ERR_UNSUPPORTED;
@@ -2611,7 +2904,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compi
 	PTR_FAIL_IF(emit_const(compiler, dst_r, init_value));
 
 	if (dst & SLJIT_MEM)
-		PTR_FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, dst, dstw, TMP_REG1, 0, TMP_REG2, 0));
+		PTR_FAIL_IF(emit_op_mem(compiler, WORD_DATA, dst_r, dst, dstw, TMP_REG1));
 
 	return const_;
 }
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeRISCV_32.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeRISCV_32.c
index 24b8dc3905..4490be2aaf 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeRISCV_32.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeRISCV_32.c
@@ -27,6 +27,7 @@
 static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_r, sljit_sw imm, sljit_s32 tmp_r)
 {
 	SLJIT_UNUSED_ARG(tmp_r);
+	SLJIT_ASSERT(dst_r != tmp_r);
 
 	if (imm <= SIMM_MAX && imm >= SIMM_MIN)
 		return push_inst(compiler, ADDI | RD(dst_r) | RS1(TMP_ZERO) | IMM_I(imm));
@@ -42,6 +43,51 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_r
 	return push_inst(compiler, ADDI | RD(dst_r) | RS1(dst_r) | IMM_I(imm));
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg)
+{
+	sljit_ins inst;
+	sljit_s32 reg2 = 0;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fcopy(compiler, op, freg, reg));
+
+	if (op & SLJIT_32) {
+		if (op == SLJIT_COPY32_TO_F32)
+			inst = FMV_W_X | RS1(reg) | FRD(freg);
+		else
+			inst = FMV_X_W | FRS1(freg) | RD(reg);
+
+		return push_inst(compiler, inst);
+	}
+
+	FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RS1(SLJIT_SP) | IMM_I(-16)));
+
+	if (reg & REG_PAIR_MASK) {
+		reg2 = REG_PAIR_SECOND(reg);
+		reg = REG_PAIR_FIRST(reg);
+	}
+
+	if (op == SLJIT_COPY_TO_F64) {
+		if (reg2 != 0)
+			FAIL_IF(push_inst(compiler, SW | RS1(SLJIT_SP) | RS2(reg2) | (8 << 7)));
+		else
+			FAIL_IF(push_inst(compiler, FSW | RS1(SLJIT_SP) | FRS2(freg) | (8 << 7)));
+
+		FAIL_IF(push_inst(compiler, SW | RS1(SLJIT_SP) | RS2(reg) | (12 << 7)));
+		FAIL_IF(push_inst(compiler, FLD | FRD(freg) | RS1(SLJIT_SP) | IMM_I(8)));
+	} else {
+		FAIL_IF(push_inst(compiler, FSD | RS1(SLJIT_SP) | FRS2(freg) | (8 << 7)));
+
+		if (reg2 != 0)
+			FAIL_IF(push_inst(compiler, FMV_X_W | FRS1(freg) | RD(reg2)));
+
+		FAIL_IF(push_inst(compiler, LW | RD(reg) | RS1(SLJIT_SP) | IMM_I(12)));
+	}
+
+	return push_inst(compiler, ADDI | RD(SLJIT_SP) | RS1(SLJIT_SP) | IMM_I(16));
+}
+
 static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw init_value, sljit_ins last_ins)
 {
 	if ((init_value & 0x800) != 0)
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeRISCV_64.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeRISCV_64.c
index 16a5f5f557..f93d6ff667 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeRISCV_64.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeRISCV_64.c
@@ -28,6 +28,8 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_r
 {
 	sljit_sw high;
 
+	SLJIT_ASSERT(dst_r != tmp_r);
+
 	if (imm <= SIMM_MAX && imm >= SIMM_MIN)
 		return push_inst(compiler, ADDI | RD(dst_r) | RS1(TMP_ZERO) | IMM_I(imm));
 
@@ -124,6 +126,25 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_r
 	return push_inst(compiler, XOR | RD(dst_r) | RS1(dst_r) | RS2(tmp_r));
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg)
+{
+	sljit_ins inst;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fcopy(compiler, op, freg, reg));
+
+	if (GET_OPCODE(op) == SLJIT_COPY_TO_F64)
+		inst = FMV_W_X | RS1(reg) | FRD(freg);
+	else
+		inst = FMV_X_W | FRS1(freg) | RD(reg);
+
+	if (!(op & SLJIT_32))
+		inst |= (sljit_ins)1 << 25;
+
+	return push_inst(compiler, inst);
+}
+
 static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw init_value, sljit_ins last_ins)
 {
 	sljit_sw high;
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeRISCV_common.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeRISCV_common.c
index cc7d4a0e7b..473e06040a 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeRISCV_common.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeRISCV_common.c
@@ -97,16 +97,19 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
 #define FLD		(F3(0x3) | OPC(0x7))
 #define FLE_S		(F7(0x50) | F3(0x0) | OPC(0x53))
 #define FLT_S		(F7(0x50) | F3(0x1) | OPC(0x53))
-#define FSD		(F3(0x3) | OPC(0x27))
 /* These conversion opcodes are partly defined. */
 #define FCVT_S_D	(F7(0x20) | OPC(0x53))
 #define FCVT_S_W	(F7(0x68) | OPC(0x53))
 #define FCVT_W_S	(F7(0x60) | F3(0x1) | OPC(0x53))
 #define FMUL_S		(F7(0x8) | F3(0x7) | OPC(0x53))
+#define FMV_X_W		(F7(0x70) | F3(0x0) | OPC(0x53))
+#define FMV_W_X		(F7(0x78) | F3(0x0) | OPC(0x53))
+#define FSD		(F3(0x3) | OPC(0x27))
 #define FSGNJ_S		(F7(0x10) | F3(0x0) | OPC(0x53))
 #define FSGNJN_S	(F7(0x10) | F3(0x1) | OPC(0x53))
 #define FSGNJX_S	(F7(0x10) | F3(0x2) | OPC(0x53))
 #define FSUB_S		(F7(0x4) | F3(0x7) | OPC(0x53))
+#define FSW		(F3(0x2) | OPC(0x27))
 #define JAL		(OPC(0x6f))
 #define JALR		(F3(0x0) | OPC(0x67))
 #define LD		(F3(0x3) | OPC(0x3))
@@ -236,7 +239,7 @@ static SLJIT_INLINE sljit_ins* detect_jump_type(struct sljit_jump *jump, sljit_i
 
 		jump->flags |= PATCH_ABS44;
 		inst[3] = inst[0];
-		return inst + 4;
+		return inst + 3;
 	}
 
 	if (target_addr <= S52_MAX) {
@@ -531,7 +534,18 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
 {
 	switch (feature_type) {
 	case SLJIT_HAS_FPU:
+#ifdef SLJIT_IS_FPU_AVAILABLE
+		return SLJIT_IS_FPU_AVAILABLE;
+#elif defined(__riscv_float_abi_soft)
+		return 0;
+#else
+		return 1;
+#endif /* SLJIT_IS_FPU_AVAILABLE */
 	case SLJIT_HAS_ZERO_REGISTER:
+	case SLJIT_HAS_COPY_F32:
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	case SLJIT_HAS_COPY_F64:
+#endif /* SLJIT_CONFIG_RISCV_64 */
 		return 1;
 	default:
 		return 0;
@@ -592,6 +606,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type)
 
 #define STACK_MAX_DISTANCE (-SIMM_MIN)
 
+static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw);
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
 	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
@@ -608,10 +624,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
 	if (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) {
 		if ((local_size & SSIZE_OF(sw)) != 0)
 			local_size += SSIZE_OF(sw);
-		local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+		local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64);
 	}
 #else
-	local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+	local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64);
 #endif
 	local_size = (local_size + SLJIT_LOCALS_OFFSET + 15) & ~0xf;
 	compiler->local_size = local_size;
@@ -702,10 +718,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 	if (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) {
 		if ((local_size & SSIZE_OF(sw)) != 0)
 			local_size += SSIZE_OF(sw);
-		local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+		local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64);
 	}
 #else
-	local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+	local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64);
 #endif
 	compiler->local_size = (local_size + SLJIT_LOCALS_OFFSET + 15) & ~0xf;
 
@@ -714,7 +730,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 
 #define STACK_MAX_DISTANCE (-SIMM_MIN - 16)
 
-static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
+static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 is_return_to)
 {
 	sljit_s32 i, tmp, offset;
 	sljit_s32 local_size = compiler->local_size;
@@ -734,7 +750,8 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
 	SLJIT_ASSERT(local_size > 0);
 
 	offset = local_size - SSIZE_OF(sw);
-	FAIL_IF(push_inst(compiler, STACK_LOAD | RD(RETURN_ADDR_REG) | RS1(SLJIT_SP) | IMM_I(offset)));
+	if (!is_return_to)
+		FAIL_IF(push_inst(compiler, STACK_LOAD | RD(RETURN_ADDR_REG) | RS1(SLJIT_SP) | IMM_I(offset)));
 
 	tmp = SLJIT_S0 - compiler->saveds;
 	for (i = SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options); i > tmp; i--) {
@@ -774,10 +791,33 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_return_void(compiler));
 
-	FAIL_IF(emit_stack_frame_release(compiler));
+	FAIL_IF(emit_stack_frame_release(compiler, 0));
 	return push_inst(compiler, JALR | RD(TMP_ZERO) | RS1(RETURN_ADDR_REG) | IMM_I(0));
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+	/* Releases the stack frame and then jumps to src. A target read from memory, or held in a saved register at or below SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT, is staged in TMP_REG1 before the release. */
+	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw));
+		src = TMP_REG1;
+		srcw = 0;
+	} else if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+		FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG1) | RS1(src) | IMM_I(0)));
+		src = TMP_REG1;
+		srcw = 0;
+	}
+	/* is_return_to = 1: the frame release does not reload RETURN_ADDR_REG. */
+	FAIL_IF(emit_stack_frame_release(compiler, 1));
+	/* The jump itself is emitted through the public ijump entry point; SLJIT_SKIP_CHECKS presumably suppresses its argument checks. */
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */
@@ -980,23 +1020,22 @@ static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, s
 
 		if (SLJIT_UNLIKELY(argw)) {
 			FAIL_IF(push_inst(compiler, SLLI | RD(tmp_r) | RS1(OFFS_REG(arg)) | IMM_I(argw)));
-			FAIL_IF(push_inst(compiler, ADD | RD(tmp_r) | RS1(base) | RS2(tmp_r)));
+			FAIL_IF(push_inst(compiler, ADD | RD(tmp_r) | RS1(tmp_r) | RS2(base)));
 		}
 		else
 			FAIL_IF(push_inst(compiler, ADD | RD(tmp_r) | RS1(base) | RS2(OFFS_REG(arg))));
-		return push_mem_inst(compiler, flags, reg, tmp_r, 0);
+
+		argw = 0;
+	} else {
+		FAIL_IF(load_immediate(compiler, tmp_r, TO_ARGW_HI(argw), TMP_REG3));
+
+		if (base != 0)
+			FAIL_IF(push_inst(compiler, ADD | RD(tmp_r) | RS1(tmp_r) | RS2(base)));
 	}
 
-	FAIL_IF(load_immediate(compiler, tmp_r, TO_ARGW_HI(argw), TMP_REG3));
-
-	if (base != 0)
-		FAIL_IF(push_inst(compiler, ADD | RD(tmp_r) | RS1(base) | RS2(tmp_r)));
-
 	return push_mem_inst(compiler, flags, reg, tmp_r, argw & 0xfff);
 }
 
-#undef TO_ARGW_HI
-
 static SLJIT_INLINE sljit_s32 emit_op_mem2(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg1, sljit_sw arg1w, sljit_s32 arg2, sljit_sw arg2w)
 {
 	if (getput_arg_fast(compiler, flags, reg, arg1, arg1w))
@@ -1006,12 +1045,101 @@ static SLJIT_INLINE sljit_s32 emit_op_mem2(struct sljit_compiler *compiler, slji
 
 #if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
 #define WORD 0
+#define WORD_32 0
 #define IMM_EXTEND(v) (IMM_I(v))
 #else /* !SLJIT_CONFIG_RISCV_32 */
 #define WORD word
+#define WORD_32 0x08
 #define IMM_EXTEND(v) (IMM_I((op & SLJIT_32) ? (v) : (32 + (v))))
 #endif /* SLJIT_CONFIG_RISCV_32 */
 
+static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw src)
+{
+	sljit_s32 is_clz = (GET_OPCODE(op) == SLJIT_CLZ);
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	sljit_ins word = (op & SLJIT_32) >> 5;
+	sljit_ins max = (op & SLJIT_32) ? 32 : 64;
+#else /* !SLJIT_CONFIG_RISCV_64 */
+	sljit_ins max = 32;
+#endif /* SLJIT_CONFIG_RISCV_64 */
+	/* Emits an inline sequence that counts the leading (SLJIT_CLZ) or trailing (otherwise) zero bits of src into dst; max is the operand width in bits. */
+	SLJIT_ASSERT(WORD == 0 || WORD == 0x8);
+
+	/* The OTHER_FLAG is the counter. */
+	FAIL_IF(push_inst(compiler, ADDI | WORD | RD(OTHER_FLAG) | RS1(TMP_ZERO) | IMM_I(max)));
+
+	/* The TMP_REG2 is the next value. */
+	if (src != TMP_REG2)
+		FAIL_IF(push_inst(compiler, ADDI | WORD | RD(TMP_REG2) | RS1(src) | IMM_I(0)));
+	/* Zero input: branch to the final move while the counter still holds max. The branch distance depends on is_clz because the CTZ path below emits one more instruction. */
+	FAIL_IF(push_inst(compiler, BEQ | RS1(TMP_REG2) | RS2(TMP_ZERO) | ((sljit_ins)((is_clz ? 4 : 5) * SSIZE_OF(ins)) << 7) | ((sljit_ins)(8 * SSIZE_OF(ins)) << 20)));
+	/* Non-zero input: restart the counter at 0 and leave early when the first inspected bit (bit 0 for CTZ, the sign bit for CLZ) is already set. */
+	FAIL_IF(push_inst(compiler, ADDI | WORD | RD(OTHER_FLAG) | RS1(TMP_ZERO) | IMM_I(0)));
+	if (!is_clz) {
+		FAIL_IF(push_inst(compiler, ANDI | RD(TMP_REG1) | RS1(TMP_REG2) | IMM_I(1)));
+		FAIL_IF(push_inst(compiler, BNE | RS1(TMP_REG1) | RS2(TMP_ZERO) | ((sljit_ins)(2 * SSIZE_OF(ins)) << 7) | ((sljit_ins)(8 * SSIZE_OF(ins)) << 20)));
+	} else
+		FAIL_IF(push_inst(compiler, BLT | RS1(TMP_REG2) | RS2(TMP_ZERO) | ((sljit_ins)(2 * SSIZE_OF(ins)) << 7) | ((sljit_ins)(8 * SSIZE_OF(ins)) << 20)));
+
+	/* The TMP_REG1 is the next shift. */
+	FAIL_IF(push_inst(compiler, ADDI | WORD | RD(TMP_REG1) | RS1(TMP_ZERO) | IMM_I(max)));
+
+	FAIL_IF(push_inst(compiler, ADDI | WORD | RD(EQUAL_FLAG) | RS1(TMP_REG2) | IMM_I(0)));
+	FAIL_IF(push_inst(compiler, SRLI | WORD | RD(TMP_REG1) | RS1(TMP_REG1) | IMM_I(1)));
+	/* Search loop: the shift in TMP_REG1 is halved on each pass, EQUAL_FLAG holds the value being tested, and OTHER_FLAG collects the shifts that were applied. Both branches below jump backwards; their offsets are hand-encoded, so the instruction count here must not change. */
+	FAIL_IF(push_inst(compiler, (is_clz ? SRL : SLL) | WORD | RD(TMP_REG2) | RS1(EQUAL_FLAG) | RS2(TMP_REG1)));
+	FAIL_IF(push_inst(compiler, BNE | RS1(TMP_REG2) | RS2(TMP_ZERO) | ((sljit_ins)0xfe000e80 - ((2 * SSIZE_OF(ins)) << 7))));
+	FAIL_IF(push_inst(compiler, ADDI | WORD | RD(TMP_REG2) | RS1(TMP_REG1) | IMM_I(-1)));
+	FAIL_IF(push_inst(compiler, (is_clz ? SRL : SLL) | WORD | RD(TMP_REG2) | RS1(EQUAL_FLAG) | RS2(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, OR | RD(OTHER_FLAG) | RS1(OTHER_FLAG) | RS2(TMP_REG1)));
+	FAIL_IF(push_inst(compiler, BEQ | RS1(TMP_REG2) | RS2(TMP_ZERO) | ((sljit_ins)0xfe000e80 - ((5 * SSIZE_OF(ins)) << 7))));
+
+	return push_inst(compiler, ADDI | WORD | RD(dst) | RS1(OTHER_FLAG) | IMM_I(0));
+}
+
+static sljit_s32 emit_rev(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw src)
+{
+	SLJIT_UNUSED_ARG(op);
+	/* Emits a shift-and-mask sequence that reverses the byte order of src into dst. op is only read in the RV64 build, hence the unused-argument marker above. */
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	if (!(op & SLJIT_32)) {
+		FAIL_IF(push_inst(compiler, LUI | RD(OTHER_FLAG) | 0x10000));
+		FAIL_IF(push_inst(compiler, SRLI | RD(TMP_REG1) | RS1(src) | IMM_I(32)));
+		FAIL_IF(push_inst(compiler, ADDI | RD(OTHER_FLAG) | RS1(OTHER_FLAG) | IMM_I(0xfff)));
+		FAIL_IF(push_inst(compiler, SLLI | RD(dst) | RS1(src) | IMM_I(32)));
+		FAIL_IF(push_inst(compiler, SLLI | RD(EQUAL_FLAG) | RS1(OTHER_FLAG) | IMM_I(32)));
+		FAIL_IF(push_inst(compiler, OR | RD(dst) | RS1(dst) | RS2(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, OR | RD(OTHER_FLAG) | RS1(OTHER_FLAG) | RS2(EQUAL_FLAG)));
+		/* The 32-bit halves are swapped above and OTHER_FLAG holds the mask; next swap the 16-bit units inside each half and derive the byte mask. */
+		FAIL_IF(push_inst(compiler, SRLI | RD(TMP_REG1) | RS1(dst) | IMM_I(16)));
+		FAIL_IF(push_inst(compiler, AND | RD(dst) | RS1(dst) | RS2(OTHER_FLAG)));
+		FAIL_IF(push_inst(compiler, AND | RD(TMP_REG1) | RS1(TMP_REG1) | RS2(OTHER_FLAG)));
+		FAIL_IF(push_inst(compiler, SLLI | RD(EQUAL_FLAG) | RS1(OTHER_FLAG) | IMM_I(8)));
+		FAIL_IF(push_inst(compiler, SLLI | RD(dst) | RS1(dst) | IMM_I(16)));
+		FAIL_IF(push_inst(compiler, XOR | RD(OTHER_FLAG) | RS1(OTHER_FLAG) | RS2(EQUAL_FLAG)));
+		FAIL_IF(push_inst(compiler, OR | RD(dst) | RS1(dst) | RS2(TMP_REG1)));
+		/* Finally swap the two bytes inside each 16-bit unit. */
+		FAIL_IF(push_inst(compiler, SRLI | RD(TMP_REG1) | RS1(dst) | IMM_I(8)));
+		FAIL_IF(push_inst(compiler, AND | RD(dst) | RS1(dst) | RS2(OTHER_FLAG)));
+		FAIL_IF(push_inst(compiler, AND | RD(TMP_REG1) | RS1(TMP_REG1) | RS2(OTHER_FLAG)));
+		FAIL_IF(push_inst(compiler, SLLI | RD(dst) | RS1(dst) | IMM_I(8)));
+		return push_inst(compiler, OR | RD(dst) | RS1(dst) | RS2(TMP_REG1));
+	}
+#endif /* SLJIT_CONFIG_RISCV_64 */
+	/* 32-bit variant: swap the 16-bit halves, then the bytes inside each half. WORD_32 is 0 on RV32 and 0x08 on RV64. */
+	FAIL_IF(push_inst(compiler, SRLI | WORD_32 | RD(TMP_REG1) | RS1(src) | IMM_I(16)));
+	FAIL_IF(push_inst(compiler, LUI | RD(OTHER_FLAG) | 0xff0000));
+	FAIL_IF(push_inst(compiler, SLLI | WORD_32 | RD(dst) | RS1(src) | IMM_I(16)));
+	FAIL_IF(push_inst(compiler, ORI | RD(OTHER_FLAG) | RS1(OTHER_FLAG) | IMM_I(0xff)));
+	FAIL_IF(push_inst(compiler, OR | RD(dst) | RS1(dst) | RS2(TMP_REG1)));
+
+	FAIL_IF(push_inst(compiler, SRLI | WORD_32 | RD(TMP_REG1) | RS1(dst) | IMM_I(8)));
+	FAIL_IF(push_inst(compiler, AND | RD(dst) | RS1(dst) | RS2(OTHER_FLAG)));
+	FAIL_IF(push_inst(compiler, AND | RD(TMP_REG1) | RS1(TMP_REG1) | RS2(OTHER_FLAG)));
+	FAIL_IF(push_inst(compiler, SLLI | WORD_32 | RD(dst) | RS1(dst) | IMM_I(8)));
+	return push_inst(compiler, OR | RD(dst) | RS1(dst) | RS2(TMP_REG1));
+}
+
 #define EMIT_LOGICAL(op_imm, op_reg) \
 	if (flags & SRC2_IMM) { \
 		if (op & SLJIT_SET_Z) \
@@ -1026,30 +1154,21 @@ static SLJIT_INLINE sljit_s32 emit_op_mem2(struct sljit_compiler *compiler, slji
 			FAIL_IF(push_inst(compiler, op_reg | RD(dst) | RS1(src1) | RS2(src2))); \
 	}
 
-#define EMIT_SHIFT(op_imm, op_reg) \
-	if (flags & SRC2_IMM) { \
-		if (op & SLJIT_SET_Z) \
-			FAIL_IF(push_inst(compiler, op_imm | WORD | RD(EQUAL_FLAG) | RS1(src1) | IMM_I(src2))); \
-		if (!(flags & UNUSED_DEST)) \
-			FAIL_IF(push_inst(compiler, op_imm | WORD | RD(dst) | RS1(src1) | IMM_I(src2))); \
-	} \
-	else { \
-		if (op & SLJIT_SET_Z) \
-			FAIL_IF(push_inst(compiler, op_reg | WORD | RD(EQUAL_FLAG) | RS1(src1) | RS2(src2))); \
-		if (!(flags & UNUSED_DEST)) \
-			FAIL_IF(push_inst(compiler, op_reg | WORD | RD(dst) | RS1(src1) | RS2(src2))); \
-	}
+#define EMIT_SHIFT(imm, reg) \
+	op_imm = (imm); \
+	op_reg = (reg);
 
 static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
 	sljit_s32 dst, sljit_s32 src1, sljit_sw src2)
 {
 	sljit_s32 is_overflow, is_carry, carry_src_r, is_handled;
+	sljit_ins op_imm, op_reg;
 #if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
 	sljit_ins word = (op & SLJIT_32) >> 5;
-
-	SLJIT_ASSERT(word == 0 || word == 0x8);
 #endif /* SLJIT_CONFIG_RISCV_64 */
 
+	SLJIT_ASSERT(WORD == 0 || WORD == 0x8);
+
 	switch (GET_OPCODE(op)) {
 	case SLJIT_MOV:
 		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
@@ -1110,34 +1229,18 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 #endif /* SLJIT_CONFIG_RISCV_64 */
 
 	case SLJIT_CLZ:
+	case SLJIT_CTZ:
 		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		/* Nearly all instructions are unmovable in the following sequence. */
-#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
-		FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG1) | RS1(src2) | IMM_I(0)));
-		FAIL_IF(push_inst(compiler, ADDI | RD(dst) | RS1(TMP_ZERO) | IMM_I(32)));
-#else /* !SLJIT_CONFIG_RISCV_32 */
-		if (op & SLJIT_32) {
-			FAIL_IF(push_inst(compiler, SLLI | RD(TMP_REG1) | RS1(src2) | IMM_I(32)));
-			FAIL_IF(push_inst(compiler, ADDI | RD(dst) | RS1(TMP_ZERO) | IMM_I(32)));
-		} else {
-			FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG1) | RS1(src2) | IMM_I(0)));
-			FAIL_IF(push_inst(compiler, ADDI | RD(dst) | RS1(TMP_ZERO) | IMM_I(64)));
-		}
-#endif /* SLJIT_CONFIG_RISCV_32 */
-		/* Check zero. */
-		FAIL_IF(push_inst(compiler, BEQ | RS1(TMP_REG1) | RS2(TMP_ZERO) | ((sljit_ins)(6 * SSIZE_OF(ins)) << 7)));
-		FAIL_IF(push_inst(compiler, ADDI | RD(dst) | RS1(TMP_ZERO) | IMM_I(0)));
-		FAIL_IF(push_inst(compiler, BLT | RS1(TMP_REG1) | RS2(TMP_ZERO) | ((sljit_ins)(4 * SSIZE_OF(ins)) << 7)));
-		/* Loop for searching the highest bit. */
-		FAIL_IF(push_inst(compiler, ADDI | RD(dst) | RS1(dst) | IMM_I(1)));
-		FAIL_IF(push_inst(compiler, SLLI | RD(TMP_REG1) | RS1(TMP_REG1) | IMM_I(1)));
-		FAIL_IF(push_inst(compiler, BGE | RS1(TMP_REG1) | RS2(TMP_ZERO) | ((sljit_ins)(0x1fc001d - 1 * SSIZE_OF(ins)) << 7)));
-		return SLJIT_SUCCESS;
+		return emit_clz_ctz(compiler, op, dst, src2);
+
+	case SLJIT_REV:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		return emit_rev(compiler, op, dst, src2);
 
 	case SLJIT_ADD:
 		/* Overflow computation (both add and sub): overflow = src1_sign ^ src2_sign ^ result_sign ^ carry_flag */
 		is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
-		carry_src_r = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+		carry_src_r = GET_FLAG_TYPE(op) == SLJIT_CARRY;
 
 		if (flags & SRC2_IMM) {
 			if (is_overflow) {
@@ -1193,7 +1296,7 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		return push_inst(compiler, XOR | RD(OTHER_FLAG) | RS1(TMP_REG1) | RS2(OTHER_FLAG));
 
 	case SLJIT_ADDC:
-		carry_src_r = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+		carry_src_r = GET_FLAG_TYPE(op) == SLJIT_CARRY;
 
 		if (flags & SRC2_IMM) {
 			FAIL_IF(push_inst(compiler, ADDI | WORD | RD(dst) | RS1(src1) | IMM_I(src2)));
@@ -1240,11 +1343,11 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		is_handled = 0;
 
 		if (flags & SRC2_IMM) {
-			if (GET_FLAG_TYPE(op) == SLJIT_LESS || GET_FLAG_TYPE(op) == SLJIT_GREATER_EQUAL) {
+			if (GET_FLAG_TYPE(op) == SLJIT_LESS) {
 				FAIL_IF(push_inst(compiler, SLTUI | RD(OTHER_FLAG) | RS1(src1) | IMM_I(src2)));
 				is_handled = 1;
 			}
-			else if (GET_FLAG_TYPE(op) == SLJIT_SIG_LESS || GET_FLAG_TYPE(op) == SLJIT_SIG_GREATER_EQUAL) {
+			else if (GET_FLAG_TYPE(op) == SLJIT_SIG_LESS) {
 				FAIL_IF(push_inst(compiler, SLTI | RD(OTHER_FLAG) | RS1(src1) | IMM_I(src2)));
 				is_handled = 1;
 			}
@@ -1261,19 +1364,15 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 
 			switch (GET_FLAG_TYPE(op)) {
 			case SLJIT_LESS:
-			case SLJIT_GREATER_EQUAL:
 				FAIL_IF(push_inst(compiler, SLTU | RD(OTHER_FLAG) | RS1(src1) | RS2(src2)));
 				break;
 			case SLJIT_GREATER:
-			case SLJIT_LESS_EQUAL:
 				FAIL_IF(push_inst(compiler, SLTU | RD(OTHER_FLAG) | RS1(src2) | RS2(src1)));
 				break;
 			case SLJIT_SIG_LESS:
-			case SLJIT_SIG_GREATER_EQUAL:
 				FAIL_IF(push_inst(compiler, SLT | RD(OTHER_FLAG) | RS1(src1) | RS2(src2)));
 				break;
 			case SLJIT_SIG_GREATER:
-			case SLJIT_SIG_LESS_EQUAL:
 				FAIL_IF(push_inst(compiler, SLT | RD(OTHER_FLAG) | RS1(src2) | RS2(src1)));
 				break;
 			}
@@ -1296,7 +1395,7 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		}
 
 		is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
-		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+		is_carry = GET_FLAG_TYPE(op) == SLJIT_CARRY;
 
 		if (flags & SRC2_IMM) {
 			if (is_overflow) {
@@ -1345,7 +1444,7 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 			flags &= ~SRC2_IMM;
 		}
 
-		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+		is_carry = GET_FLAG_TYPE(op) == SLJIT_CARRY;
 
 		if (flags & SRC2_IMM) {
 			if (is_carry)
@@ -1406,20 +1505,71 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl
 		return SLJIT_SUCCESS;
 
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 		EMIT_SHIFT(SLLI, SLL);
-		return SLJIT_SUCCESS;
+		break;
 
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 		EMIT_SHIFT(SRLI, SRL);
-		return SLJIT_SUCCESS;
+		break;
 
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
 		EMIT_SHIFT(SRAI, SRA);
+		break;
+
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
+		if (flags & SRC2_IMM) {
+			SLJIT_ASSERT(src2 != 0);
+
+			op_imm = (GET_OPCODE(op) == SLJIT_ROTL) ? SLLI : SRLI;
+			FAIL_IF(push_inst(compiler, op_imm | WORD | RD(OTHER_FLAG) | RS1(src1) | IMM_I(src2)));
+
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+			src2 = ((op & SLJIT_32) ? 32 : 64) - src2;
+#else /* !SLJIT_CONFIG_RISCV_64 */
+			src2 = 32 - src2;
+#endif /* SLJIT_CONFIG_RISCV_64 */
+			op_imm = (GET_OPCODE(op) == SLJIT_ROTL) ? SRLI : SLLI;
+			FAIL_IF(push_inst(compiler, op_imm | WORD | RD(dst) | RS1(src1) | IMM_I(src2)));
+			return push_inst(compiler, OR | RD(dst) | RS1(dst) | RS2(OTHER_FLAG));
+		}
+
+		if (src2 == TMP_ZERO) {
+			if (dst != src1)
+				return push_inst(compiler, ADDI | WORD | RD(dst) | RS1(src1) | IMM_I(0));
+			return SLJIT_SUCCESS;
+		}
+
+		FAIL_IF(push_inst(compiler, SUB | WORD | RD(EQUAL_FLAG) | RS1(TMP_ZERO) | RS2(src2)));
+		op_reg = (GET_OPCODE(op) == SLJIT_ROTL) ? SLL : SRL;
+		FAIL_IF(push_inst(compiler, op_reg | WORD | RD(OTHER_FLAG) | RS1(src1) | RS2(src2)));
+		op_reg = (GET_OPCODE(op) == SLJIT_ROTL) ? SRL : SLL;
+		FAIL_IF(push_inst(compiler, op_reg | WORD | RD(dst) | RS1(src1) | RS2(EQUAL_FLAG)));
+		return push_inst(compiler, OR | RD(dst) | RS1(dst) | RS2(OTHER_FLAG));
+
+	default:
+		SLJIT_UNREACHABLE();
 		return SLJIT_SUCCESS;
 	}
 
-	SLJIT_UNREACHABLE();
-	return SLJIT_SUCCESS;
+	if (flags & SRC2_IMM) {
+		if (op & SLJIT_SET_Z)
+			FAIL_IF(push_inst(compiler, op_imm | WORD | RD(EQUAL_FLAG) | RS1(src1) | IMM_I(src2)));
+
+		if (flags & UNUSED_DEST)
+			return SLJIT_SUCCESS;
+		return push_inst(compiler, op_imm | WORD | RD(dst) | RS1(src1) | IMM_I(src2));
+	}
+
+	if (op & SLJIT_SET_Z)
+		FAIL_IF(push_inst(compiler, op_reg | WORD | RD(EQUAL_FLAG) | RS1(src1) | RS2(src2)));
+
+	if (flags & UNUSED_DEST)
+		return SLJIT_SUCCESS;
+	return push_inst(compiler, op_reg | WORD | RD(dst) | RS1(src1) | RS2(src2));
 }
 
 #undef IMM_EXTEND
@@ -1599,8 +1749,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile
 	return SLJIT_SUCCESS;
 }
 
-#undef WORD
-
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src, sljit_sw srcw)
@@ -1649,10 +1797,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 	case SLJIT_MOV_S16:
 		return emit_op(compiler, op, HALF_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);
 
-	case SLJIT_NOT:
-		return emit_op(compiler, SLJIT_XOR | (op & (SLJIT_32 | SLJIT_SET_Z)), flags, dst, dstw, src, srcw, SLJIT_IMM, -1);
-
 	case SLJIT_CLZ:
+	case SLJIT_CTZ:
+	case SLJIT_REV:
 		return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, src, srcw);
 	}
 
@@ -1704,19 +1851,24 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 		return emit_op(compiler, op, flags | CUMULATIVE_OP | IMM_OP, dst, dstw, src1, src1w, src2, src2w);
 
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 	case SLJIT_ASHR:
-#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
-		if (src2 & SLJIT_IMM)
-			src2w &= 0x1f;
-#else
+	case SLJIT_MASHR:
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
 		if (src2 & SLJIT_IMM) {
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+			src2w &= 0x1f;
+#else /* !SLJIT_CONFIG_RISCV_32 */
 			if (op & SLJIT_32)
 				src2w &= 0x1f;
 			else
 				src2w &= 0x3f;
+#endif /* SLJIT_CONFIG_RISCV_32 */
 		}
-#endif
+
 		return emit_op(compiler, op, flags | IMM_OP, dst, dstw, src1, src1w, src2, src2w);
 	}
 
@@ -1735,6 +1887,91 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil
 	return sljit_emit_op2(compiler, op, TMP_REG2, 0, src1, src1w, src2, src2w);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst_reg,
+	sljit_s32 src1_reg,
+	sljit_s32 src2_reg,
+	sljit_s32 src3, sljit_sw src3w)
+{
+	sljit_s32 is_left;
+	sljit_ins ins1, ins2, ins3;
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	sljit_ins word = (op & SLJIT_32) >> 5;
+	sljit_s32 inp_flags = ((op & SLJIT_32) ? INT_DATA : WORD_DATA) | LOAD_DATA;
+	sljit_sw bit_length = (op & SLJIT_32) ? 32 : 64;
+#else /* !SLJIT_CONFIG_RISCV_64 */
+	sljit_s32 inp_flags = WORD_DATA | LOAD_DATA;
+	sljit_sw bit_length = 32;
+#endif /* SLJIT_CONFIG_RISCV_64 */
+	/* Emits dst_reg = src1_reg shifted by src3 (left for SLJIT_SHL / SLJIT_MSHL, right otherwise), OR-ed with src2_reg shifted the opposite way so its bits fill the vacated positions. Returns the first emitter error, if any. */
+	SLJIT_ASSERT(WORD == 0 || WORD == 0x8);
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, dst_reg, src1_reg, src2_reg, src3, src3w));
+
+	is_left = (GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_MSHL);
+	/* Identical sources: delegate to the rotate operation of sljit_emit_op2. */
+	if (src1_reg == src2_reg) {
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_op2(compiler, (is_left ? SLJIT_ROTL : SLJIT_ROTR) | (op & SLJIT_32), dst_reg, 0, src1_reg, 0, src3, src3w);
+	}
+
+	ADJUST_LOCAL_OFFSET(src3, src3w);
+	/* Immediate count: both partial shifts use immediate-form instructions, and a zero count emits nothing. */
+	if (src3 & SLJIT_IMM) {
+		src3w &= bit_length - 1;
+
+		if (src3w == 0)
+			return SLJIT_SUCCESS;
+
+		if (is_left) {
+			ins1 = SLLI | WORD | IMM_I(src3w);
+			src3w = bit_length - src3w;
+			ins2 = SRLI | WORD | IMM_I(src3w);
+		} else {
+			ins1 = SRLI | WORD | IMM_I(src3w);
+			src3w = bit_length - src3w;
+			ins2 = SLLI | WORD | IMM_I(src3w);
+		}
+
+		FAIL_IF(push_inst(compiler, ins1 | RD(dst_reg) | RS1(src1_reg)));
+		FAIL_IF(push_inst(compiler, ins2 | RD(TMP_REG1) | RS1(src2_reg)));
+		return push_inst(compiler, OR | RD(dst_reg) | RS1(dst_reg) | RS2(TMP_REG1));
+	}
+	/* Variable count: stage it in TMP_REG2 when it comes from memory, or when writing dst_reg would overwrite it before its second use. */
+	if (src3 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, inp_flags, TMP_REG2, src3, src3w));
+		src3 = TMP_REG2;
+	} else if (dst_reg == src3) {
+		FAIL_IF(push_inst(compiler, ADDI | WORD | RD(TMP_REG2) | RS1(src3) | IMM_I(0))); /* propagate emitter failures like every other push_inst here */
+		src3 = TMP_REG2;
+	}
+
+	if (is_left) {
+		ins1 = SLL;
+		ins2 = SRLI;
+		ins3 = SRL;
+	} else {
+		ins1 = SRL;
+		ins2 = SLLI;
+		ins3 = SLL;
+	}
+
+	FAIL_IF(push_inst(compiler, ins1 | WORD | RD(dst_reg) | RS1(src1_reg) | RS2(src3)));
+	/* Without SLJIT_SHIFT_INTO_NON_ZERO, src2_reg is pre-shifted by one and then shifted by (count ^ (bit_length - 1)), so the two steps together shift by bit_length - count without ever needing a shift by bit_length. Otherwise the negated count is used directly. */
+	if (!(op & SLJIT_SHIFT_INTO_NON_ZERO)) {
+		FAIL_IF(push_inst(compiler, ins2 | WORD | RD(TMP_REG1) | RS1(src2_reg) | IMM_I(1)));
+		FAIL_IF(push_inst(compiler, XORI | RD(TMP_REG2) | RS1(src3) | IMM_I((sljit_ins)bit_length - 1)));
+		src2_reg = TMP_REG1;
+	} else
+		FAIL_IF(push_inst(compiler, SUB | WORD | RD(TMP_REG2) | RS1(TMP_ZERO) | RS2(src3)));
+
+	FAIL_IF(push_inst(compiler, ins3 | WORD | RD(TMP_REG1) | RS1(src2_reg) | RS2(TMP_REG2)));
+	return push_inst(compiler, OR | RD(dst_reg) | RS1(dst_reg) | RS2(TMP_REG1));
+}
+
+#undef WORD
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -1762,6 +1999,34 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *comp
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_dst(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw)
+{
+	sljit_s32 dst_r;
+	/* Writes a return address to dst: SLJIT_FAST_ENTER takes it from RETURN_ADDR_REG, SLJIT_GET_RETURN_ADDRESS loads it from the stack slot at local_size - SSIZE_OF(sw). */
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op_dst(compiler, op, dst, dstw));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	switch (op) {
+	case SLJIT_FAST_ENTER:
+		if (FAST_IS_REG(dst))
+			return push_inst(compiler, ADDI | RD(dst) | RS1(RETURN_ADDR_REG) | IMM_I(0));
+		/* Memory destination: the shared store below writes TMP_REG2, so RETURN_ADDR_REG must be that same register. */
+		SLJIT_ASSERT(RETURN_ADDR_REG == TMP_REG2);
+		break;
+	case SLJIT_GET_RETURN_ADDRESS:
+		dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, dst_r, SLJIT_MEM1(SLJIT_SP), compiler->local_size - SSIZE_OF(sw)));
+		break;
+	}
+	/* Memory destinations receive the value staged in TMP_REG2; register destinations were already written above. */
+	if (dst & SLJIT_MEM)
+		return emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw);
+
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
@@ -1888,38 +2153,29 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compile
 
 	switch (GET_FLAG_TYPE(op)) {
 	case SLJIT_F_EQUAL:
-	case SLJIT_F_NOT_EQUAL:
 	case SLJIT_ORDERED_EQUAL:
-	case SLJIT_UNORDERED_OR_NOT_EQUAL:
 		inst = FEQ_S | FMT(op) | RD(OTHER_FLAG) | FRS1(src1) | FRS2(src2);
 		break;
 	case SLJIT_F_LESS:
-	case SLJIT_F_GREATER_EQUAL:
 	case SLJIT_ORDERED_LESS:
-	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
 		inst = FLT_S | FMT(op) | RD(OTHER_FLAG) | FRS1(src1) | FRS2(src2);
 		break;
 	case SLJIT_ORDERED_GREATER:
-	case SLJIT_UNORDERED_OR_LESS_EQUAL:
 		inst = FLT_S | FMT(op) | RD(OTHER_FLAG) | FRS1(src2) | FRS2(src1);
 		break;
 	case SLJIT_F_GREATER:
-	case SLJIT_F_LESS_EQUAL:
 	case SLJIT_UNORDERED_OR_GREATER:
-	case SLJIT_ORDERED_LESS_EQUAL:
 		inst = FLE_S | FMT(op) | RD(OTHER_FLAG) | FRS1(src1) | FRS2(src2);
 		break;
 	case SLJIT_UNORDERED_OR_LESS:
-	case SLJIT_ORDERED_GREATER_EQUAL:
 		inst = FLE_S | FMT(op) | RD(OTHER_FLAG) | FRS1(src2) | FRS2(src1);
 		break;
 	case SLJIT_UNORDERED_OR_EQUAL: /* Not supported. */
-	case SLJIT_ORDERED_NOT_EQUAL: /* Not supported. */
 		FAIL_IF(push_inst(compiler, FLT_S | FMT(op) | RD(OTHER_FLAG) | FRS1(src1) | FRS2(src2)));
 		FAIL_IF(push_inst(compiler, FLT_S | FMT(op) | RD(TMP_REG1) | FRS1(src2) | FRS2(src1)));
 		inst = OR | RD(OTHER_FLAG) | RS1(OTHER_FLAG) | RS2(TMP_REG1);
 		break;
-	default: /* SLJIT_UNORDERED, SLJIT_ORDERED */
+	default: /* SLJIT_UNORDERED */
 		FAIL_IF(push_inst(compiler, FADD_S | FMT(op) | FRD(TMP_FREG1) | FRS1(src1) | FRS2(src2)));
 		inst = FEQ_S | FMT(op) | RD(OTHER_FLAG) | FRS1(TMP_FREG1) | FRS2(TMP_FREG1);
 		break;
@@ -2059,23 +2315,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compil
 #undef FLOAT_DATA
 #undef FMT
 
-/* --------------------------------------------------------------------- */
-/*  Other instructions                                                   */
-/* --------------------------------------------------------------------- */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
-
-	if (FAST_IS_REG(dst))
-		return push_inst(compiler, ADDI | RD(dst) | RS1(RETURN_ADDR_REG) | IMM_I(0));
-
-	/* Memory. */
-	return emit_op_mem(compiler, WORD_DATA, RETURN_ADDR_REG, dst, dstw);
-}
-
 /* --------------------------------------------------------------------- */
 /*  Conditional instructions                                             */
 /* --------------------------------------------------------------------- */
@@ -2196,7 +2435,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compile
 	CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
 
 	if (type & SLJIT_CALL_RETURN) {
-		PTR_FAIL_IF(emit_stack_frame_release(compiler));
+		PTR_FAIL_IF(emit_stack_frame_release(compiler, 0));
 		type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
 	}
 
@@ -2313,10 +2552,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compi
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
-	ADJUST_LOCAL_OFFSET(src, srcw);
 
 	if (!(src & SLJIT_IMM)) {
 		if (src & SLJIT_MEM) {
+			ADJUST_LOCAL_OFFSET(src, srcw);
 			FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw));
 			src = TMP_REG1;
 		}
@@ -2348,20 +2587,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 	SLJIT_UNUSED_ARG(arg_types);
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
-	ADJUST_LOCAL_OFFSET(src, srcw);
 
 	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
 		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw));
 		src = TMP_REG1;
 	}
 
 	if (type & SLJIT_CALL_RETURN) {
-		if (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0) {
+		if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
 			FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG1) | RS1(src) | IMM_I(0)));
 			src = TMP_REG1;
 		}
 
-		FAIL_IF(emit_stack_frame_release(compiler));
+		FAIL_IF(emit_stack_frame_release(compiler, 0));
 		type = SLJIT_JUMP;
 	}
 
@@ -2466,6 +2705,62 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
 	return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_s32 flags;
+	/* Loads or stores a register pair at mem and mem + SSIZE_OF(sw); a reg without REG_PAIR_MASK bits is forwarded to sljit_emit_mem_unaligned. */
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+
+	if (!(reg & REG_PAIR_MASK))
+		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+	/* Reduce the address to a base register plus an offset for which offset + SSIZE_OF(sw) still fits the instruction's immediate field; TMP_REG1 becomes the base when extra address arithmetic is needed. */
+	if (SLJIT_UNLIKELY(mem & OFFS_REG_MASK)) {
+		memw &= 0x3;
+
+		if (SLJIT_UNLIKELY(memw != 0)) {
+			FAIL_IF(push_inst(compiler, SLLI | RD(TMP_REG1) | RS1(OFFS_REG(mem)) | IMM_I(memw)));
+			FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG1) | RS1(TMP_REG1) | RS2(mem & REG_MASK)));
+		} else
+			FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG1) | RS1(mem & REG_MASK) | RS2(OFFS_REG(mem))));
+
+		mem = TMP_REG1;
+		memw = 0;
+	} else if (memw > SIMM_MAX - SSIZE_OF(sw) || memw < SIMM_MIN) {
+		if (((memw + 0x800) & 0xfff) <= 0xfff - SSIZE_OF(sw)) {
+			FAIL_IF(load_immediate(compiler, TMP_REG1, TO_ARGW_HI(memw), TMP_REG3));
+			memw &= 0xfff;
+		} else {
+			FAIL_IF(load_immediate(compiler, TMP_REG1, memw, TMP_REG3));
+			memw = 0;
+		}
+
+		if (mem & REG_MASK)
+			FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG1) | RS1(TMP_REG1) | RS2(mem & REG_MASK)));
+
+		mem = TMP_REG1;
+	} else {
+		mem &= REG_MASK;
+		memw &= 0xfff;
+	}
+
+	SLJIT_ASSERT((memw >= 0 && memw <= SIMM_MAX - SSIZE_OF(sw)) || (memw > SIMM_MAX && memw <= 0xfff));
+	/* Loading when the base is also the first register of the pair: fetch the second register first so the base is still intact for the second access. */
+	if (!(type & SLJIT_MEM_STORE) && mem == REG_PAIR_FIRST(reg)) {
+		FAIL_IF(push_mem_inst(compiler, WORD_DATA | LOAD_DATA, REG_PAIR_SECOND(reg), mem, (memw + SSIZE_OF(sw)) & 0xfff));
+		return push_mem_inst(compiler, WORD_DATA | LOAD_DATA, REG_PAIR_FIRST(reg), mem, memw);
+	}
+
+	flags = WORD_DATA | (!(type & SLJIT_MEM_STORE) ? LOAD_DATA : 0);
+
+	FAIL_IF(push_mem_inst(compiler, flags, REG_PAIR_FIRST(reg), mem, memw));
+	return push_mem_inst(compiler, flags, REG_PAIR_SECOND(reg), mem, (memw + SSIZE_OF(sw)) & 0xfff);
+}
+
+#undef TO_ARGW_HI
+
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
 {
 	struct sljit_const *const_;
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeS390X.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeS390X.c
index be1ef438d3..8d86d072b1 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeS390X.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeS390X.c
@@ -103,11 +103,8 @@ static const sljit_gpr r15 = 15;	/* reg_map[SLJIT_NUMBER_OF_REGISTERS + 1]: stac
 /* When reg cannot be unused. */
 #define IS_GPR_REG(reg)		((reg > 0) && (reg) <= SLJIT_SP)
 
-/* Link registers. The normal link register is r14, but since
-   we use that for flags we need to use r0 instead to do fast
-   calls so that flags are preserved. */
+/* Link register. */
 static const sljit_gpr link_r = 14;     /* r14 */
-static const sljit_gpr fast_link_r = 0; /* r0 */
 
 #define TMP_FREG1	(0)
 
@@ -144,12 +141,6 @@ static SLJIT_INLINE sljit_gpr gpr(sljit_s32 r)
 	return reg_map[r];
 }
 
-static SLJIT_INLINE sljit_gpr fgpr(sljit_s32 r)
-{
-	SLJIT_ASSERT(r >= 0 && r < (sljit_s32)(sizeof(freg_map) / sizeof(freg_map[0])));
-	return freg_map[r];
-}
-
 /* Size of instruction in bytes. Tags must already be cleared. */
 static SLJIT_INLINE sljit_uw sizeof_ins(sljit_ins ins)
 {
@@ -998,7 +989,7 @@ static sljit_s32 make_addr_bx(struct sljit_compiler *compiler,
 	(cond) ? EVAL(i1, r, addr) : EVAL(i2, r, addr)
 
 /* May clobber tmp1. */
-static sljit_s32 load_word(struct sljit_compiler *compiler, sljit_gpr dst,
+static sljit_s32 load_word(struct sljit_compiler *compiler, sljit_gpr dst_r,
 		sljit_s32 src, sljit_sw srcw,
 		sljit_s32 is_32bit)
 {
@@ -1006,21 +997,36 @@ static sljit_s32 load_word(struct sljit_compiler *compiler, sljit_gpr dst,
 	sljit_ins ins;
 
 	SLJIT_ASSERT(src & SLJIT_MEM);
-	if (have_ldisp() || !is_32bit)
-		FAIL_IF(make_addr_bxy(compiler, &addr, src, srcw, tmp1));
-	else
+
+	if (is_32bit && ((src & OFFS_REG_MASK) || is_u12(srcw) || !is_s20(srcw))) {
 		FAIL_IF(make_addr_bx(compiler, &addr, src, srcw, tmp1));
+		return push_inst(compiler, 0x58000000 /* l */ | R20A(dst_r) | R16A(addr.index) | R12A(addr.base) | (sljit_ins)addr.offset);
+	}
 
-	if (is_32bit)
-		ins = WHEN(is_u12(addr.offset), dst, l, ly, addr);
-	else
-		ins = lg(dst, addr.offset, addr.index, addr.base);
+	FAIL_IF(make_addr_bxy(compiler, &addr, src, srcw, tmp1));
 
-	return push_inst(compiler, ins);
+	ins = is_32bit ? 0xe30000000058 /* ly */ : 0xe30000000004 /* lg */;
+	return push_inst(compiler, ins | R36A(dst_r) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset));
 }
 
 /* May clobber tmp1. */
-static sljit_s32 store_word(struct sljit_compiler *compiler, sljit_gpr src,
+static sljit_s32 load_unsigned_word(struct sljit_compiler *compiler, sljit_gpr dst_r,
+		sljit_s32 src, sljit_sw srcw,
+		sljit_s32 is_32bit)
+{
+	struct addr addr;
+	sljit_ins ins;
+	/* Loads the memory operand (src, srcw) into dst_r using llgf when is_32bit is set, lg otherwise. */
+	SLJIT_ASSERT(src & SLJIT_MEM);
+	/* tmp1 is handed to make_addr_bxy, which is why this helper may clobber it. */
+	FAIL_IF(make_addr_bxy(compiler, &addr, src, srcw, tmp1));
+
+	ins = is_32bit ? 0xe30000000016 /* llgf */ : 0xe30000000004 /* lg */;
+	return push_inst(compiler, ins | R36A(dst_r) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset));
+}
+
+/* May clobber tmp1. */
+static sljit_s32 store_word(struct sljit_compiler *compiler, sljit_gpr src_r,
 		sljit_s32 dst, sljit_sw dstw,
 		sljit_s32 is_32bit)
 {
@@ -1028,17 +1034,16 @@ static sljit_s32 store_word(struct sljit_compiler *compiler, sljit_gpr src,
 	sljit_ins ins;
 
 	SLJIT_ASSERT(dst & SLJIT_MEM);
-	if (have_ldisp() || !is_32bit)
-		FAIL_IF(make_addr_bxy(compiler, &addr, dst, dstw, tmp1));
-	else
+
+	if (is_32bit && ((dst & OFFS_REG_MASK) || is_u12(dstw) || !is_s20(dstw))) {
 		FAIL_IF(make_addr_bx(compiler, &addr, dst, dstw, tmp1));
+		return push_inst(compiler, 0x50000000 /* st */ | R20A(src_r) | R16A(addr.index) | R12A(addr.base) | (sljit_ins)addr.offset);
+	}
 
-	if (is_32bit)
-		ins = WHEN(is_u12(addr.offset), src, st, sty, addr);
-	else
-		ins = stg(src, addr.offset, addr.index, addr.base);
+	FAIL_IF(make_addr_bxy(compiler, &addr, dst, dstw, tmp1));
 
-	return push_inst(compiler, ins);
+	ins = is_32bit ? 0xe30000000050 /* sty */ : 0xe30000000024 /* stg */;
+	return push_inst(compiler, ins | R36A(src_r) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset));
 }
 
 #undef WHEN
@@ -1638,12 +1643,18 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
 {
 	/* TODO(mundaym): implement all */
 	switch (feature_type) {
+	case SLJIT_HAS_FPU:
 	case SLJIT_HAS_CLZ:
-		return have_eimm() ? 1 : 0; /* FLOGR instruction */
+	case SLJIT_HAS_REV:
+	case SLJIT_HAS_ROT:
+	case SLJIT_HAS_PREFETCH:
+	case SLJIT_HAS_COPY_F32:
+	case SLJIT_HAS_COPY_F64:
+		return 1;
+	case SLJIT_HAS_CTZ:
+		return 2;
 	case SLJIT_HAS_CMOV:
 		return have_lscond1() ? 1 : 0;
-	case SLJIT_HAS_FPU:
-		return 1;
 	}
 	return 0;
 }
@@ -1762,7 +1773,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 	return SLJIT_SUCCESS;
 }
 
-static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
+static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_gpr last_reg)
 {
 	sljit_s32 offset, i, tmp;
 	sljit_s32 local_size = compiler->local_size;
@@ -1778,7 +1789,7 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
 	offset = 2 * SSIZE_OF(sw);
 	if (saveds + scratches >= SLJIT_NUMBER_OF_REGISTERS) {
 		if (kept_saveds_count == 0) {
-			FAIL_IF(push_inst(compiler, lmg(r6, r14, offset, r15)));
+			FAIL_IF(push_inst(compiler, lmg(r6, last_reg, offset, r15)));
 			offset += 9 * SSIZE_OF(sw);
 		} else {
 			FAIL_IF(push_inst(compiler, lmg(r6, r13 - (sljit_gpr)kept_saveds_count, offset, r15)));
@@ -1795,10 +1806,14 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
 
 		if (kept_saveds_count == 0) {
 			if (saveds == 0) {
-				FAIL_IF(push_inst(compiler, lg(r14, offset, 0, r15)));
+				if (last_reg == r14)
+					FAIL_IF(push_inst(compiler, lg(r14, offset, 0, r15)));
 				offset += SSIZE_OF(sw);
+			} else if (saveds == 1 && last_reg == r13) {
+				FAIL_IF(push_inst(compiler, lg(r13, offset, 0, r15)));
+				offset += 2 * SSIZE_OF(sw);
 			} else {
-				FAIL_IF(push_inst(compiler, lmg(r14 - (sljit_gpr)saveds, r14, offset, r15)));
+				FAIL_IF(push_inst(compiler, lmg(r14 - (sljit_gpr)saveds, last_reg, offset, r15)));
 				offset += (saveds + 1) * SSIZE_OF(sw);
 			}
 		} else if (saveds > kept_saveds_count) {
@@ -1813,7 +1828,8 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
 	}
 
 	if (kept_saveds_count > 0) {
-		FAIL_IF(push_inst(compiler, lg(r14, offset, 0, r15)));
+		if (last_reg == r14)
+			FAIL_IF(push_inst(compiler, lg(r14, offset, 0, r15)));
 		offset += SSIZE_OF(sw);
 	}
 
@@ -1836,10 +1852,33 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_return_void(compiler));
 
-	FAIL_IF(emit_stack_frame_release(compiler));
+	FAIL_IF(emit_stack_frame_release(compiler, r14));
 	return push_inst(compiler, br(r14)); /* return */
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+	/* Releases the stack frame and then emits a plain SLJIT_JUMP to the target described by (src, srcw). */
+	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(load_word(compiler, tmp1, src, srcw, 0 /* 64-bit */));
+		src = TMP_REG2; /* NOTE(review): presumably TMP_REG2 maps to tmp1 - confirm against reg_map */
+		srcw = 0;
+	} else if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+		FAIL_IF(push_inst(compiler, lgr(tmp1, gpr(src)))); /* presumably copied because the frame release reloads saved registers */
+		src = TMP_REG2;
+		srcw = 0;
+	}
+	/* last_reg is r13 here, so emit_stack_frame_release skips its "last_reg == r14" reloads of r14. */
+	FAIL_IF(emit_stack_frame_release(compiler, r13));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */
@@ -1928,12 +1967,85 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile
 	return push_inst(compiler, lgr(arg1, tmp0));
 }
 
+static sljit_s32 sljit_emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 op, sljit_gpr dst_r, sljit_gpr src_r)
+{
+	sljit_s32 is_ctz = (GET_OPCODE(op) == SLJIT_CTZ);
+	/* Emits SLJIT_CLZ / SLJIT_CTZ on top of flogr, using tmp0 and tmp1 as work registers. */
+	if ((op & SLJIT_32) && src_r != tmp0) {
+		FAIL_IF(push_inst(compiler, 0xb9160000 /* llgfr */ | R4A(tmp0) | R0A(src_r)));
+		src_r = tmp0;
+	}
+	/* CTZ: tmp1 = -src, then tmp0 = src & tmp1, which keeps only the lowest set bit of src. */
+	if (is_ctz) {
+		FAIL_IF(push_inst(compiler, ((op & SLJIT_32) ? 0x1300 /* lcr */ : 0xb9030000 /* lcgr */) | R4A(tmp1) | R0A(src_r)));
+
+		if (src_r == tmp0)
+			FAIL_IF(push_inst(compiler, ((op & SLJIT_32) ? 0x1400 /* nr */ : 0xb9800000 /* ngr */) | R4A(tmp0) | R0A(tmp1)));
+		else
+			FAIL_IF(push_inst(compiler, 0xb9e40000 /* ngrk */ | R12A(tmp1) | R4A(tmp0) | R0A(src_r)));
+
+		src_r = tmp0;
+	}
+
+	FAIL_IF(push_inst(compiler, 0xb9830000 /* flogr */ | R4A(tmp0) | R0A(src_r)));
+	/* CTZ: tmp1 = tmp0 - 64 (aghik with immediate -64). */
+	if (is_ctz)
+		FAIL_IF(push_inst(compiler, 0xec00000000d9 /* aghik */ | R36A(tmp1) | R32A(tmp0) | ((sljit_ins)(-64 & 0xffff) << 16)));
+	/* 32-bit: subtract 32 from the flogr count; a 32-bit CLZ with dst_r != tmp0 writes dst_r directly and returns. */
+	if (op & SLJIT_32) {
+		if (!is_ctz && dst_r != tmp0)
+			return push_inst(compiler, 0xec00000000d9 /* aghik */ | R36A(dst_r) | R32A(tmp0) | ((sljit_ins)(-32 & 0xffff) << 16));
+
+		FAIL_IF(push_inst(compiler, 0xc20800000000 /* agfi */ | R36A(tmp0) | (sljit_u32)-32));
+	}
+	/* CTZ: combine tmp1 into tmp0 with rxsbg. NOTE(review): exact bit-range semantics not derivable here - confirm against the ISA manual. */
+	if (is_ctz)
+		FAIL_IF(push_inst(compiler, 0xec0000000057 /* rxsbg */ | R36A(tmp0) | R32A(tmp1) | ((sljit_ins)((op & SLJIT_32) ? 59 : 58) << 24) | (63 << 16) | ((sljit_ins)((op & SLJIT_32) ? 5 : 6) << 8)));
+	/* The result is in tmp0 at this point; copy it out only when dst_r is a different register. */
+	if (dst_r == tmp0)
+		return SLJIT_SUCCESS;
+
+	return push_inst(compiler, ((op & SLJIT_32) ? 0x1800 /* lr */ : 0xb9040000 /* lgr */) | R4A(dst_r) | R0A(tmp0));
+}
+
+static sljit_s32 sljit_emit_rev(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw,
+	sljit_s32 src, sljit_sw srcw)
+{
+	struct addr addr;
+	sljit_gpr reg;
+	sljit_ins ins;
+	/* Emits SLJIT_REV from src to dst with the reversed load/store instructions; SLJIT_32 selects the 32-bit forms. */
+	if (dst & SLJIT_MEM) {
+		if (src & SLJIT_MEM) {
+			FAIL_IF(load_word(compiler, tmp0, src, srcw, op & SLJIT_32));
+			reg = tmp0;
+		} else
+			reg = gpr(src);
+		/* Memory destination: the value (loaded into tmp0 if it was in memory) is stored with strv/strvg. */
+		FAIL_IF(make_addr_bxy(compiler, &addr, dst, dstw, tmp1));
+		ins = (op & SLJIT_32) ? 0xe3000000003e /* strv */ : 0xe3000000002f /* strvg */;
+		return push_inst(compiler, ins | R36A(reg) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset));
+	}
+
+	reg = gpr(dst);
+	/* Register destination with a memory source: lrv/lrvg reads straight from memory. */
+	if (src & SLJIT_MEM) {
+		FAIL_IF(make_addr_bxy(compiler, &addr, src, srcw, tmp1));
+		ins = (op & SLJIT_32) ? 0xe3000000001e /* lrv */ : 0xe3000000000f /* lrvg */;
+		return push_inst(compiler, ins | R36A(reg) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset));
+	}
+	/* Register to register: lrvr/lrvgr. */
+	ins = (op & SLJIT_32) ? 0xb91f0000 /* lrvr */ : 0xb90f0000 /* lrvgr */;
+	return push_inst(compiler, ins | R4A(reg) | R0A(gpr(src)));
+}
+
 /* LEVAL will be defined later with different parameters as needed */
 #define WHEN2(cond, i1, i2) (cond) ? LEVAL(i1) : LEVAL(i2)
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
-        sljit_s32 dst, sljit_sw dstw,
-        sljit_s32 src, sljit_sw srcw)
+	sljit_s32 dst, sljit_sw dstw,
+	sljit_s32 src, sljit_sw srcw)
 {
 	sljit_ins ins;
 	struct addr mem;
@@ -2159,65 +2271,28 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 
 	SLJIT_ASSERT((src & SLJIT_IMM) == 0); /* no immediates */
 
-	dst_r = FAST_IS_REG(dst) ? gpr(REG_MASK & dst) : tmp0;
-	src_r = FAST_IS_REG(src) ? gpr(REG_MASK & src) : tmp0;
-	if (src & SLJIT_MEM)
-		FAIL_IF(load_word(compiler, src_r, src, srcw, src & SLJIT_32));
+	dst_r = FAST_IS_REG(dst) ? gpr(dst) : tmp0;
+	src_r = FAST_IS_REG(src) ? gpr(src) : tmp0;
 
 	compiler->status_flags_state = op & (VARIABLE_FLAG_MASK | SLJIT_SET_Z);
 
 	/* TODO(mundaym): optimize loads and stores */
-	switch (opcode | (op & SLJIT_32)) {
-	case SLJIT_NOT:
-		/* emulate ~x with x^-1 */
-		FAIL_IF(push_load_imm_inst(compiler, tmp1, -1));
-		if (src_r != dst_r)
-			FAIL_IF(push_inst(compiler, lgr(dst_r, src_r)));
-
-		FAIL_IF(push_inst(compiler, xgr(dst_r, tmp1)));
-		break;
-	case SLJIT_NOT32:
-		/* emulate ~x with x^-1 */
-		if (have_eimm())
-			FAIL_IF(push_inst(compiler, xilf(dst_r, 0xffffffff)));
-		else {
-			FAIL_IF(push_load_imm_inst(compiler, tmp1, -1));
-			if (src_r != dst_r)
-				FAIL_IF(push_inst(compiler, lr(dst_r, src_r)));
-
-			FAIL_IF(push_inst(compiler, xr(dst_r, tmp1)));
-		}
-		break;
+	switch (opcode) {
 	case SLJIT_CLZ:
-		if (have_eimm()) {
-			FAIL_IF(push_inst(compiler, flogr(tmp0, src_r))); /* clobbers tmp1 */
-			if (dst_r != tmp0)
-				FAIL_IF(push_inst(compiler, lgr(dst_r, tmp0)));
-		} else {
-			abort(); /* TODO(mundaym): no eimm (?) */
-		}
-		break;
-	case SLJIT_CLZ32:
-		if (have_eimm()) {
-			FAIL_IF(push_inst(compiler, sllg(tmp1, src_r, 32, 0)));
-			FAIL_IF(push_inst(compiler, iilf(tmp1, 0xffffffff)));
-			FAIL_IF(push_inst(compiler, flogr(tmp0, tmp1))); /* clobbers tmp1 */
-			if (dst_r != tmp0)
-				FAIL_IF(push_inst(compiler, lr(dst_r, tmp0)));
-		} else {
-			abort(); /* TODO(mundaym): no eimm (?) */
-		}
+	case SLJIT_CTZ:
+		if (src & SLJIT_MEM)
+			FAIL_IF(load_unsigned_word(compiler, src_r, src, srcw, op & SLJIT_32));
+
+		FAIL_IF(sljit_emit_clz_ctz(compiler, op, dst_r, src_r));
 		break;
+	case SLJIT_REV:
+		return sljit_emit_rev(compiler, op, dst, dstw, src, srcw);
 	default:
 		SLJIT_UNREACHABLE();
 	}
 
-	if ((op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)) == (SLJIT_SET_Z | SLJIT_SET_OVERFLOW))
-		FAIL_IF(update_zero_overflow(compiler, op, dst_r));
-
-	/* TODO(carenas): doesn't need FAIL_IF */
 	if (dst & SLJIT_MEM)
-		FAIL_IF(store_word(compiler, dst_r, dst, dstw, op & SLJIT_32));
+		return store_word(compiler, dst_r, dst, dstw, op & SLJIT_32);
 
 	return SLJIT_SUCCESS;
 }
@@ -2236,11 +2311,6 @@ static SLJIT_INLINE int is_commutative(sljit_s32 op)
 	return 0;
 }
 
-static SLJIT_INLINE int is_shift(sljit_s32 op) {
-	sljit_s32 v = GET_OPCODE(op);
-	return (v == SLJIT_SHL || v == SLJIT_ASHR || v == SLJIT_LSHR) ? 1 : 0;
-}
-
 static const struct ins_forms add_forms = {
 	0x1a00, /* ar */
 	0xb9080000, /* agr */
@@ -2639,12 +2709,12 @@ static sljit_s32 sljit_emit_bitwise(struct sljit_compiler *compiler, sljit_s32 o
 				FAIL_IF(emit_move(compiler, tmp0, src1, src1w));
 
 			if ((imm & 0x000000000000ffffull) != 0 || imm == 0)
-				return push_inst(compiler, 0xa7010000 | R20A(src_r) | imm);
+				return push_inst(compiler, 0xa7010000 /* tmll */ | R20A(src_r) | imm);
 			if ((imm & 0x00000000ffff0000ull) != 0)
-				return push_inst(compiler, 0xa7000000 | R20A(src_r) | (imm >> 16));
+				return push_inst(compiler, 0xa7000000 /* tmlh */ | R20A(src_r) | (imm >> 16));
 			if ((imm & 0x0000ffff00000000ull) != 0)
-				return push_inst(compiler, 0xa7030000 | R20A(src_r) | (imm >> 32));
-			return push_inst(compiler, 0xa7020000 | R20A(src_r) | (imm >> 48));
+				return push_inst(compiler, 0xa7030000 /* tmhl */ | R20A(src_r) | (imm >> 32));
+			return push_inst(compiler, 0xa7020000 /* tmhh */ | R20A(src_r) | (imm >> 48));
 		}
 
 		if (!(op & SLJIT_SET_Z))
@@ -2674,33 +2744,41 @@ static sljit_s32 sljit_emit_shift(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_ins ins;
 
 	if (FAST_IS_REG(src1))
-		src_r = gpr(src1 & REG_MASK);
+		src_r = gpr(src1);
 	else
 		FAIL_IF(emit_move(compiler, tmp0, src1, src1w));
 
-	if (src2 & SLJIT_IMM)
+	if (!(src2 & SLJIT_IMM)) {
+		if (FAST_IS_REG(src2))
+			base_r = gpr(src2);
+		else {
+			FAIL_IF(emit_move(compiler, tmp1, src2, src2w));
+			base_r = tmp1;
+		}
+
+		if ((op & SLJIT_32) && (type == SLJIT_MSHL || type == SLJIT_MLSHR || type == SLJIT_MASHR)) {
+			if (base_r != tmp1) {
+				FAIL_IF(push_inst(compiler, 0xec0000000055 /* risbg */ | R36A(tmp1) | R32A(base_r) | (59 << 24) | (1 << 23) | (63 << 16)));
+				base_r = tmp1;
+			} else
+				FAIL_IF(push_inst(compiler, 0xa5070000 /* nill */ | R20A(tmp1) | 0x1f));
+		}
+	} else
 		imm = (sljit_ins)(src2w & ((op & SLJIT_32) ? 0x1f : 0x3f));
-	else if (FAST_IS_REG(src2))
-		base_r = gpr(src2 & REG_MASK);
-	else {
-		FAIL_IF(emit_move(compiler, tmp1, src2, src2w));
-		base_r = tmp1;
-	}
 
 	if ((op & SLJIT_32) && dst_r == src_r) {
-		if (type == SLJIT_SHL)
+		if (type == SLJIT_SHL || type == SLJIT_MSHL)
 			ins = 0x89000000 /* sll */;
-		else if (type == SLJIT_LSHR)
+		else if (type == SLJIT_LSHR || type == SLJIT_MLSHR)
 			ins = 0x88000000 /* srl */;
 		else
 			ins = 0x8a000000 /* sra */;
 
 		FAIL_IF(push_inst(compiler, ins | R20A(dst_r) | R12A(base_r) | imm));
-	}
-	else {
-		if (type == SLJIT_SHL)
+	} else {
+		if (type == SLJIT_SHL || type == SLJIT_MSHL)
 			ins = (op & SLJIT_32) ? 0xeb00000000df /* sllk */ : 0xeb000000000d /* sllg */;
-		else if (type == SLJIT_LSHR)
+		else if (type == SLJIT_LSHR || type == SLJIT_MLSHR)
 			ins = (op & SLJIT_32) ? 0xeb00000000de /* srlk */ : 0xeb000000000c /* srlg */;
 		else
 			ins = (op & SLJIT_32) ? 0xeb00000000dc /* srak */ : 0xeb000000000a /* srag */;
@@ -2714,6 +2792,47 @@ static sljit_s32 sljit_emit_shift(struct sljit_compiler *compiler, sljit_s32 op,
 	return SLJIT_SUCCESS;
 }
 
+static sljit_s32 sljit_emit_rotate(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	sljit_gpr dst_r = FAST_IS_REG(dst) ? gpr(dst & REG_MASK) : tmp0;
+	sljit_gpr src_r = tmp0;
+	sljit_gpr base_r = tmp0;
+	sljit_ins imm = 0;
+	sljit_ins ins;
+	/* Emits SLJIT_ROTL / SLJIT_ROTR with rll/rllg: src1 is the rotated value, src2 the rotate amount. */
+	if (FAST_IS_REG(src1))
+		src_r = gpr(src1);
+	else
+		FAIL_IF(emit_move(compiler, tmp0, src1, src1w));
+	/* A non-immediate amount is used from its register, or moved into tmp1 first. */
+	if (!(src2 & SLJIT_IMM)) {
+		if (FAST_IS_REG(src2))
+			base_r = gpr(src2);
+		else {
+			FAIL_IF(emit_move(compiler, tmp1, src2, src2w));
+			base_r = tmp1;
+		}
+	}
+	/* Only rotate-left is emitted, so ROTR negates the amount: into tmp1 for a register, directly for an immediate. */
+	if (GET_OPCODE(op) == SLJIT_ROTR) {
+		if (!(src2 & SLJIT_IMM)) {
+			ins = (op & SLJIT_32) ? 0x1300 /* lcr */ : 0xb9030000 /* lcgr */;
+			FAIL_IF(push_inst(compiler, ins | R4A(tmp1) | R0A(base_r)));
+			base_r = tmp1;
+		} else
+			src2w = -src2w;
+	}
+	/* Immediate amounts are reduced to 5 bits (32-bit op) or 6 bits (64-bit op). */
+	if (src2 & SLJIT_IMM)
+		imm = (sljit_ins)(src2w & ((op & SLJIT_32) ? 0x1f : 0x3f));
+
+	ins = (op & SLJIT_32) ? 0xeb000000001d /* rll */ : 0xeb000000001c /* rllg */;
+	return push_inst(compiler, ins | R36A(dst_r) | R32A(src_r) | R28A(base_r) | (imm << 16));
+}
+
 static const struct ins_forms addc_forms = {
 	0xb9980000, /* alcr */
 	0xb9880000, /* alcgr */
@@ -2786,10 +2905,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 		FAIL_IF(sljit_emit_bitwise(compiler, op, dst, src1, src1w, src2, src2w));
 		break;
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
 		FAIL_IF(sljit_emit_shift(compiler, op, dst, src1, src1w, src2, src2w));
 		break;
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
+		FAIL_IF(sljit_emit_rotate(compiler, op, dst, src1, src1w, src2, src2w));
+		break;
 	}
 
 	if (dst & SLJIT_MEM)
@@ -2808,11 +2934,129 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil
 	return sljit_emit_op2(compiler, op, (sljit_s32)tmp0, 0, src1, src1w, src2, src2w);
 }
 
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(
-	struct sljit_compiler *compiler,
-	sljit_s32 op, sljit_s32 src, sljit_sw srcw)
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst_reg,
+	sljit_s32 src1_reg,
+	sljit_s32 src2_reg,
+	sljit_s32 src3, sljit_sw src3w)
+{
+	sljit_s32 is_right;
+	sljit_sw bit_length = (op & SLJIT_32) ? 32 : 64;
+	sljit_gpr dst_r = gpr(dst_reg);
+	sljit_gpr src1_r = gpr(src1_reg);
+	sljit_gpr src2_r = gpr(src2_reg);
+	sljit_gpr src3_r = tmp1;
+	sljit_ins ins;
+	/* Shifts src1_reg by src3 into dst_reg and combines in bits of src2_reg shifted the opposite way. */
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, dst_reg, src1_reg, src2_reg, src3, src3w));
+
+	is_right = (GET_OPCODE(op) == SLJIT_LSHR || GET_OPCODE(op) == SLJIT_MLSHR);
+	/* With the same register on both sides the operation is delegated to sljit_emit_op2 as a rotate. */
+	if (src1_reg == src2_reg) {
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_op2(compiler, (is_right ? SLJIT_ROTR : SLJIT_ROTL) | (op & SLJIT_32), dst_reg, 0, src1_reg, 0, src3, src3w);
+	}
+
+	ADJUST_LOCAL_OFFSET(src3, src3w);
+	/* Immediate amount: one shift of src1 into dst, then risbg inserts the bits taken from src2. */
+	if (src3 & SLJIT_IMM) {
+		src3w &= bit_length - 1;
+
+		if (src3w == 0)
+			return SLJIT_SUCCESS;
+
+		if (op & SLJIT_32) {
+			if (dst_r == src1_r) {
+				ins = is_right ? 0x88000000 /* srl */ : 0x89000000 /* sll */;
+				FAIL_IF(push_inst(compiler, ins | R20A(dst_r) | (sljit_ins)src3w));
+			} else {
+				ins = is_right ? 0xeb00000000de /* srlk */ : 0xeb00000000df /* sllk */;
+				FAIL_IF(push_inst(compiler, ins | R36A(dst_r) | R32A(src1_r) | ((sljit_ins)src3w << 16)));
+			}
+		} else {
+			ins = is_right ? 0xeb000000000c /* srlg */ : 0xeb000000000d /* sllg */;
+			FAIL_IF(push_inst(compiler, ins | R36A(dst_r) | R32A(src1_r) | ((sljit_ins)src3w << 16)));
+		}
+
+		ins = 0xec0000000055 /* risbg */;
+
+		if (is_right) {
+			src3w = bit_length - src3w;
+			ins |= ((sljit_ins)(64 - bit_length) << 24) | ((sljit_ins)(63 - src3w) << 16) | ((sljit_ins)src3w << 8);
+		} else
+			ins |= ((sljit_ins)(64 - src3w) << 24) | ((sljit_ins)63 << 16) | ((sljit_ins)(src3w + 64 - bit_length) << 8);
+
+		return push_inst(compiler, ins | R36A(dst_r) | R32A(src2_r));
+	}
+	/* Variable amount: it is copied to tmp1 when it lives in dst_r (overwritten below) or in memory. */
+	if (!(src3 & SLJIT_MEM)) {
+		src3_r = gpr(src3);
+
+		if (dst_r == src3_r) {
+			FAIL_IF(push_inst(compiler, 0x1800 /* lr */ | R4A(tmp1) | R0A(src3_r)));
+			src3_r = tmp1;
+		}
+	} else
+		FAIL_IF(load_word(compiler, tmp1, src3, src3w, op & SLJIT_32));
+	/* 32-bit path: dst = src1 shifted by src3; tmp0 = src2 shifted the other way by 1 + (src3 ^ 0x1f); dst |= tmp0. */
+	if (op & SLJIT_32) {
+		if (GET_OPCODE(op) == SLJIT_MSHL || GET_OPCODE(op) == SLJIT_MLSHR) {
+			if (src3_r != tmp1) {
+				FAIL_IF(push_inst(compiler, 0xec0000000055 /* risbg */ | R36A(tmp1) | R32A(src3_r) | (59 << 24) | (1 << 23) | (63 << 16)));
+				src3_r = tmp1;
+			} else
+				FAIL_IF(push_inst(compiler, 0xa5070000 /* nill */ | R20A(tmp1) | 0x1f));
+		}
+
+		if (dst_r == src1_r) {
+			ins = is_right ? 0x88000000 /* srl */ : 0x89000000 /* sll */;
+			FAIL_IF(push_inst(compiler, ins | R20A(dst_r) | R12A(src3_r)));
+		} else {
+			ins = is_right ? 0xeb00000000de /* srlk */ : 0xeb00000000df /* sllk */;
+			FAIL_IF(push_inst(compiler, ins | R36A(dst_r) | R32A(src1_r) | R28A(src3_r)));
+		}
+
+		if (src3_r != tmp1) {
+			FAIL_IF(push_inst(compiler, 0xa50f0000 /* llill */ | R20A(tmp1) | 0x1f));
+			FAIL_IF(push_inst(compiler, 0x1700 /* xr */ | R4A(tmp1) | R0A(src3_r)));
+		} else
+			FAIL_IF(push_inst(compiler, 0xc00700000000 /* xilf */ | R36A(tmp1) | 0x1f));
+
+		ins = is_right ? 0xeb00000000df /* sllk */ : 0xeb00000000de /* srlk */;
+		FAIL_IF(push_inst(compiler, ins | R36A(tmp0) | R32A(src2_r) | R28A(tmp1) | (0x1 << 16)));
+
+		return push_inst(compiler, 0x1600 /* or */ | R4A(dst_r) | R0A(tmp0));
+	}
+	/* 64-bit path: the same scheme with srlg/sllg and a 0x3f mask. */
+	ins = is_right ? 0xeb000000000c /* srlg */ : 0xeb000000000d /* sllg */;
+	FAIL_IF(push_inst(compiler, ins | R36A(dst_r) | R32A(src1_r) | R28A(src3_r)));
+
+	ins = is_right ? 0xeb000000000d /* sllg */ : 0xeb000000000c /* srlg */;
+	/* Without SLJIT_SHIFT_INTO_NON_ZERO src2 is pre-shifted by 1 and then by src3 ^ 0x3f; with it, by -src3 (lcgr). */
+	if (!(op & SLJIT_SHIFT_INTO_NON_ZERO)) {
+		if (src3_r != tmp1)
+			FAIL_IF(push_inst(compiler, 0xa50f0000 /* llill */ | R20A(tmp1) | 0x3f));
+
+		FAIL_IF(push_inst(compiler, ins | R36A(tmp0) | R32A(src2_r) | (0x1 << 16)));
+		src2_r = tmp0;
+
+		if (src3_r != tmp1)
+			FAIL_IF(push_inst(compiler, 0xb9820000 /* xgr */ | R4A(tmp1) | R0A(src3_r)));
+		else
+			FAIL_IF(push_inst(compiler, 0xc00700000000 /* xilf */ | R36A(tmp1) | 0x3f));
+	} else
+		FAIL_IF(push_inst(compiler, 0xb9030000 /* lcgr */ | R4A(tmp1) | R0A(src3_r)));
+
+	FAIL_IF(push_inst(compiler, ins | R36A(tmp0) | R32A(src2_r) | R28A(tmp1)));
+	return push_inst(compiler, 0xb9810000 /* ogr */ | R4A(dst_r) | R0A(tmp0));
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src, sljit_sw srcw)
 {
 	sljit_gpr src_r;
+	struct addr addr;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op_src(compiler, op, src, srcw));
@@ -2826,22 +3070,49 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(
 
 		return push_inst(compiler, br(src_r));
 	case SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN:
-		/* TODO(carenas): implement? */
 		return SLJIT_SUCCESS;
 	case SLJIT_PREFETCH_L1:
 	case SLJIT_PREFETCH_L2:
 	case SLJIT_PREFETCH_L3:
 	case SLJIT_PREFETCH_ONCE:
-		/* TODO(carenas): implement */
-		return SLJIT_SUCCESS;
+		FAIL_IF(make_addr_bxy(compiler, &addr, src, srcw, tmp1));
+		return push_inst(compiler, 0xe31000000036 /* pfd */ | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset));
 	default:
-                /* TODO(carenas): probably should not success by default */
 		return SLJIT_SUCCESS;
 	}
 
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_dst(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw)
+{
+	sljit_gpr dst_r = link_r;
+	sljit_s32 size;
+	/* Writes link_r (SLJIT_FAST_ENTER) or a word read from the stack frame (SLJIT_GET_RETURN_ADDRESS) to dst. */
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op_dst(compiler, op, dst, dstw));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	switch (op) {
+	case SLJIT_FAST_ENTER:
+		if (FAST_IS_REG(dst))
+			return push_inst(compiler, lgr(gpr(dst), link_r));
+		break;
+	case SLJIT_GET_RETURN_ADDRESS:
+		dst_r = FAST_IS_REG(dst) ? gpr(dst) : tmp0;
+		/* The slot read is at SP + local_size + the size computed by GET_SAVED_REGISTERS_SIZE. */
+		size = GET_SAVED_REGISTERS_SIZE(compiler->scratches, compiler->saveds - SLJIT_KEPT_SAVEDS_COUNT(compiler->options), 2);
+		FAIL_IF(load_word(compiler, dst_r, SLJIT_MEM1(SLJIT_SP), compiler->local_size + size, 0));
+		break;
+	}
+	/* Memory destinations receive dst_r: link_r for FAST_ENTER, tmp0 for GET_RETURN_ADDRESS. */
+	if (dst & SLJIT_MEM)
+		return store_word(compiler, dst_r, dst, dstw, 0);
+
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
@@ -2851,7 +3122,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
 {
 	CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
-	return (sljit_s32)fgpr(reg);
+	return (sljit_s32)freg_map[reg];
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
@@ -3120,21 +3391,31 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compil
 	return SLJIT_SUCCESS;
 }
 
-/* --------------------------------------------------------------------- */
-/*  Other instructions                                                   */
-/* --------------------------------------------------------------------- */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg)
 {
+	sljit_gpr gen_r;
+
 	CHECK_ERROR();
-	CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
+	CHECK(check_sljit_emit_fcopy(compiler, op, freg, reg));
 
-	if (FAST_IS_REG(dst))
-		return push_inst(compiler, lgr(gpr(dst), fast_link_r));
+	gen_r = gpr(reg);
 
-	/* memory */
-	return store_word(compiler, fast_link_r, dst, dstw, 0);
+	if (GET_OPCODE(op) == SLJIT_COPY_TO_F64) {
+		if (op & SLJIT_32) {
+			FAIL_IF(push_inst(compiler, 0xeb000000000d /* sllg */ | R36A(tmp0) | R32A(gen_r) | (32 << 16)));
+			gen_r = tmp0;
+		}
+
+		return push_inst(compiler, 0xb3c10000 /* ldgr */ | F4(freg) | R0A(gen_r));
+	}
+
+	FAIL_IF(push_inst(compiler, 0xb3cd0000 /* lgdr */ | R4A(gen_r) | F0(freg)));
+
+	if (!(op & SLJIT_32))
+		return SLJIT_SUCCESS;
+
+	return push_inst(compiler, 0xeb000000000c /* srlg */ | R36A(gen_r) | R32A(gen_r) | (32 << 16));
 }
 
 /* --------------------------------------------------------------------- */
@@ -3174,7 +3455,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compile
 	/* emit jump instruction */
 	type &= 0xff;
 	if (type >= SLJIT_FAST_CALL)
-		PTR_FAIL_IF(push_inst(compiler, brasl(type == SLJIT_FAST_CALL ? fast_link_r : link_r, 0)));
+		PTR_FAIL_IF(push_inst(compiler, brasl(link_r, 0)));
 	else
 		PTR_FAIL_IF(push_inst(compiler, brcl(mask, 0)));
 
@@ -3189,7 +3470,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compile
 	CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
 
 	if (type & SLJIT_CALL_RETURN) {
-		PTR_FAIL_IF(emit_stack_frame_release(compiler));
+		PTR_FAIL_IF(emit_stack_frame_release(compiler, r14));
 		type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
 	}
 
@@ -3215,7 +3496,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compi
 
 	/* emit jump instruction */
 	if (type >= SLJIT_FAST_CALL)
-		return push_inst(compiler, basr(type == SLJIT_FAST_CALL ? fast_link_r : link_r, src_r));
+		return push_inst(compiler, basr(link_r, src_r));
 
 	return push_inst(compiler, br(src_r));
 }
@@ -3233,15 +3514,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 		ADJUST_LOCAL_OFFSET(src, srcw);
 		FAIL_IF(load_word(compiler, tmp1, src, srcw, 0 /* 64-bit */));
 		src = TMP_REG2;
+		srcw = 0;
 	}
 
 	if (type & SLJIT_CALL_RETURN) {
-		if (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0) {
+		if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
 			FAIL_IF(push_inst(compiler, lgr(tmp1, gpr(src))));
 			src = TMP_REG2;
+			srcw = 0;
 		}
 
-		FAIL_IF(emit_stack_frame_release(compiler));
+		FAIL_IF(emit_stack_frame_release(compiler, r14));
 		type = SLJIT_JUMP;
 	}
 
@@ -3323,27 +3606,92 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
 	sljit_s32 dst_reg,
 	sljit_s32 src, sljit_sw srcw)
 {
-	sljit_u8 mask = get_cc(compiler, type);
-	sljit_gpr dst_r = gpr(dst_reg & ~SLJIT_32);
-	sljit_gpr src_r = FAST_IS_REG(src) ? gpr(src) : tmp0;
+	sljit_ins mask = get_cc(compiler, type & ~SLJIT_32);
+	sljit_gpr src_r;
+	sljit_ins ins;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
 
-	if (src & SLJIT_IMM) {
-		/* TODO(mundaym): fast path with lscond2 */
-		FAIL_IF(push_load_imm_inst(compiler, src_r, srcw));
+	if (type & SLJIT_32)
+		srcw = (sljit_s32)srcw;
+
+	if (have_lscond2() && (src & SLJIT_IMM) && is_s16(srcw)) {
+		ins = (type & SLJIT_32) ? 0xec0000000042 /* lochi */ : 0xec0000000046 /* locghi */;
+		return push_inst(compiler, ins | R36A(gpr(dst_reg)) | (mask << 32) | (sljit_ins)(srcw & 0xffff) << 16);
 	}
 
-	#define LEVAL(i) i(dst_r, src_r, mask)
-	if (have_lscond1())
-		return push_inst(compiler,
-			WHEN2(dst_reg & SLJIT_32, locr, locgr));
+	if (src & SLJIT_IMM) {
+		FAIL_IF(push_load_imm_inst(compiler, tmp0, srcw));
+		src_r = tmp0;
+	} else
+		src_r = gpr(src);
 
-	#undef LEVAL
+	if (have_lscond1()) {
+		ins = (type & SLJIT_32) ? 0xb9f20000 /* locr */ : 0xb9e20000 /* locgr */;
+		return push_inst(compiler, ins | (mask << 12) | R4A(gpr(dst_reg)) | R0A(src_r));
+	}
 
-	/* TODO(mundaym): implement */
-	return SLJIT_ERR_UNSUPPORTED;
+	return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_ins ins, reg1, reg2, base, offs = 0;
+	/* Emits a 64-bit load, or a store when SLJIT_MEM_STORE is set in type, of a register pair at (mem, memw). */
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+	/* Requests that do not name a register pair are forwarded to sljit_emit_mem_unaligned. */
+	if (!(reg & REG_PAIR_MASK))
+		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+
+	ADJUST_LOCAL_OFFSET(mem, memw);
+
+	base = gpr(mem & REG_MASK);
+	reg1 = gpr(REG_PAIR_FIRST(reg));
+	reg2 = gpr(REG_PAIR_SECOND(reg));
+
+	if (mem & OFFS_REG_MASK) {
+		memw &= 0x3;
+		offs = gpr(OFFS_REG(mem));
+		/* A scaled index goes through tmp1. If a load would overwrite both address registers, form base + offs in tmp1 first. */
+		if (memw != 0) {
+			FAIL_IF(push_inst(compiler, 0xeb000000000d /* sllg */ | R36A(tmp1) | R32A(offs) | ((sljit_ins)memw << 16)));
+			offs = tmp1;
+		} else if (!(type & SLJIT_MEM_STORE) && (base == reg1 || base == reg2) && (offs == reg1 || offs == reg2)) {
+			FAIL_IF(push_inst(compiler, 0xb9e80000 /* agrk */ | R12A(offs) | R4A(tmp1) | R0A(base)));
+			base = tmp1;
+			offs = 0;
+		}
+
+		memw = 0;
+	} else if (memw < -0x80000 || memw > 0x7ffff - ((reg2 == reg1 + 1) ? 0 : SSIZE_OF(sw))) {
+		FAIL_IF(push_load_imm_inst(compiler, tmp1, memw));
+		/* The displacement does not fit in 20 signed bits: it is materialized in tmp1 and used as base or index. */
+		if (base == 0)
+			base = tmp1;
+		else
+			offs = tmp1;
+
+		memw = 0;
+	}
+	/* Consecutive registers without an index register use a single lmg/stmg. */
+	if (offs == 0 && reg2 == (reg1 + 1)) {
+		ins = (type & SLJIT_MEM_STORE) ? 0xeb0000000024 /* stmg */ : 0xeb0000000004 /* lmg */;
+		return push_inst(compiler, ins | R36A(reg1) | R32A(reg2) | R28A(base) | disp_s20((sljit_s32)memw));
+	}
+
+	ins = ((type & SLJIT_MEM_STORE) ? 0xe30000000024 /* stg */ : 0xe30000000004 /* lg */) | R32A(offs) | R28A(base);
+	/* When loading, fetch reg2 first if reg1 is the base or the index, so the address survives the first load. */
+	if (!(type & SLJIT_MEM_STORE) && (base == reg1 || offs == reg1)) {
+		FAIL_IF(push_inst(compiler, ins | R36A(reg2) | disp_s20((sljit_s32)memw + SSIZE_OF(sw))));
+		return push_inst(compiler, ins | R36A(reg1) | disp_s20((sljit_s32)memw));
+	}
+
+	FAIL_IF(push_inst(compiler, ins | R36A(reg1) | disp_s20((sljit_s32)memw)));
+	return push_inst(compiler, ins | R36A(reg2) | disp_s20((sljit_s32)memw + SSIZE_OF(sw)));
+}
 
 /* --------------------------------------------------------------------- */
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeX86_32.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeX86_32.c
index 2a45a2ca6c..69c917101f 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeX86_32.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeX86_32.c
@@ -114,7 +114,7 @@ static sljit_u8* emit_x86_instruction(struct sljit_compiler *compiler, sljit_uw
 				inst_size += 4;
 		}
 		else if (flags & EX86_SHIFT_INS) {
-			imma &= 0x1f;
+			SLJIT_ASSERT(imma <= 0x1f);
 			if (imma != 1) {
 				inst_size++;
 				flags |= EX86_BYTE_ARG;
@@ -528,20 +528,27 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 	return SLJIT_SUCCESS;
 }
 
-static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
+static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 is_return_to)
 {
 	sljit_s32 kept_saveds_count = SLJIT_KEPT_SAVEDS_COUNT(compiler->options);
-	sljit_s32 saveds;
+	sljit_s32 local_size, saveds;
 	sljit_uw size;
 	sljit_u8 *inst;
 
-	BINARY_IMM32(ADD, compiler->local_size, SLJIT_SP, 0);
-
 	size = (sljit_uw)((compiler->scratches > 9 ? (compiler->scratches - 9) : 0) +
 		(compiler->saveds <= 3 ? compiler->saveds : 3) - kept_saveds_count);
 
+	local_size = compiler->local_size;
+
 	if (!(compiler->options & SLJIT_ENTER_REG_ARG))
 		size++;
+	else if (is_return_to && size == 0) {
+		local_size += SSIZE_OF(sw);
+		is_return_to = 0;
+	}
+
+	if (local_size > 0)
+		BINARY_IMM32(ADD, local_size, SLJIT_SP, 0);
 
 	if (size == 0)
 		return SLJIT_SUCCESS;
@@ -563,6 +570,9 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
 	if (!(compiler->options & SLJIT_ENTER_REG_ARG))
 		POP_REG(reg_map[TMP_REG1]);
 
+	if (is_return_to)
+		BINARY_IMM32(ADD, sizeof(sljit_sw), SLJIT_SP, 0);
+
 	return SLJIT_SUCCESS;
 }
 
@@ -576,7 +586,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler
 	SLJIT_ASSERT(compiler->args_size >= 0);
 	SLJIT_ASSERT(compiler->local_size > 0);
 
-	FAIL_IF(emit_stack_frame_release(compiler));
+	FAIL_IF(emit_stack_frame_release(compiler, 0));
 
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
 	FAIL_IF(!inst);
@@ -585,6 +595,31 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_s32 src_r;
+	/* Releases the current stack frame, then emits an indirect SLJIT_JUMP to src. */
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+	/* Memory operands and registers above SLJIT_R2 (up to the last non-kept saved register) are copied to src_r before the release. */
+	if ((src & SLJIT_MEM) || (src > SLJIT_R2 && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options)))) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		CHECK_EXTRA_REGS(src, srcw, (void)0);
+		/* emit_stack_frame_release() pops into TMP_REG1 when SLJIT_ENTER_REG_ARG is not set; presumably why SLJIT_R1 is used then. */
+		src_r = (compiler->options & SLJIT_ENTER_REG_ARG) ? TMP_REG1 : SLJIT_R1;
+
+		EMIT_MOV(compiler, src_r, 0, src, srcw);
+		src = src_r;
+		srcw = 0;
+	}
+	/* is_return_to = 1 asks the release to also add one extra machine word to SLJIT_SP. */
+	FAIL_IF(emit_stack_frame_release(compiler, 1));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Call / return instructions                                           */
 /* --------------------------------------------------------------------- */
@@ -796,7 +831,7 @@ static sljit_s32 tail_call_with_args(struct sljit_compiler *compiler,
 			types >>= SLJIT_ARG_SHIFT;
 		}
 
-		return emit_stack_frame_release(compiler);
+		return emit_stack_frame_release(compiler, 0);
 	}
 
 	stack_size = args_size + SSIZE_OF(sw);
@@ -971,7 +1006,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compile
 	if (type & SLJIT_CALL_RETURN) {
 		if ((type & 0xff) == SLJIT_CALL_REG_ARG) {
 			PTR_FAIL_IF(tail_call_reg_arg_with_args(compiler, arg_types));
-			PTR_FAIL_IF(emit_stack_frame_release(compiler));
+			PTR_FAIL_IF(emit_stack_frame_release(compiler, 0));
 
 			SLJIT_SKIP_CHECKS(compiler);
 			return sljit_emit_jump(compiler, SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP));
@@ -1022,7 +1057,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 		if ((type & 0xff) == SLJIT_CALL_REG_ARG) {
 			FAIL_IF(tail_call_reg_arg_with_args(compiler, arg_types));
 
-			if ((src & SLJIT_MEM) || (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0)) {
+			if ((src & SLJIT_MEM) || (src > SLJIT_R2 && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options)))) {
 				ADJUST_LOCAL_OFFSET(src, srcw);
 				CHECK_EXTRA_REGS(src, srcw, (void)0);
 
@@ -1031,7 +1066,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 				srcw = 0;
 			}
 
-			FAIL_IF(emit_stack_frame_release(compiler));
+			FAIL_IF(emit_stack_frame_release(compiler, 0));
 
 			SLJIT_SKIP_CHECKS(compiler);
 			return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
@@ -1080,14 +1115,37 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 	return post_call_with_args(compiler, arg_types, stack_size);
 }
 
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
+static SLJIT_INLINE sljit_s32 emit_fmov_before_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
+{
+	sljit_u8* inst;
+	/* Emits the move of a floating point return value. With SLJIT_ENTER_REG_ARG it goes to SLJIT_RETURN_FREG (nothing to do for SLJIT_FR0). */
+	if (compiler->options & SLJIT_ENTER_REG_ARG) {
+		if (src == SLJIT_FR0)
+			return SLJIT_SUCCESS;
+
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_fop1(compiler, op, SLJIT_RETURN_FREG, 0, src, srcw);
+	}
+	/* Otherwise the value is loaded with FLDS / FLDL from memory; a register source is first stored to [SLJIT_SP]. */
+	if (FAST_IS_REG(src)) {
+		FAIL_IF(emit_sse2_store(compiler, op & SLJIT_32, SLJIT_MEM1(SLJIT_SP), 0, src));
+
+		src = SLJIT_MEM1(SLJIT_SP);
+		srcw = 0;
+	} else {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+	}
+
+	inst = emit_x86_instruction(compiler, 1 | EX86_SSE2_OP1, 0, 0, src, srcw);
+	FAIL_IF(!inst); /* Do not write the opcode through a NULL pointer when buffer allocation failed. */
+	*inst = (op & SLJIT_32) ? FLDS : FLDL;
+	return SLJIT_SUCCESS;
+}
+
+static sljit_s32 emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
 {
 	sljit_u8 *inst;
 
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
-
 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
 
 	if (FAST_IS_REG(dst)) {
@@ -1103,7 +1161,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *
 	/* Memory. */
 	inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
 	FAIL_IF(!inst);
-	*inst++ = POP_rm;
+	*inst = POP_rm;
 	return SLJIT_SUCCESS;
 }
 
@@ -1123,8 +1181,8 @@ static sljit_s32 emit_fast_return(struct sljit_compiler *compiler, sljit_s32 src
 	else {
 		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
 		FAIL_IF(!inst);
-		*inst++ = GROUP_FF;
-		*inst |= PUSH_rm;
+		inst[0] = GROUP_FF;
+		inst[1] |= PUSH_rm;
 
 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
 		FAIL_IF(!inst);
@@ -1135,8 +1193,24 @@ static sljit_s32 emit_fast_return(struct sljit_compiler *compiler, sljit_s32 src
 	return SLJIT_SUCCESS;
 }
 
+static sljit_s32 sljit_emit_get_return_address(struct sljit_compiler *compiler,
+	sljit_s32 dst, sljit_sw dstw)
+{
+	sljit_s32 options = compiler->options;
+	sljit_s32 saveds = compiler->saveds;
+	sljit_s32 scratches = compiler->scratches;
+	/* saveds is reused as a byte count: the register count also computed by emit_stack_frame_release(), times the word size. */
+	saveds = ((scratches > 9 ? (scratches - 9) : 0) + (saveds <= 3 ? saveds : 3) - SLJIT_KEPT_SAVEDS_COUNT(options)) * SSIZE_OF(sw);
+
+	/* One more word when SLJIT_ENTER_REG_ARG is not set: ebp is saved as well. */
+	if (!(options & SLJIT_ENTER_REG_ARG))
+		saveds += SSIZE_OF(sw);
+	/* Moves the word at SLJIT_SP + local_size + saved register bytes into dst (per the function name, the return address slot). */
+	return emit_mov(compiler, dst, dstw, SLJIT_MEM1(SLJIT_SP), compiler->local_size + saveds);
+}
+
 /* --------------------------------------------------------------------- */
-/*  Memory operations                                                    */
+/*  Other operations                                                     */
 /* --------------------------------------------------------------------- */
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
@@ -1150,12 +1224,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
 
-	if (!(reg & REG_PAIR_MASK)) {
-		if (type & (SLJIT_MEM_PRE | SLJIT_MEM_POST))
-			return SLJIT_ERR_UNSUPPORTED;
-
+	if (!(reg & REG_PAIR_MASK))
 		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
-	}
 
 	ADJUST_LOCAL_OFFSET(mem, memw);
 
@@ -1221,6 +1291,79 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg)
+{
+	sljit_u8 *inst;
+	sljit_s32 reg2 = 0;
+	sljit_sw regw = 0, reg2w = 0;
+	/* Emits a raw bit copy between freg and reg (or a register pair); every emitted instruction is checked for allocation failure. */
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fcopy(compiler, op, freg, reg));
+	/* A register pair is split into reg (first) and reg2 (second); reg is cleared when both name the same register. */
+	if (reg & REG_PAIR_MASK) {
+		reg2 = REG_PAIR_SECOND(reg);
+		reg = REG_PAIR_FIRST(reg);
+
+		if (reg == reg2)
+			reg = 0;
+
+		CHECK_EXTRA_REGS(reg2, reg2w, (void)0);
+	}
+
+	CHECK_EXTRA_REGS(reg, regw, (void)0);
+	/* SLJIT_32: a single MOVD, its direction chosen by the opcode. */
+	if (op & SLJIT_32) {
+		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, reg, regw);
+		FAIL_IF(!inst);
+		inst[0] = GROUP_0F;
+		inst[1] = GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x;
+		return SLJIT_SUCCESS;
+	}
+	/* 64 bit data: PSHUFD (imm 1) fills TMP_FREG from freg for SLJIT_COPY_FROM_F64; otherwise, when reg is set, MOVD loads TMP_FREG from reg. */
+	if (op == SLJIT_COPY_FROM_F64) {
+		inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
+		FAIL_IF(!inst);
+		INC_SIZE(5);
+
+		inst[0] = GROUP_66;
+		inst[1] = GROUP_0F;
+		inst[2] = PSHUFD_x_xm;
+		inst[3] = U8(MOD_REG | (TMP_FREG << 3) | freg);
+		inst[4] = 1;
+	} else if (reg != 0) {
+		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, 0, reg, regw);
+		FAIL_IF(!inst);
+		inst[0] = GROUP_0F;
+		inst[1] = MOVD_x_rm;
+	}
+
+	if (reg2 != 0) {
+		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, reg2, reg2w);
+		FAIL_IF(!inst);
+		inst[0] = GROUP_0F;
+		inst[1] = GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x;
+	}
+
+	if (GET_OPCODE(op) == SLJIT_COPY_TO_F64) {
+		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
+		FAIL_IF(!inst);
+		INC_SIZE(4);
+
+		inst[0] = GROUP_66;
+		inst[1] = GROUP_0F;
+		inst[2] = PUNPCKLDQ_x_xm;
+		inst[3] = U8(MOD_REG | (freg << 3) | (reg == 0 ? freg : TMP_FREG));
+	} else {
+		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, 0, reg, regw);
+		FAIL_IF(!inst);
+		inst[0] = GROUP_0F;
+		inst[1] = MOVD_rm_x;
+	}
+
+	return SLJIT_SUCCESS;
+}
+
 static sljit_s32 skip_frames_before_return(struct sljit_compiler *compiler)
 {
 	sljit_sw size;
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeX86_64.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeX86_64.c
index 7b0d7cea4a..b5efc1fda1 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeX86_64.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeX86_64.c
@@ -37,9 +37,9 @@ static sljit_s32 emit_load_imm64(struct sljit_compiler *compiler, sljit_s32 reg,
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2 + sizeof(sljit_sw));
 	FAIL_IF(!inst);
 	INC_SIZE(2 + sizeof(sljit_sw));
-	*inst++ = REX_W | ((reg_map[reg] <= 7) ? 0 : REX_B);
-	*inst++ = U8(MOV_r_i32 | (reg_map[reg] & 0x7));
-	sljit_unaligned_store_sw(inst, imm);
+	inst[0] = REX_W | ((reg_map[reg] <= 7) ? 0 : REX_B);
+	inst[1] = U8(MOV_r_i32 | (reg_map[reg] & 0x7));
+	sljit_unaligned_store_sw(inst + 2, imm);
 	return SLJIT_SUCCESS;
 }
 
@@ -157,7 +157,7 @@ static sljit_u8* emit_x86_instruction(struct sljit_compiler *compiler, sljit_uw
 				inst_size += 4;
 		}
 		else if (flags & EX86_SHIFT_INS) {
-			imma &= compiler->mode32 ? 0x1f : 0x3f;
+			SLJIT_ASSERT(imma <= (compiler->mode32 ? 0x1f : 0x3f));
 			if (imma != 1) {
 				inst_size++;
 				flags |= EX86_BYTE_ARG;
@@ -196,6 +196,7 @@ static sljit_u8* emit_x86_instruction(struct sljit_compiler *compiler, sljit_uw
 		*inst++ = 0x66;
 	if (rex)
 		*inst++ = rex;
+
 	buf_ptr = inst + size;
 
 	/* Encode mod/rm byte. */
@@ -370,6 +371,12 @@ static sljit_u8* generate_put_label_code(struct sljit_put_label *put_label, slji
 	return code_ptr;
 }
 
+#ifdef _WIN64
+typedef struct { /* Named (as sse2_reg) in the GET_SAVED_FLOAT_REGISTERS_SIZE() calls of this file. */
+	sljit_sw regs[2]; /* Two machine words. */
+} sljit_sse2_reg;
+#endif /* _WIN64 */
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
 	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
@@ -423,7 +430,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
 
 #ifdef _WIN64
 	local_size += SLJIT_LOCALS_OFFSET;
-	saved_float_regs_size = GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, 16);
+	saved_float_regs_size = GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sse2_reg);
 
 	if (saved_float_regs_size > 0) {
 		saved_float_regs_offset = ((local_size + 0xf) & ~0xf);
@@ -533,15 +540,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
 		tmp = SLJIT_FS0 - fsaveds;
 		for (i = SLJIT_FS0; i > tmp; i--) {
 			inst = emit_x86_instruction(compiler, 2 | EX86_SSE2, i, 0, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset);
-			*inst++ = GROUP_0F;
-			*inst = MOVAPS_xm_x;
+			inst[0] = GROUP_0F;
+			inst[1] = MOVAPS_xm_x;
 			saved_float_regs_offset += 16;
 		}
 
 		for (i = fscratches; i >= SLJIT_FIRST_SAVED_FLOAT_REG; i--) {
 			inst = emit_x86_instruction(compiler, 2 | EX86_SSE2, i, 0, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset);
-			*inst++ = GROUP_0F;
-			*inst = MOVAPS_xm_x;
+			inst[0] = GROUP_0F;
+			inst[1] = MOVAPS_xm_x;
 			saved_float_regs_offset += 16;
 		}
 	}
@@ -565,7 +572,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 
 #ifdef _WIN64
 	local_size += SLJIT_LOCALS_OFFSET;
-	saved_float_regs_size = GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, 16);
+	saved_float_regs_size = GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sse2_reg);
 
 	if (saved_float_regs_size > 0)
 		local_size = ((local_size + 0xf) & ~0xf) + saved_float_regs_size;
@@ -579,10 +586,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 	return SLJIT_SUCCESS;
 }
 
-static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
+static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 is_return_to)
 {
 	sljit_uw size;
-	sljit_s32 i, tmp;
+	sljit_s32 local_size, i, tmp;
 	sljit_u8 *inst;
 #ifdef _WIN64
 	sljit_s32 saved_float_regs_offset;
@@ -591,7 +598,7 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
 #endif /* _WIN64 */
 
 #ifdef _WIN64
-	saved_float_regs_offset = GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, 16);
+	saved_float_regs_offset = GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sse2_reg);
 
 	if (saved_float_regs_offset > 0) {
 		compiler->mode32 = 1;
@@ -600,41 +607,32 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
 		tmp = SLJIT_FS0 - fsaveds;
 		for (i = SLJIT_FS0; i > tmp; i--) {
 			inst = emit_x86_instruction(compiler, 2 | EX86_SSE2, i, 0, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset);
-			*inst++ = GROUP_0F;
-			*inst = MOVAPS_x_xm;
+			inst[0] = GROUP_0F;
+			inst[1] = MOVAPS_x_xm;
 			saved_float_regs_offset += 16;
 		}
 
 		for (i = fscratches; i >= SLJIT_FIRST_SAVED_FLOAT_REG; i--) {
 			inst = emit_x86_instruction(compiler, 2 | EX86_SSE2, i, 0, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset);
-			*inst++ = GROUP_0F;
-			*inst = MOVAPS_x_xm;
+			inst[0] = GROUP_0F;
+			inst[1] = MOVAPS_x_xm;
 			saved_float_regs_offset += 16;
 		}
+
+		compiler->mode32 = 0;
 	}
 #endif /* _WIN64 */
 
-	if (compiler->local_size > 0) {
-		if (compiler->local_size <= 127) {
-			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
-			FAIL_IF(!inst);
-			INC_SIZE(4);
-			*inst++ = REX_W;
-			*inst++ = GROUP_BINARY_83;
-			*inst++ = MOD_REG | ADD | 4;
-			*inst = U8(compiler->local_size);
-		}
-		else {
-			inst = (sljit_u8*)ensure_buf(compiler, 1 + 7);
-			FAIL_IF(!inst);
-			INC_SIZE(7);
-			*inst++ = REX_W;
-			*inst++ = GROUP_BINARY_81;
-			*inst++ = MOD_REG | ADD | 4;
-			sljit_unaligned_store_s32(inst, compiler->local_size);
-		}
+	local_size = compiler->local_size;
+
+	if (is_return_to && compiler->scratches < SLJIT_FIRST_SAVED_REG && (compiler->saveds == SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+		local_size += SSIZE_OF(sw);
+		is_return_to = 0;
 	}
 
+	if (local_size > 0)
+		BINARY_IMM32(ADD, local_size, SLJIT_SP, 0);
+
 	tmp = compiler->scratches;
 	for (i = SLJIT_FIRST_SAVED_REG; i <= tmp; i++) {
 		size = reg_map[i] >= 8 ? 2 : 1;
@@ -657,6 +655,9 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
 		POP_REG(reg_lmap[i]);
 	}
 
+	if (is_return_to)
+		BINARY_IMM32(ADD, sizeof(sljit_sw), SLJIT_SP, 0);
+
 	return SLJIT_SUCCESS;
 }
 
@@ -667,7 +668,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_return_void(compiler));
 
-	FAIL_IF(emit_stack_frame_release(compiler));
+	compiler->mode32 = 0;
+
+	FAIL_IF(emit_stack_frame_release(compiler, 0));
 
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
 	FAIL_IF(!inst);
@@ -676,6 +679,28 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+	/* Releases the current stack frame, then emits an indirect SLJIT_JUMP to src; mode32 is cleared for the emitted instructions. */
+	compiler->mode32 = 0;
+	/* Memory operands and registers from SLJIT_FIRST_SAVED_REG up to the last non-kept saved register are copied to TMP_REG2 before the release. */
+	if ((src & SLJIT_MEM) || (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options)))) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+
+		EMIT_MOV(compiler, TMP_REG2, 0, src, srcw);
+		src = TMP_REG2;
+		srcw = 0;
+	}
+	/* is_return_to = 1 asks the release to also add one extra machine word to SLJIT_SP. */
+	FAIL_IF(emit_stack_frame_release(compiler, 1));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Call / return instructions                                           */
 /* --------------------------------------------------------------------- */
@@ -803,7 +828,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compile
 		PTR_FAIL_IF(call_with_args(compiler, arg_types, NULL));
 
 	if (type & SLJIT_CALL_RETURN) {
-		PTR_FAIL_IF(emit_stack_frame_release(compiler));
+		PTR_FAIL_IF(emit_stack_frame_release(compiler, 0));
 		type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
 	}
 
@@ -827,12 +852,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 	}
 
 	if (type & SLJIT_CALL_RETURN) {
-		if (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0) {
+		if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
 			EMIT_MOV(compiler, TMP_REG2, 0, src, srcw);
 			src = TMP_REG2;
 		}
 
-		FAIL_IF(emit_stack_frame_release(compiler));
+		FAIL_IF(emit_stack_frame_release(compiler, 0));
 	}
 
 	if ((type & 0xff) != SLJIT_CALL_REG_ARG)
@@ -845,14 +870,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compi
 	return sljit_emit_ijump(compiler, type, src, srcw);
 }
 
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
+static sljit_s32 emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
 {
 	sljit_u8 *inst;
 
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
-
 	if (FAST_IS_REG(dst)) {
 		if (reg_map[dst] < 8) {
 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
@@ -874,7 +895,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *
 	compiler->mode32 = 1;
 	inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
 	FAIL_IF(!inst);
-	*inst++ = POP_rm;
+	*inst = POP_rm;
 	return SLJIT_SUCCESS;
 }
 
@@ -904,8 +925,8 @@ static sljit_s32 emit_fast_return(struct sljit_compiler *compiler, sljit_s32 src
 		compiler->mode32 = 1;
 		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
 		FAIL_IF(!inst);
-		*inst++ = GROUP_FF;
-		*inst |= PUSH_rm;
+		inst[0] = GROUP_FF;
+		inst[1] |= PUSH_rm;
 
 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
 		FAIL_IF(!inst);
@@ -916,8 +937,18 @@ static sljit_s32 emit_fast_return(struct sljit_compiler *compiler, sljit_s32 src
 	return SLJIT_SUCCESS;
 }
 
+static sljit_s32 sljit_emit_get_return_address(struct sljit_compiler *compiler,
+	sljit_s32 dst, sljit_sw dstw)
+{
+	sljit_s32 saved_regs_size;
+	/* Moves the word at SLJIT_SP + local_size + saved register bytes into dst (per the function name, the return address slot). */
+	compiler->mode32 = 0;
+	saved_regs_size = GET_SAVED_REGISTERS_SIZE(compiler->scratches, compiler->saveds - SLJIT_KEPT_SAVEDS_COUNT(compiler->options), 0);
+	return emit_mov(compiler, dst, dstw, SLJIT_MEM1(SLJIT_SP), compiler->local_size + saved_regs_size);
+}
+
 /* --------------------------------------------------------------------- */
-/*  Memory operations                                                    */
+/*  Other operations                                                     */
 /* --------------------------------------------------------------------- */
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
@@ -931,12 +962,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
 
-	if (!(reg & REG_PAIR_MASK)) {
-		if (type & (SLJIT_MEM_PRE | SLJIT_MEM_POST))
-			return SLJIT_ERR_UNSUPPORTED;
-
+	if (!(reg & REG_PAIR_MASK))
 		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
-	}
 
 	ADJUST_LOCAL_OFFSET(mem, memw);
 
@@ -1004,10 +1031,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
 	return SLJIT_SUCCESS;
 }
 
-/* --------------------------------------------------------------------- */
-/*  Extend input                                                         */
-/* --------------------------------------------------------------------- */
-
 static sljit_s32 emit_mov_int(struct sljit_compiler *compiler, sljit_s32 sign,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src, sljit_sw srcw)
@@ -1043,7 +1066,7 @@ static sljit_s32 emit_mov_int(struct sljit_compiler *compiler, sljit_s32 sign,
 		if (sign) {
 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src, srcw);
 			FAIL_IF(!inst);
-			*inst++ = MOVSXD_r_rm;
+			*inst = MOVSXD_r_rm;
 		} else {
 			compiler->mode32 = 1;
 			FAIL_IF(emit_mov(compiler, dst_r, 0, src, srcw));
@@ -1062,6 +1085,41 @@ static sljit_s32 emit_mov_int(struct sljit_compiler *compiler, sljit_s32 sign,
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 freg, sljit_s32 reg)
+{
+	sljit_u8 *inst;
+	sljit_u32 size;
+	sljit_u8 rex = 0;
+	/* Emits one register-form MOVD between freg and reg: 0x66, optional REX, 0x0f, opcode, ModRM. */
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fcopy(compiler, op, freg, reg));
+	/* REX.W is set unless SLJIT_32 is requested; REX.R / REX.B when the mapped register number is 8 or above. */
+	if (!(op & SLJIT_32))
+		rex = REX_W;
+
+	if (freg_map[freg] >= 8)
+		rex |= REX_R;
+
+	if (reg_map[reg] >= 8)
+		rex |= REX_B;
+
+	size = (rex != 0) ? 5 : 4; /* The REX byte is only emitted when at least one of its bits is set. */
+
+	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
+	FAIL_IF(!inst);
+	INC_SIZE(size);
+
+	*inst++ = GROUP_66;
+	if (rex != 0)
+		*inst++ = rex;
+	inst[0] = GROUP_0F;
+	inst[1] = GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x; /* Opcode selects the copy direction. */
+	inst[2] = U8(reg_lmap[reg] | (freg_lmap[freg] << 3) | MOD_REG); /* ModRM: freg in the reg field, reg in the r/m field. */
+
+	return SLJIT_SUCCESS;
+}
+
 static sljit_s32 skip_frames_before_return(struct sljit_compiler *compiler)
 {
 	sljit_s32 tmp, size;
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeX86_common.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeX86_common.c
index e595538bce..a9645bc175 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeX86_common.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitNativeX86_common.c
@@ -69,7 +69,7 @@ SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
 
 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
-	0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 7, 6, 3, 4, 5
+	0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 5, 7, 6, 4, 3
 };
 
 #define CHECK_EXTRA_REGS(p, w, do) \
@@ -174,6 +174,8 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
 #define AND_rm_r	0x21
 #define ANDPD_x_xm	0x54
 #define BSR_r_rm	(/* GROUP_0F */ 0xbd)
+#define BSF_r_rm	(/* GROUP_0F */ 0xbc)
+#define BSWAP_r		(/* GROUP_0F */ 0xc8)
 #define CALL_i32	0xe8
 #define CALL_rm		(/* GROUP_FF */ 2 << 3)
 #define CDQ		0x99
@@ -187,6 +189,8 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
 #define CVTTSD2SI_r_xm	0x2c
 #define DIV		(/* GROUP_F7 */ 6 << 3)
 #define DIVSD_x_xm	0x5e
+#define FLDS		0xd9
+#define FLDL		0xdd
 #define FSTPS		0xd9
 #define FSTPD		0xdd
 #define INT3		0xcc
@@ -202,6 +206,7 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
 #define JMP_rm		(/* GROUP_FF */ 4 << 3)
 #define LEA_r_m		0x8d
 #define LOOP_i8		0xe2
+#define LZCNT_r_rm	(/* GROUP_F3 */ /* GROUP_0F */ 0xbd)
 #define MOV_r_rm	0x8b
 #define MOV_r_i32	0xb8
 #define MOV_rm_r	0x89
@@ -210,6 +215,8 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
 #define MOV_rm8_r8	0x88
 #define MOVAPS_x_xm	0x28
 #define MOVAPS_xm_x	0x29
+#define MOVD_x_rm	0x6e
+#define MOVD_rm_x	0x7e
 #define MOVSD_x_xm	0x10
 #define MOVSD_xm_x	0x11
 #define MOVSXD_r_rm	0x63
@@ -231,10 +238,14 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
 #define POP_rm		0x8f
 #define POPF		0x9d
 #define PREFETCH	0x18
+#define PSHUFD_x_xm	0x70
+#define PUNPCKLDQ_x_xm	0x62
 #define PUSH_i32	0x68
 #define PUSH_r		0x50
 #define PUSH_rm		(/* GROUP_FF */ 6 << 3)
 #define PUSHF		0x9c
+#define ROL		(/* SHIFT */ 0 << 3)
+#define ROR		(/* SHIFT */ 1 << 3)
 #define RET_near	0xc3
 #define RET_i16		0xc2
 #define SBB		(/* BINARY */ 3 << 3)
@@ -243,6 +254,8 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
 #define SBB_rm_r	0x19
 #define SAR		(/* SHIFT */ 7 << 3)
 #define SHL		(/* SHIFT */ 4 << 3)
+#define SHLD		(/* GROUP_0F */ 0xa5)
+#define SHRD		(/* GROUP_0F */ 0xad)
 #define SHR		(/* SHIFT */ 5 << 3)
 #define SUB		(/* BINARY */ 5 << 3)
 #define SUB_EAX_i32	0x2d
@@ -251,6 +264,7 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
 #define SUBSD_x_xm	0x5c
 #define TEST_EAX_i32	0xa9
 #define TEST_rm_r	0x85
+#define TZCNT_r_rm	(/* GROUP_F3 */ /* GROUP_0F */ 0xbc)
 #define UCOMISD_x_xm	0x2e
 #define UNPCKLPD_x_xm	0x14
 #define XCHG_EAX_r	0x90
@@ -262,6 +276,8 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
 #define XORPD_x_xm	0x57
 
 #define GROUP_0F	0x0f
+#define GROUP_66	0x66
+#define GROUP_F3	0xf3
 #define GROUP_F7	0xf7
 #define GROUP_FF	0xff
 #define GROUP_BINARY_81	0x81
@@ -283,10 +299,15 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
 /* Multithreading does not affect these static variables, since they store
    built-in CPU features. Therefore they can be overwritten by different threads
    if they detect the CPU features in the same time. */
+#define CPU_FEATURE_DETECTED		0x001
 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
-static sljit_s32 cpu_has_sse2 = -1;
+#define CPU_FEATURE_SSE2		0x002
 #endif
-static sljit_s32 cpu_has_cmov = -1;
+#define CPU_FEATURE_LZCNT		0x004
+#define CPU_FEATURE_TZCNT		0x008
+#define CPU_FEATURE_CMOV		0x010
+
+static sljit_u32 cpu_feature_list = 0;
 
 #ifdef _WIN32_WCE
 #include <cmnintrin.h>
@@ -319,17 +340,64 @@ static SLJIT_INLINE void sljit_unaligned_store_sw(void *addr, sljit_sw value)
 
 static void get_cpu_features(void)
 {
-	sljit_u32 features;
+	sljit_u32 feature_list = CPU_FEATURE_DETECTED;
+	sljit_u32 value;
 
 #if defined(_MSC_VER) && _MSC_VER >= 1400
 
 	int CPUInfo[4];
-	__cpuid(CPUInfo, 1);
-	features = (sljit_u32)CPUInfo[3];
 
-#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)
+	__cpuid(CPUInfo, 0);
+	if (CPUInfo[0] >= 7) {
+		__cpuidex(CPUInfo, 7, 0);
+		if (CPUInfo[1] & 0x8)
+			feature_list |= CPU_FEATURE_TZCNT;
+	}
+
+	__cpuid(CPUInfo, (int)0x80000001);
+	if (CPUInfo[2] & 0x20)
+		feature_list |= CPU_FEATURE_LZCNT;
+
+	__cpuid(CPUInfo, 1);
+	value = (sljit_u32)CPUInfo[3];
+
+#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C) || defined(__TINYC__)
 
 	/* AT&T syntax. */
+	__asm__ (
+		"movl $0x0, %%eax\n"
+		"lzcnt %%eax, %%eax\n"
+		"setnz %%al\n"
+		"movl %%eax, %0\n"
+		: "=g" (value)
+		:
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+		: "eax"
+#else
+		: "rax"
+#endif
+	);
+
+	if (value & 0x1)
+		feature_list |= CPU_FEATURE_LZCNT;
+
+	__asm__ (
+		"movl $0x0, %%eax\n"
+		"tzcnt %%eax, %%eax\n"
+		"setnz %%al\n"
+		"movl %%eax, %0\n"
+		: "=g" (value)
+		:
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+		: "eax"
+#else
+		: "rax"
+#endif
+	);
+
+	if (value & 0x1)
+		feature_list |= CPU_FEATURE_TZCNT;
+
 	__asm__ (
 		"movl $0x1, %%eax\n"
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
@@ -342,30 +410,54 @@ static void get_cpu_features(void)
 		"pop %%ebx\n"
 #endif
 		"movl %%edx, %0\n"
-		: "=g" (features)
+		: "=g" (value)
 		:
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-		: "%eax", "%ecx", "%edx"
+		: "eax", "ecx", "edx"
 #else
-		: "%rax", "%rbx", "%rcx", "%rdx"
+		: "rax", "rbx", "rcx", "rdx"
 #endif
 	);
 
 #else /* _MSC_VER && _MSC_VER >= 1400 */
 
 	/* Intel syntax. */
+	__asm {
+		mov eax, 0
+		lzcnt eax, eax
+		setnz al
+		mov value, eax
+	}
+
+	if (value & 0x1)
+		feature_list |= CPU_FEATURE_LZCNT;
+
+	__asm {
+		mov eax, 0
+		tzcnt eax, eax
+		setnz al
+		mov value, eax
+	}
+
+	if (value & 0x1)
+		feature_list |= CPU_FEATURE_TZCNT;
+
 	__asm {
 		mov eax, 1
 		cpuid
-		mov features, edx
+		mov value, edx
 	}
 
 #endif /* _MSC_VER && _MSC_VER >= 1400 */
 
 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
-	cpu_has_sse2 = (features >> 26) & 0x1;
+	if (value & 0x4000000)
+		feature_list |= CPU_FEATURE_SSE2;
 #endif
-	cpu_has_cmov = (features >> 15) & 0x1;
+	if (value & 0x8000)
+		feature_list |= CPU_FEATURE_CMOV;
+
+	cpu_feature_list = feature_list;
 }
 
 static sljit_u8 get_jump_code(sljit_uw type)
@@ -454,13 +546,13 @@ static sljit_u8* generate_near_jump_code(struct sljit_jump *jump, sljit_u8 *code
 	else
 		label_addr = jump->u.target - (sljit_uw)executable_offset;
 
-	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
-
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
 		return generate_far_jump_code(jump, code_ptr);
 #endif
 
+	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
+
 	if (type == SLJIT_JUMP) {
 		if (short_jump)
 			*code_ptr++ = JMP_i8;
@@ -586,32 +678,33 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 
 	jump = compiler->jumps;
 	while (jump) {
-		jump_addr = jump->addr + (sljit_uw)executable_offset;
+		if (jump->flags & (PATCH_MB | PATCH_MW)) {
+			if (jump->flags & JUMP_LABEL)
+				jump_addr = jump->u.label->addr;
+			else
+				jump_addr = jump->u.target;
 
-		if (jump->flags & PATCH_MB) {
-			SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) <= 127);
-			*(sljit_u8*)jump->addr = U8(jump->u.label->addr - (jump_addr + sizeof(sljit_s8)));
-		} else if (jump->flags & PATCH_MW) {
-			if (jump->flags & JUMP_LABEL) {
+			jump_addr -= jump->addr + (sljit_uw)executable_offset;
+
+			if (jump->flags & PATCH_MB) {
+				jump_addr -= sizeof(sljit_s8);
+				SLJIT_ASSERT((sljit_sw)jump_addr >= -128 && (sljit_sw)jump_addr <= 127);
+				*(sljit_u8*)jump->addr = U8(jump_addr);
+			} else {
+				jump_addr -= sizeof(sljit_s32);
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_sw))));
+				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)jump_addr);
 #else
-				SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
-				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))));
-#endif
-			}
-			else {
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_sw))));
-#else
-				SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
-				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.target - (jump_addr + sizeof(sljit_s32))));
+				SLJIT_ASSERT((sljit_sw)jump_addr >= HALFWORD_MIN && (sljit_sw)jump_addr <= HALFWORD_MAX);
+				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)jump_addr);
 #endif
 			}
 		}
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-		else if (jump->flags & PATCH_MD)
-			sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)jump->u.label->addr);
+		else if (jump->flags & PATCH_MD) {
+				SLJIT_ASSERT(jump->flags & JUMP_LABEL);
+				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)jump->u.label->addr);
+		}
 #endif
 
 		jump = jump->next;
@@ -652,9 +745,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
 #ifdef SLJIT_IS_FPU_AVAILABLE
 		return SLJIT_IS_FPU_AVAILABLE;
 #elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
-		if (cpu_has_sse2 == -1)
+		if (cpu_feature_list == 0)
 			get_cpu_features();
-		return cpu_has_sse2;
+		return (cpu_feature_list & CPU_FEATURE_SSE2) != 0;
 #else /* SLJIT_DETECT_SSE2 */
 		return 1;
 #endif /* SLJIT_DETECT_SSE2 */
@@ -662,25 +755,40 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 	case SLJIT_HAS_VIRTUAL_REGISTERS:
 		return 1;
-#endif
+#endif /* SLJIT_CONFIG_X86_32 */
 
 	case SLJIT_HAS_CLZ:
-	case SLJIT_HAS_CMOV:
-		if (cpu_has_cmov == -1)
+		if (cpu_feature_list == 0)
 			get_cpu_features();
-		return cpu_has_cmov;
 
+		return (cpu_feature_list & CPU_FEATURE_LZCNT) ? 1 : 2;
+
+	case SLJIT_HAS_CTZ:
+		if (cpu_feature_list == 0)
+			get_cpu_features();
+
+		return (cpu_feature_list & CPU_FEATURE_TZCNT) ? 1 : 2;
+
+	case SLJIT_HAS_CMOV:
+		if (cpu_feature_list == 0)
+			get_cpu_features();
+		return (cpu_feature_list & CPU_FEATURE_CMOV) != 0;
+
+	case SLJIT_HAS_REV:
+	case SLJIT_HAS_ROT:
 	case SLJIT_HAS_PREFETCH:
+	case SLJIT_HAS_COPY_F32:
+	case SLJIT_HAS_COPY_F64:
 		return 1;
 
 	case SLJIT_HAS_SSE2:
 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
-		if (cpu_has_sse2 == -1)
+		if (cpu_feature_list == 0)
 			get_cpu_features();
-		return cpu_has_sse2;
-#else
+		return (cpu_feature_list & CPU_FEATURE_SSE2) != 0;
+#else /* !SLJIT_DETECT_SSE2 */
 		return 1;
-#endif
+#endif /* SLJIT_DETECT_SSE2 */
 
 	default:
 		return 0;
@@ -767,14 +875,14 @@ static SLJIT_INLINE sljit_s32 emit_endbranch(struct sljit_compiler *compiler)
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
 	FAIL_IF(!inst);
 	INC_SIZE(4);
-	*inst++ = 0xf3;
-	*inst++ = 0x0f;
-	*inst++ = 0x1e;
+	inst[0] = GROUP_F3;
+	inst[1] = GROUP_0F;
+	inst[2] = 0x1e;
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-	*inst = 0xfb;
-#else
-	*inst = 0xfa;
-#endif
+	inst[3] = 0xfb;
+#else /* !SLJIT_CONFIG_X86_32 */
+	inst[3] = 0xfa;
+#endif /* SLJIT_CONFIG_X86_32 */
 #else /* !SLJIT_CONFIG_X86_CET */
 	SLJIT_UNUSED_ARG(compiler);
 #endif /* SLJIT_CONFIG_X86_CET */
@@ -797,13 +905,13 @@ static SLJIT_INLINE sljit_s32 emit_rdssp(struct sljit_compiler *compiler, sljit_
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
 	FAIL_IF(!inst);
 	INC_SIZE(size);
-	*inst++ = 0xf3;
+	*inst++ = GROUP_F3;
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : REX_B);
 #endif
-	*inst++ = 0x0f;
-	*inst++ = 0x1e;
-	*inst = (0x3 << 6) | (0x1 << 3) | (reg_map[reg] & 0x7);
+	inst[0] = GROUP_0F;
+	inst[1] = 0x1e;
+	inst[2] = U8(MOD_REG | (0x1 << 3) | reg_lmap[reg]);
 	return SLJIT_SUCCESS;
 }
 
@@ -821,13 +929,13 @@ static SLJIT_INLINE sljit_s32 emit_incssp(struct sljit_compiler *compiler, sljit
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
 	FAIL_IF(!inst);
 	INC_SIZE(size);
-	*inst++ = 0xf3;
+	*inst++ = GROUP_F3;
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : REX_B);
 #endif
-	*inst++ = 0x0f;
-	*inst++ = 0xae;
-	*inst = (0x3 << 6) | (0x5 << 3) | (reg_map[reg] & 0x7);
+	inst[0] = GROUP_0F;
+	inst[1] = 0xae;
+	inst[2] = (0x3 << 6) | (0x5 << 3) | (reg_map[reg] & 0x7);
 	return SLJIT_SUCCESS;
 }
 
@@ -855,19 +963,7 @@ static SLJIT_INLINE sljit_s32 adjust_shadow_stack(struct sljit_compiler *compile
 	FAIL_IF(emit_rdssp(compiler, TMP_REG1));
 
 	/* Load return address on shadow stack into TMP_REG1. */
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-	SLJIT_ASSERT(reg_map[TMP_REG1] == 5);
-
-	/* Hand code unsupported "mov 0x0(%ebp),%ebp". */
-	inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
-	FAIL_IF(!inst);
-	INC_SIZE(3);
-	*inst++ = 0x8b;
-	*inst++ = 0x6d;
-	*inst = 0;
-#else /* !SLJIT_CONFIG_X86_32 */
 	EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(TMP_REG1), 0);
-#endif /* SLJIT_CONFIG_X86_32 */
 
 	/* Compare return address against TMP_REG1. */
 	FAIL_IF(emit_cmp_binary (compiler, TMP_REG1, 0, src, srcw));
@@ -895,8 +991,8 @@ static SLJIT_INLINE sljit_s32 adjust_shadow_stack(struct sljit_compiler *compile
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
 	FAIL_IF(!inst);
 	INC_SIZE(2);
-	*inst++ = JMP_i8;
-	*inst = size_before_rdssp_inst - compiler->size;
+	inst[0] = JMP_i8;
+	inst[1] = size_before_rdssp_inst - compiler->size;
 
 	*jz_after_cmp_inst = compiler->size - size_jz_after_cmp_inst;
 #else /* !SLJIT_CONFIG_X86_CET || !__SHSTK__ */
@@ -1049,8 +1145,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile
 				inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
 				FAIL_IF(!inst);
 				INC_SIZE(2);
-				*inst++ = REX_W;
-				*inst = CDQ;
+				inst[0] = REX_W;
+				inst[1] = CDQ;
 			}
 #endif
 		}
@@ -1059,14 +1155,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile
 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
 		FAIL_IF(!inst);
 		INC_SIZE(2);
-		*inst++ = GROUP_F7;
-		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
-#else
+		inst[0] = GROUP_F7;
+		inst[1] = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
+#else /* !SLJIT_CONFIG_X86_32 */
 #ifdef _WIN64
 		size = (!compiler->mode32 || op >= SLJIT_DIVMOD_UW) ? 3 : 2;
-#else
+#else /* !_WIN64 */
 		size = (!compiler->mode32) ? 3 : 2;
-#endif
+#endif /* _WIN64 */
 		inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
 		FAIL_IF(!inst);
 		INC_SIZE(size);
@@ -1075,29 +1171,29 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile
 			*inst++ = REX_W | ((op >= SLJIT_DIVMOD_UW) ? REX_B : 0);
 		else if (op >= SLJIT_DIVMOD_UW)
 			*inst++ = REX_B;
-		*inst++ = GROUP_F7;
-		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
-#else
+		inst[0] = GROUP_F7;
+		inst[1] = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
+#else /* !_WIN64 */
 		if (!compiler->mode32)
 			*inst++ = REX_W;
-		*inst++ = GROUP_F7;
-		*inst = MOD_REG | reg_map[SLJIT_R1];
-#endif
-#endif
+		inst[0] = GROUP_F7;
+		inst[1] = MOD_REG | reg_map[SLJIT_R1];
+#endif /* _WIN64 */
+#endif /* SLJIT_CONFIG_X86_32 */
 		switch (op) {
 		case SLJIT_LMUL_UW:
-			*inst |= MUL;
+			inst[1] |= MUL;
 			break;
 		case SLJIT_LMUL_SW:
-			*inst |= IMUL;
+			inst[1] |= IMUL;
 			break;
 		case SLJIT_DIVMOD_UW:
 		case SLJIT_DIV_UW:
-			*inst |= DIV;
+			inst[1] |= DIV;
 			break;
 		case SLJIT_DIVMOD_SW:
 		case SLJIT_DIV_SW:
-			*inst |= IDIV;
+			inst[1] |= IDIV;
 			break;
 		}
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
@@ -1178,8 +1274,8 @@ static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
 				EMIT_MOV(compiler, dst, 0, src, 0);
 			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
 			FAIL_IF(!inst);
-			*inst++ = GROUP_0F;
-			*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
+			inst[0] = GROUP_0F;
+			inst[1] = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
 		}
 		else {
 			if (dst != src)
@@ -1207,8 +1303,8 @@ static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
 		/* src can be memory addr or reg_map[src] < 4 on x86_32 architectures. */
 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
 		FAIL_IF(!inst);
-		*inst++ = GROUP_0F;
-		*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
+		inst[0] = GROUP_0F;
+		inst[1] = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
 	}
 
 	if (dst & SLJIT_MEM) {
@@ -1278,15 +1374,15 @@ static sljit_s32 emit_prefetch(struct sljit_compiler *compiler, sljit_s32 op,
 
 	inst = emit_x86_instruction(compiler, 2, 0, 0, src, srcw);
 	FAIL_IF(!inst);
-	*inst++ = GROUP_0F;
-	*inst++ = PREFETCH;
+	inst[0] = GROUP_0F;
+	inst[1] = PREFETCH;
 
 	if (op == SLJIT_PREFETCH_L1)
-		*inst |= (1 << 3);
+		inst[2] |= (1 << 3);
 	else if (op == SLJIT_PREFETCH_L2)
-		*inst |= (2 << 3);
+		inst[2] |= (2 << 3);
 	else if (op == SLJIT_PREFETCH_L3)
-		*inst |= (3 << 3);
+		inst[2] |= (3 << 3);
 
 	return SLJIT_SUCCESS;
 }
@@ -1326,8 +1422,8 @@ static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
 	else {
 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
 		FAIL_IF(!inst);
-		*inst++ = GROUP_0F;
-		*inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
+		inst[0] = GROUP_0F;
+		inst[1] = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
 	}
 
 	if (dst & SLJIT_MEM) {
@@ -1349,8 +1445,8 @@ static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
 		/* Same input and output */
 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
 		FAIL_IF(!inst);
-		*inst++ = GROUP_F7;
-		*inst |= opcode;
+		inst[0] = GROUP_F7;
+		inst[1] |= opcode;
 		return SLJIT_SUCCESS;
 	}
 
@@ -1358,107 +1454,146 @@ static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
 		EMIT_MOV(compiler, dst, 0, src, srcw);
 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
 		FAIL_IF(!inst);
-		*inst++ = GROUP_F7;
-		*inst |= opcode;
+		inst[0] = GROUP_F7;
+		inst[1] |= opcode;
 		return SLJIT_SUCCESS;
 	}
 
 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
 	FAIL_IF(!inst);
-	*inst++ = GROUP_F7;
-	*inst |= opcode;
-	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
-	return SLJIT_SUCCESS;
-}
-
-static sljit_s32 emit_not_with_flags(struct sljit_compiler *compiler,
-	sljit_s32 dst, sljit_sw dstw,
-	sljit_s32 src, sljit_sw srcw)
-{
-	sljit_u8* inst;
-
-	if (FAST_IS_REG(dst)) {
-		EMIT_MOV(compiler, dst, 0, src, srcw);
-		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
-		FAIL_IF(!inst);
-		*inst++ = GROUP_F7;
-		*inst |= NOT_rm;
-		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
-		FAIL_IF(!inst);
-		*inst = OR_r_rm;
-		return SLJIT_SUCCESS;
-	}
-
-	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
-	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
-	FAIL_IF(!inst);
-	*inst++ = GROUP_F7;
-	*inst |= NOT_rm;
-	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
-	FAIL_IF(!inst);
-	*inst = OR_r_rm;
+	inst[0] = GROUP_F7;
+	inst[1] |= opcode;
 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
 	return SLJIT_SUCCESS;
 }
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 static const sljit_sw emit_clz_arg = 32 + 31;
+static const sljit_sw emit_ctz_arg = 32;
 #endif
 
-static sljit_s32 emit_clz(struct sljit_compiler *compiler, sljit_s32 op_flags,
+static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 is_clz,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src, sljit_sw srcw)
 {
 	sljit_u8* inst;
 	sljit_s32 dst_r;
+	sljit_sw max;
 
-	SLJIT_UNUSED_ARG(op_flags);
-
-	if (cpu_has_cmov == -1)
+	if (cpu_feature_list == 0)
 		get_cpu_features();
 
 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
 
+	if (is_clz ? (cpu_feature_list & CPU_FEATURE_LZCNT) : (cpu_feature_list & CPU_FEATURE_TZCNT)) {
+		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_F3, dst_r, 0, src, srcw);
+		FAIL_IF(!inst);
+		inst[0] = GROUP_0F;
+		inst[1] = is_clz ? LZCNT_r_rm : TZCNT_r_rm;
+
+		if (dst & SLJIT_MEM)
+			EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
+		return SLJIT_SUCCESS;
+	}
+
 	inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
 	FAIL_IF(!inst);
-	*inst++ = GROUP_0F;
-	*inst = BSR_r_rm;
+	inst[0] = GROUP_0F;
+	inst[1] = is_clz ? BSR_r_rm : BSF_r_rm;
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-	if (cpu_has_cmov) {
+	max = is_clz ? (32 + 31) : 32;
+
+	if (cpu_feature_list & CPU_FEATURE_CMOV) {
 		if (dst_r != TMP_REG1) {
-			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 32 + 31);
+			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, max);
 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
 		}
 		else
-			inst = emit_x86_instruction(compiler, 2, dst_r, 0, SLJIT_MEM0(), (sljit_sw)&emit_clz_arg);
+			inst = emit_x86_instruction(compiler, 2, dst_r, 0, SLJIT_MEM0(), is_clz ? (sljit_sw)&emit_clz_arg : (sljit_sw)&emit_ctz_arg);
 
 		FAIL_IF(!inst);
-		*inst++ = GROUP_0F;
-		*inst = CMOVE_r_rm;
+		inst[0] = GROUP_0F;
+		inst[1] = CMOVE_r_rm;
 	}
 	else
-		FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, 32 + 31));
+		FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max));
 
-	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
+	if (is_clz) {
+		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
+		FAIL_IF(!inst);
+		*(inst + 1) |= XOR;
+	}
 #else
-	if (cpu_has_cmov) {
-		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, !(op_flags & SLJIT_32) ? (64 + 63) : (32 + 31));
+	if (is_clz)
+		max = compiler->mode32 ? (32 + 31) : (64 + 63);
+	else
+		max = compiler->mode32 ? 32 : 64;
+
+	if (cpu_feature_list & CPU_FEATURE_CMOV) {
+		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, max);
 
 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
 		FAIL_IF(!inst);
-		*inst++ = GROUP_0F;
-		*inst = CMOVE_r_rm;
+		inst[0] = GROUP_0F;
+		inst[1] = CMOVE_r_rm;
 	}
 	else
-		FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, !(op_flags & SLJIT_32) ? (64 + 63) : (32 + 31)));
+		FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max));
 
-	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_32) ? 63 : 31, dst_r, 0);
+	if (is_clz) {
+		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, max >> 1, dst_r, 0);
+		FAIL_IF(!inst);
+		*(inst + 1) |= XOR;
+	}
 #endif
 
+	if (dst & SLJIT_MEM)
+		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
+	return SLJIT_SUCCESS;
+}
+
+static sljit_s32 emit_bswap(struct sljit_compiler *compiler,
+	sljit_s32 dst, sljit_sw dstw,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_u8 *inst;
+	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
+	sljit_uw size;
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	sljit_u8 rex = 0;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+	if (src != dst_r)
+		EMIT_MOV(compiler, dst_r, 0, src, srcw);
+
+	size = 2;
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	if (!compiler->mode32)
+		rex = REX_W;
+
+	if (reg_map[dst_r] >= 8)
+		rex |= REX_B;
+
+	if (rex != 0)
+		size++;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
 	FAIL_IF(!inst);
-	*(inst + 1) |= XOR;
+	INC_SIZE(size);
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	if (rex != 0)
+		*inst++ = rex;
+
+	inst[0] = GROUP_0F;
+	inst[1] = BSWAP_r | reg_lmap[dst_r];
+#else /* !SLJIT_CONFIG_X86_64 */
+	inst[0] = GROUP_0F;
+	inst[1] = BSWAP_r | reg_map[dst_r];
+#endif /* SLJIT_CONFIG_X86_64 */
 
 	if (dst & SLJIT_MEM)
 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
@@ -1469,10 +1604,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src, sljit_sw srcw)
 {
-	sljit_s32 op_flags = GET_ALL_FLAGS(op);
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 	sljit_s32 dst_is_ereg = 0;
-#endif
+#else /* !SLJIT_CONFIG_X86_32 */
+	sljit_s32 op_flags = GET_ALL_FLAGS(op);
+#endif /* SLJIT_CONFIG_X86_32 */
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
@@ -1483,14 +1619,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 	CHECK_EXTRA_REGS(src, srcw, (void)0);
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	compiler->mode32 = op_flags & SLJIT_32;
-#endif
+#endif /* SLJIT_CONFIG_X86_64 */
 
 	op = GET_OPCODE(op);
 
 	if (op >= SLJIT_MOV && op <= SLJIT_MOV_P) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 		compiler->mode32 = 0;
-#endif
+#endif /* SLJIT_CONFIG_X86_64 */
 
 		if (FAST_IS_REG(src) && src == dst) {
 			if (!TYPE_CAST_NEEDED(op))
@@ -1508,7 +1644,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 					op = SLJIT_MOV_S32;
 			}
 		}
-#endif
+#endif /* SLJIT_CONFIG_X86_64 */
 
 		if (src & SLJIT_IMM) {
 			switch (op) {
@@ -1531,12 +1667,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 			case SLJIT_MOV_S32:
 				srcw = (sljit_s32)srcw;
 				break;
-#endif
+#endif /* SLJIT_CONFIG_X86_64 */
 			}
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 			if (SLJIT_UNLIKELY(dst_is_ereg))
 				return emit_mov(compiler, dst, dstw, src, srcw);
-#endif
+#endif /* SLJIT_CONFIG_X86_32 */
 		}
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
@@ -1544,7 +1680,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
 			dst = TMP_REG1;
 		}
-#endif
+#endif /* SLJIT_CONFIG_X86_32 */
 
 		switch (op) {
 		case SLJIT_MOV:
@@ -1553,7 +1689,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 		case SLJIT_MOV_U32:
 		case SLJIT_MOV_S32:
 		case SLJIT_MOV32:
-#endif
+#endif /* SLJIT_CONFIG_X86_32 */
 			EMIT_MOV(compiler, dst, dstw, src, srcw);
 			break;
 		case SLJIT_MOV_U8:
@@ -1580,24 +1716,22 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 			EMIT_MOV(compiler, dst, dstw, src, srcw);
 			compiler->mode32 = 0;
 			break;
-#endif
+#endif /* SLJIT_CONFIG_X86_64 */
 		}
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
 			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
-#endif
+#endif /* SLJIT_CONFIG_X86_32 */
 		return SLJIT_SUCCESS;
 	}
 
 	switch (op) {
-	case SLJIT_NOT:
-		if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_Z))
-			return emit_not_with_flags(compiler, dst, dstw, src, srcw);
-		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);
-
 	case SLJIT_CLZ:
-		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
+	case SLJIT_CTZ:
+		return emit_clz_ctz(compiler, (op == SLJIT_CLZ), dst, dstw, src, srcw);
+	case SLJIT_REV:
+		return emit_bswap(compiler, dst, dstw, src, srcw);
 	}
 
 	return SLJIT_SUCCESS;
@@ -1795,14 +1929,14 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler,
 	if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
 		FAIL_IF(!inst);
-		*inst++ = GROUP_0F;
-		*inst = IMUL_r_rm;
+		inst[0] = GROUP_0F;
+		inst[1] = IMUL_r_rm;
 	}
 	else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
 		FAIL_IF(!inst);
-		*inst++ = GROUP_0F;
-		*inst = IMUL_r_rm;
+		inst[0] = GROUP_0F;
+		inst[1] = IMUL_r_rm;
 	}
 	else if (src1 & SLJIT_IMM) {
 		if (src2 & SLJIT_IMM) {
@@ -1846,8 +1980,8 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler,
 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
 			FAIL_IF(!inst);
-			*inst++ = GROUP_0F;
-			*inst = IMUL_r_rm;
+			inst[0] = GROUP_0F;
+			inst[1] = IMUL_r_rm;
 		}
 #endif
 	}
@@ -1889,8 +2023,8 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler,
 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
 			FAIL_IF(!inst);
-			*inst++ = GROUP_0F;
-			*inst = IMUL_r_rm;
+			inst[0] = GROUP_0F;
+			inst[1] = IMUL_r_rm;
 		}
 #endif
 	}
@@ -1901,8 +2035,8 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler,
 		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
 		FAIL_IF(!inst);
-		*inst++ = GROUP_0F;
-		*inst = IMUL_r_rm;
+		inst[0] = GROUP_0F;
+		inst[1] = IMUL_r_rm;
 	}
 
 	if (dst & SLJIT_MEM)
@@ -2135,6 +2269,9 @@ static sljit_s32 emit_shift(struct sljit_compiler *compiler,
 	sljit_s32 src1, sljit_sw src1w,
 	sljit_s32 src2, sljit_sw src2w)
 {
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	sljit_s32 mode32;
+#endif
 	sljit_u8* inst;
 
 	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
@@ -2174,41 +2311,62 @@ static sljit_s32 emit_shift(struct sljit_compiler *compiler,
 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
 		FAIL_IF(!inst);
 		*inst |= mode;
-		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
+		return emit_mov(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
 	}
-	else if (FAST_IS_REG(dst) && dst != src2 && dst != TMP_REG1 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
+
+	if (FAST_IS_REG(dst) && dst != src2 && dst != TMP_REG1 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
 		if (src1 != dst)
 			EMIT_MOV(compiler, dst, 0, src1, src1w);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		mode32 = compiler->mode32;
+		compiler->mode32 = 0;
+#endif
 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		compiler->mode32 = mode32;
+#endif
 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
 		FAIL_IF(!inst);
 		*inst |= mode;
-		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
-	}
-	else {
-		/* This case is complex since ecx itself may be used for
-		   addressing, and this case must be supported as well. */
-		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
-		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
-		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
-		FAIL_IF(!inst);
-		*inst |= mode;
-		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
-#else
-		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
-		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
-		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
-		FAIL_IF(!inst);
-		*inst |= mode;
-		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		compiler->mode32 = 0;
 #endif
-		if (dst != TMP_REG1)
-			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
+		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		compiler->mode32 = mode32;
+#endif
+		return SLJIT_SUCCESS;
 	}
 
+	/* This case is complex since ecx itself may be used for
+	   addressing, and this case must be supported as well. */
+	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+	EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
+#else /* !SLJIT_CONFIG_X86_32 */
+	mode32 = compiler->mode32;
+	compiler->mode32 = 0;
+	EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
+	compiler->mode32 = mode32;
+#endif /* SLJIT_CONFIG_X86_32 */
+
+	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
+	inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
+	FAIL_IF(!inst);
+	*inst |= mode;
+
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
+#else /* !SLJIT_CONFIG_X86_32 */
+	compiler->mode32 = 0;
+	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
+	compiler->mode32 = mode32;
+#endif /* SLJIT_CONFIG_X86_32 */
+
+	if (dst != TMP_REG1)
+		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
+
 	return SLJIT_SUCCESS;
 }
 
@@ -2221,12 +2379,13 @@ static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler,
 	/* The CPU does not set flags if the shift count is 0. */
 	if (src2 & SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-		if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
+		src2w &= compiler->mode32 ? 0x1f : 0x3f;
+#else /* !SLJIT_CONFIG_X86_64 */
+		src2w &= 0x1f;
+#endif /* SLJIT_CONFIG_X86_64 */
+		if (src2w != 0)
 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
-#else
-		if ((src2w & 0x1f) != 0)
-			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
-#endif
+
 		if (!set_flags)
 			return emit_mov(compiler, dst, dstw, src1, src1w);
 		/* OR dst, src, 0 */
@@ -2305,17 +2464,33 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 		return emit_cum_binary(compiler, BINARY_OPCODE(OR),
 			dst, dstw, src1, src1w, src2, src2w);
 	case SLJIT_XOR:
+		if (!HAS_FLAGS(op)) {
+			if ((src2 & SLJIT_IMM) && src2w == -1)
+				return emit_unary(compiler, NOT_rm, dst, dstw, src1, src1w);
+			if ((src1 & SLJIT_IMM) && src1w == -1)
+				return emit_unary(compiler, NOT_rm, dst, dstw, src2, src2w);
+		}
+
 		return emit_cum_binary(compiler, BINARY_OPCODE(XOR),
 			dst, dstw, src1, src1w, src2, src2w);
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 		return emit_shift_with_flags(compiler, SHL, HAS_FLAGS(op),
 			dst, dstw, src1, src1w, src2, src2w);
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 		return emit_shift_with_flags(compiler, SHR, HAS_FLAGS(op),
 			dst, dstw, src1, src1w, src2, src2w);
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
 		return emit_shift_with_flags(compiler, SAR, HAS_FLAGS(op),
 			dst, dstw, src1, src1w, src2, src2w);
+	case SLJIT_ROTL:
+		return emit_shift_with_flags(compiler, ROL, 0,
+			dst, dstw, src1, src1w, src2, src2w);
+	case SLJIT_ROTR:
+		return emit_shift_with_flags(compiler, ROR, 0,
+			dst, dstw, src1, src1w, src2, src2w);
 	}
 
 	return SLJIT_SUCCESS;
@@ -2350,6 +2525,200 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil
 	return emit_test_binary(compiler, src1, src1w, src2, src2w);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst_reg,
+	sljit_s32 src1_reg,
+	sljit_s32 src2_reg,
+	sljit_s32 src3, sljit_sw src3w)
+{
+	sljit_s32 is_rotate, is_left, move_src1;
+	sljit_u8* inst;
+	sljit_sw src1w = 0;
+	sljit_sw dstw = 0;
+	/* The whole register must be saved even for 32 bit operations. */
+	sljit_u8 restore_ecx = 0;
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+	sljit_sw src2w = 0;
+	sljit_s32 restore_sp4 = 0;
+#endif /* SLJIT_CONFIG_X86_32 */
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, dst_reg, src1_reg, src2_reg, src3, src3w));
+	ADJUST_LOCAL_OFFSET(src3, src3w);
+
+	CHECK_EXTRA_REGS(dst_reg, dstw, (void)0);
+	CHECK_EXTRA_REGS(src3, src3w, (void)0);
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	compiler->mode32 = op & SLJIT_32;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+	if (src3 & SLJIT_IMM) {
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+		src3w &= 0x1f;
+#else /* !SLJIT_CONFIG_X86_32 */
+		src3w &= (op & SLJIT_32) ? 0x1f : 0x3f;
+#endif /* SLJIT_CONFIG_X86_32 */
+
+		if (src3w == 0)
+			return SLJIT_SUCCESS;
+	}
+
+	is_left = (GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_MSHL);
+
+	is_rotate = (src1_reg == src2_reg);
+	CHECK_EXTRA_REGS(src1_reg, src1w, (void)0);
+	CHECK_EXTRA_REGS(src2_reg, src2w, (void)0);
+
+	if (is_rotate)
+		return emit_shift(compiler, is_left ? ROL : ROR, dst_reg, dstw, src1_reg, src1w, src3, src3w);
+
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+	if (src2_reg & SLJIT_MEM) {
+		EMIT_MOV(compiler, TMP_REG1, 0, src2_reg, src2w);
+		src2_reg = TMP_REG1;
+	}
+#endif /* SLJIT_CONFIG_X86_32 */
+
+	if (dst_reg == SLJIT_PREF_SHIFT_REG && !(src3 & SLJIT_IMM) && (src3 != SLJIT_PREF_SHIFT_REG || src1_reg != SLJIT_PREF_SHIFT_REG)) {
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w);
+		src1_reg = TMP_REG1;
+		src1w = 0;
+#else /* !SLJIT_CONFIG_X86_64 */
+		if (src2_reg != TMP_REG1) {
+			EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w);
+			src1_reg = TMP_REG1;
+			src1w = 0;
+		} else if ((src1_reg & SLJIT_MEM) || src1_reg == SLJIT_PREF_SHIFT_REG) {
+			restore_sp4 = (src3 == SLJIT_R0) ? SLJIT_R1 : SLJIT_R0;
+			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), restore_sp4, 0);
+			EMIT_MOV(compiler, restore_sp4, 0, src1_reg, src1w);
+			src1_reg = restore_sp4;
+			src1w = 0;
+		} else {
+			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), src1_reg, 0);
+			restore_sp4 = src1_reg;
+		}
+#endif /* SLJIT_CONFIG_X86_64 */
+
+		if (src3 != SLJIT_PREF_SHIFT_REG)
+			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src3, src3w);
+	} else {
+		if (src2_reg == SLJIT_PREF_SHIFT_REG && !(src3 & SLJIT_IMM) && src3 != SLJIT_PREF_SHIFT_REG) {
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+			compiler->mode32 = 0;
+#endif /* SLJIT_CONFIG_X86_64 */
+			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+			compiler->mode32 = op & SLJIT_32;
+#endif /* SLJIT_CONFIG_X86_64 */
+			src2_reg = TMP_REG1;
+			restore_ecx = 1;
+		}
+
+		move_src1 = 0;
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		if (dst_reg != src1_reg) {
+			if (dst_reg != src3) {
+				EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w);
+				src1_reg = dst_reg;
+				src1w = 0;
+			} else
+				move_src1 = 1;
+		}
+#else /* !SLJIT_CONFIG_X86_64 */
+		if (dst_reg & SLJIT_MEM) {
+			if (src2_reg != TMP_REG1) {
+				EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w);
+				src1_reg = TMP_REG1;
+				src1w = 0;
+			} else if ((src1_reg & SLJIT_MEM) || src1_reg == SLJIT_PREF_SHIFT_REG) {
+				restore_sp4 = (src3 == SLJIT_R0) ? SLJIT_R1 : SLJIT_R0;
+				EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), restore_sp4, 0);
+				EMIT_MOV(compiler, restore_sp4, 0, src1_reg, src1w);
+				src1_reg = restore_sp4;
+				src1w = 0;
+			} else {
+				EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), src1_reg, 0);
+				restore_sp4 = src1_reg;
+			}
+		} else if (dst_reg != src1_reg) {
+			if (dst_reg != src3) {
+				EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w);
+				src1_reg = dst_reg;
+				src1w = 0;
+			} else
+				move_src1 = 1;
+		}
+#endif /* SLJIT_CONFIG_X86_64 */
+
+		if (!(src3 & SLJIT_IMM) && src3 != SLJIT_PREF_SHIFT_REG) {
+			if (!restore_ecx) {
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+				compiler->mode32 = 0;
+				EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
+				compiler->mode32 = op & SLJIT_32;
+				restore_ecx = 1;
+#else /* !SLJIT_CONFIG_X86_64 */
+				if (src1_reg != TMP_REG1 && src2_reg != TMP_REG1) {
+					EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
+					restore_ecx = 1;
+				} else {
+					EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
+					restore_ecx = 2;
+				}
+#endif /* SLJIT_CONFIG_X86_64 */
+			}
+			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src3, src3w);
+		}
+
+		if (move_src1) {
+			EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w);
+			src1_reg = dst_reg;
+			src1w = 0;
+		}
+	}
+
+	inst = emit_x86_instruction(compiler, 2, src2_reg, 0, src1_reg, src1w);
+	FAIL_IF(!inst);
+	inst[0] = GROUP_0F;
+
+	if (src3 & SLJIT_IMM) {
+		inst[1] = U8((is_left ? SHLD : SHRD) - 1);
+
+		/* Immediate argument is added separately. */
+		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
+		FAIL_IF(!inst);
+		INC_SIZE(1);
+		*inst = U8(src3w);
+	} else
+		inst[1] = U8(is_left ? SHLD : SHRD);
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	if (restore_ecx) {
+		compiler->mode32 = 0;
+		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
+	}
+
+	if (src1_reg != dst_reg) {
+		compiler->mode32 = op & SLJIT_32;
+		return emit_mov(compiler, dst_reg, dstw, src1_reg, 0);
+	}
+#else /* !SLJIT_CONFIG_X86_64 */
+	if (restore_ecx)
+		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, restore_ecx == 1 ? TMP_REG1 : SLJIT_MEM1(SLJIT_SP), 0);
+
+	if (src1_reg != dst_reg)
+		EMIT_MOV(compiler, dst_reg, dstw, src1_reg, 0);
+
+	if (restore_sp4)
+		return emit_mov(compiler, restore_sp4, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32));
+#endif /* SLJIT_CONFIG_X86_64 */
+
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -2377,6 +2746,25 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *comp
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_dst(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op_dst(compiler, op, dst, dstw));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	CHECK_EXTRA_REGS(dst, dstw, (void)0);
+
+	switch (op) {
+	case SLJIT_FAST_ENTER:
+		return emit_fast_enter(compiler, dst, dstw);
+	case SLJIT_GET_RETURN_ADDRESS:
+		return sljit_emit_get_return_address(compiler, dst, dstw);
+	}
+
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
@@ -2442,8 +2830,8 @@ static sljit_s32 emit_sse2(struct sljit_compiler *compiler, sljit_u8 opcode,
 
 	inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
 	FAIL_IF(!inst);
-	*inst++ = GROUP_0F;
-	*inst = opcode;
+	inst[0] = GROUP_0F;
+	inst[1] = opcode;
 	return SLJIT_SUCCESS;
 }
 
@@ -2454,8 +2842,8 @@ static sljit_s32 emit_sse2_logic(struct sljit_compiler *compiler, sljit_u8 opcod
 
 	inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
 	FAIL_IF(!inst);
-	*inst++ = GROUP_0F;
-	*inst = opcode;
+	inst[0] = GROUP_0F;
+	inst[1] = opcode;
 	return SLJIT_SUCCESS;
 }
 
@@ -2475,9 +2863,12 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_comp
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src, sljit_sw srcw)
 {
-	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
+	sljit_s32 dst_r;
 	sljit_u8 *inst;
 
+	CHECK_EXTRA_REGS(dst, dstw, (void)0);
+	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
+
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	if (GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)
 		compiler->mode32 = 0;
@@ -2485,8 +2876,8 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_comp
 
 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_32) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
 	FAIL_IF(!inst);
-	*inst++ = GROUP_0F;
-	*inst = CVTTSD2SI_r_xm;
+	inst[0] = GROUP_0F;
+	inst[1] = CVTTSD2SI_r_xm;
 
 	if (dst & SLJIT_MEM)
 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
@@ -2500,6 +2891,8 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_comp
 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
 	sljit_u8 *inst;
 
+	CHECK_EXTRA_REGS(src, srcw, (void)0);
+
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
 		compiler->mode32 = 0;
@@ -2517,8 +2910,8 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_comp
 
 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_32) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
 	FAIL_IF(!inst);
-	*inst++ = GROUP_0F;
-	*inst = CVTSI2SD_x_rm;
+	inst[0] = GROUP_0F;
+	inst[1] = CVTSI2SD_x_rm;
 
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	compiler->mode32 = 1;
@@ -2534,9 +2927,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compile
 {
 	switch (GET_FLAG_TYPE(op)) {
 	case SLJIT_ORDERED_LESS:
-	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
 	case SLJIT_UNORDERED_OR_GREATER:
-	case SLJIT_ORDERED_LESS_EQUAL:
 		if (!FAST_IS_REG(src2)) {
 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src2, src2w));
 			src2 = TMP_FREG;
@@ -2701,9 +3092,8 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compi
 
 	inst = (sljit_u8*)ensure_buf(compiler, 2);
 	PTR_FAIL_IF(!inst);
-
-	*inst++ = 0;
-	*inst++ = 0;
+	inst[0] = 0;
+	inst[1] = 0;
 
 	return label;
 }
@@ -2731,8 +3121,8 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compile
 	inst = (sljit_u8*)ensure_buf(compiler, 2);
 	PTR_FAIL_IF_NULL(inst);
 
-	*inst++ = 0;
-	*inst++ = 1;
+	inst[0] = 0;
+	inst[1] = 1;
 	return jump;
 }
 
@@ -2763,8 +3153,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compi
 		inst = (sljit_u8*)ensure_buf(compiler, 2);
 		FAIL_IF_NULL(inst);
 
-		*inst++ = 0;
-		*inst++ = 1;
+		inst[0] = 0;
+		inst[1] = 1;
 	}
 	else {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
@@ -2773,8 +3163,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compi
 #endif
 		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
 		FAIL_IF(!inst);
-		*inst++ = GROUP_FF;
-		*inst = U8(*inst | ((type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm));
+		inst[0] = GROUP_FF;
+		inst[1] = U8(inst[1] | ((type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm));
 	}
 	return SLJIT_SUCCESS;
 }
@@ -2784,10 +3174,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co
 	sljit_s32 type)
 {
 	sljit_u8 *inst;
-	sljit_u8 cond_set = 0;
+	sljit_u8 cond_set;
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	sljit_s32 reg;
-#endif
+#endif /* SLJIT_CONFIG_X86_64 */
 	/* ADJUST_LOCAL_OFFSET and CHECK_EXTRA_REGS might overwrite these values. */
 	sljit_s32 dst_save = dst;
 	sljit_sw dstw_save = dstw;
@@ -2807,13 +3197,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co
 		FAIL_IF(!inst);
 		INC_SIZE(4 + 3);
 		/* Set low register to conditional flag. */
-		*inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
-		*inst++ = GROUP_0F;
-		*inst++ = cond_set;
-		*inst++ = MOD_REG | reg_lmap[TMP_REG1];
-		*inst++ = U8(REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B));
-		*inst++ = OR_rm8_r8;
-		*inst++ = U8(MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst]);
+		inst[0] = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
+		inst[1] = GROUP_0F;
+		inst[2] = cond_set;
+		inst[3] = MOD_REG | reg_lmap[TMP_REG1];
+		inst[4] = U8(REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B));
+		inst[5] = OR_rm8_r8;
+		inst[6] = U8(MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst]);
 		return SLJIT_SUCCESS;
 	}
 
@@ -2823,15 +3213,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co
 	FAIL_IF(!inst);
 	INC_SIZE(4 + 4);
 	/* Set low register to conditional flag. */
-	*inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
-	*inst++ = GROUP_0F;
-	*inst++ = cond_set;
-	*inst++ = MOD_REG | reg_lmap[reg];
-	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
+	inst[0] = (reg_map[reg] <= 7) ? REX : REX_B;
+	inst[1] = GROUP_0F;
+	inst[2] = cond_set;
+	inst[3] = MOD_REG | reg_lmap[reg];
+	inst[4] = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
 	/* The movzx instruction does not affect flags. */
-	*inst++ = GROUP_0F;
-	*inst++ = MOVZX_r_rm8;
-	*inst = U8(MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg]);
+	inst[5] = GROUP_0F;
+	inst[6] = MOVZX_r_rm8;
+	inst[7] = U8(MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg]);
 
 	if (reg != TMP_REG1)
 		return SLJIT_SUCCESS;
@@ -2844,110 +3234,52 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co
 	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
 
-#else
+#else /* !SLJIT_CONFIG_X86_64 */
+	SLJIT_ASSERT(reg_map[TMP_REG1] < 4);
+
 	/* The SLJIT_CONFIG_X86_32 code path starts here. */
-	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
-		if (reg_map[dst] <= 4) {
-			/* Low byte is accessible. */
-			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
-			FAIL_IF(!inst);
-			INC_SIZE(3 + 3);
-			/* Set low byte to conditional flag. */
-			*inst++ = GROUP_0F;
-			*inst++ = cond_set;
-			*inst++ = U8(MOD_REG | reg_map[dst]);
-
-			*inst++ = GROUP_0F;
-			*inst++ = MOVZX_r_rm8;
-			*inst = U8(MOD_REG | (reg_map[dst] << 3) | reg_map[dst]);
-			return SLJIT_SUCCESS;
-		}
-
-		/* Low byte is not accessible. */
-		if (cpu_has_cmov == -1)
-			get_cpu_features();
-
-		if (cpu_has_cmov) {
-			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
-			/* a xor reg, reg operation would overwrite the flags. */
-			EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);
-
-			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
-			FAIL_IF(!inst);
-			INC_SIZE(3);
-
-			*inst++ = GROUP_0F;
-			/* cmovcc = setcc - 0x50. */
-			*inst++ = U8(cond_set - 0x50);
-			*inst++ = U8(MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1]);
-			return SLJIT_SUCCESS;
-		}
-
-		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
+	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst) && reg_map[dst] <= 4) {
+		/* Low byte is accessible. */
+		inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
 		FAIL_IF(!inst);
-		INC_SIZE(1 + 3 + 3 + 1);
-		*inst++ = U8(XCHG_EAX_r | reg_map[TMP_REG1]);
-		/* Set al to conditional flag. */
-		*inst++ = GROUP_0F;
-		*inst++ = cond_set;
-		*inst++ = MOD_REG | 0 /* eax */;
+		INC_SIZE(3 + 3);
+		/* Set low byte to conditional flag. */
+		inst[0] = GROUP_0F;
+		inst[1] = cond_set;
+		inst[2] = U8(MOD_REG | reg_map[dst]);
 
-		*inst++ = GROUP_0F;
-		*inst++ = MOVZX_r_rm8;
-		*inst++ = U8(MOD_REG | (reg_map[dst] << 3) | 0 /* eax */);
-		*inst++ = U8(XCHG_EAX_r | reg_map[TMP_REG1]);
+		inst[3] = GROUP_0F;
+		inst[4] = MOVZX_r_rm8;
+		inst[5] = U8(MOD_REG | (reg_map[dst] << 3) | reg_map[dst]);
 		return SLJIT_SUCCESS;
 	}
 
 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && reg_map[dst] <= 4) {
-		SLJIT_ASSERT(reg_map[SLJIT_R0] == 0);
+		inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 2);
+		FAIL_IF(!inst);
+		INC_SIZE(3 + 2);
 
-		if (dst != SLJIT_R0) {
-			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
-			FAIL_IF(!inst);
-			INC_SIZE(1 + 3 + 2 + 1);
-			/* Set low register to conditional flag. */
-			*inst++ = U8(XCHG_EAX_r | reg_map[TMP_REG1]);
-			*inst++ = GROUP_0F;
-			*inst++ = cond_set;
-			*inst++ = MOD_REG | 0 /* eax */;
-			*inst++ = OR_rm8_r8;
-			*inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
-			*inst++ = U8(XCHG_EAX_r | reg_map[TMP_REG1]);
-		}
-		else {
-			inst = (sljit_u8*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
-			FAIL_IF(!inst);
-			INC_SIZE(2 + 3 + 2 + 2);
-			/* Set low register to conditional flag. */
-			*inst++ = XCHG_r_rm;
-			*inst++ = U8(MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1]);
-			*inst++ = GROUP_0F;
-			*inst++ = cond_set;
-			*inst++ = MOD_REG | 1 /* ecx */;
-			*inst++ = OR_rm8_r8;
-			*inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
-			*inst++ = XCHG_r_rm;
-			*inst++ = U8(MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1]);
-		}
+		/* Set low byte to conditional flag. */
+		inst[0] = GROUP_0F;
+		inst[1] = cond_set;
+		inst[2] = U8(MOD_REG | reg_map[TMP_REG1]);
+
+		inst[3] = OR_rm8_r8;
+		inst[4] = U8(MOD_REG | (reg_map[TMP_REG1] << 3) | reg_map[dst]);
 		return SLJIT_SUCCESS;
 	}
 
-	/* Set TMP_REG1 to the bit. */
-	inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
+	inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
 	FAIL_IF(!inst);
-	INC_SIZE(1 + 3 + 3 + 1);
-	*inst++ = U8(XCHG_EAX_r | reg_map[TMP_REG1]);
-	/* Set al to conditional flag. */
-	*inst++ = GROUP_0F;
-	*inst++ = cond_set;
-	*inst++ = MOD_REG | 0 /* eax */;
+	INC_SIZE(3 + 3);
+	/* Set low byte to conditional flag. */
+	inst[0] = GROUP_0F;
+	inst[1] = cond_set;
+	inst[2] = U8(MOD_REG | reg_map[TMP_REG1]);
 
-	*inst++ = GROUP_0F;
-	*inst++ = MOVZX_r_rm8;
-	*inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;
-
-	*inst++ = U8(XCHG_EAX_r | reg_map[TMP_REG1]);
+	inst[3] = GROUP_0F;
+	inst[4] = MOVZX_r_rm8;
+	inst[5] = U8(MOD_REG | (reg_map[TMP_REG1] << 3) | reg_map[TMP_REG1]);
 
 	if (GET_OPCODE(op) < SLJIT_ADD)
 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
@@ -2967,7 +3299,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
 	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-	dst_reg &= ~SLJIT_32;
+	type &= ~SLJIT_32;
 
 	if (!sljit_has_cpu_feature(SLJIT_HAS_CMOV) || (dst_reg >= SLJIT_R3 && dst_reg <= SLJIT_S3))
 		return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);
@@ -2980,8 +3312,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
 	CHECK_EXTRA_REGS(src, srcw, (void)0);
 
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-	compiler->mode32 = dst_reg & SLJIT_32;
-	dst_reg &= ~SLJIT_32;
+	compiler->mode32 = type & SLJIT_32;
+	type &= ~SLJIT_32;
 #endif
 
 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
@@ -2992,8 +3324,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
 
 	inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
 	FAIL_IF(!inst);
-	*inst++ = GROUP_0F;
-	*inst = U8(get_jump_code((sljit_uw)type) - 0x40);
+	inst[0] = GROUP_0F;
+	inst[1] = U8(get_jump_code((sljit_uw)type) - 0x40);
 	return SLJIT_SUCCESS;
 }
 
@@ -3060,8 +3392,8 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compi
 	inst = (sljit_u8*)ensure_buf(compiler, 2);
 	PTR_FAIL_IF(!inst);
 
-	*inst++ = 0;
-	*inst++ = 2;
+	inst[0] = 0;
+	inst[1] = 2;
 
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	if (dst & SLJIT_MEM)
@@ -3114,8 +3446,8 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_put_label* sljit_emit_put_label(struct slj
 	inst = (sljit_u8*)ensure_buf(compiler, 2);
 	PTR_FAIL_IF(!inst);
 
-	*inst++ = 0;
-	*inst++ = 3;
+	inst[0] = 0;
+	inst[1] = 3;
 
 	return put_label;
 }
diff --git a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitProtExecAllocator.c b/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitProtExecAllocator.c
deleted file mode 100644
index 915411fbed..0000000000
--- a/waterbox/ares64/ares/thirdparty/sljit/sljit_src/sljitProtExecAllocator.c
+++ /dev/null
@@ -1,474 +0,0 @@
-/*
- *    Stack-less Just-In-Time compiler
- *
- *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without modification, are
- * permitted provided that the following conditions are met:
- *
- *   1. Redistributions of source code must retain the above copyright notice, this list of
- *      conditions and the following disclaimer.
- *
- *   2. Redistributions in binary form must reproduce the above copyright notice, this list
- *      of conditions and the following disclaimer in the documentation and/or other materials
- *      provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
- * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
-   This file contains a simple executable memory allocator
-
-   It is assumed, that executable code blocks are usually medium (or sometimes
-   large) memory blocks, and the allocator is not too frequently called (less
-   optimized than other allocators). Thus, using it as a generic allocator is
-   not suggested.
-
-   How does it work:
-     Memory is allocated in continuous memory areas called chunks by alloc_chunk()
-     Chunk format:
-     [ block ][ block ] ... [ block ][ block terminator ]
-
-   All blocks and the block terminator is started with block_header. The block
-   header contains the size of the previous and the next block. These sizes
-   can also contain special values.
-     Block size:
-       0 - The block is a free_block, with a different size member.
-       1 - The block is a block terminator.
-       n - The block is used at the moment, and the value contains its size.
-     Previous block size:
-       0 - This is the first block of the memory chunk.
-       n - The size of the previous block.
-
-   Using these size values we can go forward or backward on the block chain.
-   The unused blocks are stored in a chain list pointed by free_blocks. This
-   list is useful if we need to find a suitable memory area when the allocator
-   is called.
-
-   When a block is freed, the new free block is connected to its adjacent free
-   blocks if possible.
-
-     [ free block ][ used block ][ free block ]
-   and "used block" is freed, the three blocks are connected together:
-     [           one big free block           ]
-*/
-
-/* --------------------------------------------------------------------- */
-/*  System (OS) functions                                                */
-/* --------------------------------------------------------------------- */
-
-/* 64 KByte. */
-#define CHUNK_SIZE	(sljit_uw)0x10000
-
-struct chunk_header {
-	void *executable;
-};
-
-/*
-   alloc_chunk / free_chunk :
-     * allocate executable system memory chunks
-     * the size is always divisible by CHUNK_SIZE
-   SLJIT_ALLOCATOR_LOCK / SLJIT_ALLOCATOR_UNLOCK :
-     * provided as part of sljitUtils
-     * only the allocator requires this lock, sljit is fully thread safe
-       as it only uses local variables
-*/
-
-#ifndef __NetBSD__
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <string.h>
-
-#ifndef O_NOATIME
-#define O_NOATIME 0
-#endif
-
-/* this is a linux extension available since kernel 3.11 */
-#ifndef O_TMPFILE
-#define O_TMPFILE 020200000
-#endif
-
-#ifndef _GNU_SOURCE
-char *secure_getenv(const char *name);
-int mkostemp(char *template, int flags);
-#endif
-
-static SLJIT_INLINE int create_tempfile(void)
-{
-	int fd;
-	char tmp_name[256];
-	size_t tmp_name_len = 0;
-	char *dir;
-	struct stat st;
-#if defined(SLJIT_SINGLE_THREADED) && SLJIT_SINGLE_THREADED
-	mode_t mode;
-#endif
-
-#ifdef HAVE_MEMFD_CREATE
-	/* this is a GNU extension, make sure to use -D_GNU_SOURCE */
-	fd = memfd_create("sljit", MFD_CLOEXEC);
-	if (fd != -1) {
-		fchmod(fd, 0);
-		return fd;
-	}
-#endif
-
-	dir = secure_getenv("TMPDIR");
-
-	if (dir) {
-		tmp_name_len = strlen(dir);
-		if (tmp_name_len > 0 && tmp_name_len < sizeof(tmp_name)) {
-			if ((stat(dir, &st) == 0) && S_ISDIR(st.st_mode))
-				strcpy(tmp_name, dir);
-		}
-	}
-
-#ifdef P_tmpdir
-	if (!tmp_name_len) {
-		tmp_name_len = strlen(P_tmpdir);
-		if (tmp_name_len > 0 && tmp_name_len < sizeof(tmp_name))
-			strcpy(tmp_name, P_tmpdir);
-	}
-#endif
-	if (!tmp_name_len) {
-		strcpy(tmp_name, "/tmp");
-		tmp_name_len = 4;
-	}
-
-	SLJIT_ASSERT(tmp_name_len > 0 && tmp_name_len < sizeof(tmp_name));
-
-	if (tmp_name[tmp_name_len - 1] == '/')
-		tmp_name[--tmp_name_len] = '\0';
-
-#ifdef __linux__
-	/*
-	 * the previous trimming might had left an empty string if TMPDIR="/"
-	 * so work around the problem below
-	 */
-	fd = open(tmp_name_len ? tmp_name : "/",
-		O_TMPFILE | O_EXCL | O_RDWR | O_NOATIME | O_CLOEXEC, 0);
-	if (fd != -1)
-		return fd;
-#endif
-
-	if (tmp_name_len + 7 >= sizeof(tmp_name))
-		return -1;
-
-	strcpy(tmp_name + tmp_name_len, "/XXXXXX");
-#if defined(SLJIT_SINGLE_THREADED) && SLJIT_SINGLE_THREADED
-	mode = umask(0777);
-#endif
-	fd = mkostemp(tmp_name, O_CLOEXEC | O_NOATIME);
-#if defined(SLJIT_SINGLE_THREADED) && SLJIT_SINGLE_THREADED
-	umask(mode);
-#else
-	fchmod(fd, 0);
-#endif
-
-	if (fd == -1)
-		return -1;
-
-	if (unlink(tmp_name)) {
-		close(fd);
-		return -1;
-	}
-
-	return fd;
-}
-
-static SLJIT_INLINE struct chunk_header* alloc_chunk(sljit_uw size)
-{
-	struct chunk_header *retval;
-	int fd;
-
-	fd = create_tempfile();
-	if (fd == -1)
-		return NULL;
-
-	if (ftruncate(fd, (off_t)size)) {
-		close(fd);
-		return NULL;
-	}
-
-	retval = (struct chunk_header *)mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-
-	if (retval == MAP_FAILED) {
-		close(fd);
-		return NULL;
-	}
-
-	retval->executable = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0);
-
-	if (retval->executable == MAP_FAILED) {
-		munmap((void *)retval, size);
-		close(fd);
-		return NULL;
-	}
-
-	close(fd);
-	return retval;
-}
-#else
-/*
- * MAP_REMAPDUP is a NetBSD extension available sinde 8.0, make sure to
- * adjust your feature macros (ex: -D_NETBSD_SOURCE) as needed
- */
-static SLJIT_INLINE struct chunk_header* alloc_chunk(sljit_uw size)
-{
-	struct chunk_header *retval;
-
-	retval = (struct chunk_header *)mmap(NULL, size,
-			PROT_READ | PROT_WRITE | PROT_MPROTECT(PROT_EXEC),
-			MAP_ANON | MAP_SHARED, -1, 0);
-
-	if (retval == MAP_FAILED)
-		return NULL;
-
-	retval->executable = mremap(retval, size, NULL, size, MAP_REMAPDUP);
-	if (retval->executable == MAP_FAILED) {
-		munmap((void *)retval, size);
-		return NULL;
-	}
-
-	if (mprotect(retval->executable, size, PROT_READ | PROT_EXEC) == -1) {
-		munmap(retval->executable, size);
-		munmap((void *)retval, size);
-		return NULL;
-	}
-
-	return retval;
-}
-#endif /* NetBSD */
-
-static SLJIT_INLINE void free_chunk(void *chunk, sljit_uw size)
-{
-	struct chunk_header *header = ((struct chunk_header *)chunk) - 1;
-
-	munmap(header->executable, size);
-	munmap((void *)header, size);
-}
-
-/* --------------------------------------------------------------------- */
-/*  Common functions                                                     */
-/* --------------------------------------------------------------------- */
-
-#define CHUNK_MASK	(~(CHUNK_SIZE - 1))
-
-struct block_header {
-	sljit_uw size;
-	sljit_uw prev_size;
-	sljit_sw executable_offset;
-};
-
-struct free_block {
-	struct block_header header;
-	struct free_block *next;
-	struct free_block *prev;
-	sljit_uw size;
-};
-
-#define AS_BLOCK_HEADER(base, offset) \
-	((struct block_header*)(((sljit_u8*)base) + offset))
-#define AS_FREE_BLOCK(base, offset) \
-	((struct free_block*)(((sljit_u8*)base) + offset))
-#define MEM_START(base)		((void*)((base) + 1))
-#define ALIGN_SIZE(size)	(((size) + sizeof(struct block_header) + 7u) & ~(sljit_uw)7)
-
-static struct free_block* free_blocks;
-static sljit_uw allocated_size;
-static sljit_uw total_size;
-
-static SLJIT_INLINE void sljit_insert_free_block(struct free_block *free_block, sljit_uw size)
-{
-	free_block->header.size = 0;
-	free_block->size = size;
-
-	free_block->next = free_blocks;
-	free_block->prev = NULL;
-	if (free_blocks)
-		free_blocks->prev = free_block;
-	free_blocks = free_block;
-}
-
-static SLJIT_INLINE void sljit_remove_free_block(struct free_block *free_block)
-{
-	if (free_block->next)
-		free_block->next->prev = free_block->prev;
-
-	if (free_block->prev)
-		free_block->prev->next = free_block->next;
-	else {
-		SLJIT_ASSERT(free_blocks == free_block);
-		free_blocks = free_block->next;
-	}
-}
-
-SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
-{
-	struct chunk_header *chunk_header;
-	struct block_header *header;
-	struct block_header *next_header;
-	struct free_block *free_block;
-	sljit_uw chunk_size;
-	sljit_sw executable_offset;
-
-	SLJIT_ALLOCATOR_LOCK();
-	if (size < (64 - sizeof(struct block_header)))
-		size = (64 - sizeof(struct block_header));
-	size = ALIGN_SIZE(size);
-
-	free_block = free_blocks;
-	while (free_block) {
-		if (free_block->size >= size) {
-			chunk_size = free_block->size;
-			if (chunk_size > size + 64) {
-				/* We just cut a block from the end of the free block. */
-				chunk_size -= size;
-				free_block->size = chunk_size;
-				header = AS_BLOCK_HEADER(free_block, chunk_size);
-				header->prev_size = chunk_size;
-				header->executable_offset = free_block->header.executable_offset;
-				AS_BLOCK_HEADER(header, size)->prev_size = size;
-			}
-			else {
-				sljit_remove_free_block(free_block);
-				header = (struct block_header*)free_block;
-				size = chunk_size;
-			}
-			allocated_size += size;
-			header->size = size;
-			SLJIT_ALLOCATOR_UNLOCK();
-			return MEM_START(header);
-		}
-		free_block = free_block->next;
-	}
-
-	chunk_size = sizeof(struct chunk_header) + sizeof(struct block_header);
-	chunk_size = (chunk_size + size + CHUNK_SIZE - 1) & CHUNK_MASK;
-
-	chunk_header = alloc_chunk(chunk_size);
-	if (!chunk_header) {
-		SLJIT_ALLOCATOR_UNLOCK();
-		return NULL;
-	}
-
-	executable_offset = (sljit_sw)((sljit_u8*)chunk_header->executable - (sljit_u8*)chunk_header);
-
-	chunk_size -= sizeof(struct chunk_header) + sizeof(struct block_header);
-	total_size += chunk_size;
-
-	header = (struct block_header *)(chunk_header + 1);
-
-	header->prev_size = 0;
-	header->executable_offset = executable_offset;
-	if (chunk_size > size + 64) {
-		/* Cut the allocated space into a free and a used block. */
-		allocated_size += size;
-		header->size = size;
-		chunk_size -= size;
-
-		free_block = AS_FREE_BLOCK(header, size);
-		free_block->header.prev_size = size;
-		free_block->header.executable_offset = executable_offset;
-		sljit_insert_free_block(free_block, chunk_size);
-		next_header = AS_BLOCK_HEADER(free_block, chunk_size);
-	}
-	else {
-		/* All space belongs to this allocation. */
-		allocated_size += chunk_size;
-		header->size = chunk_size;
-		next_header = AS_BLOCK_HEADER(header, chunk_size);
-	}
-	next_header->size = 1;
-	next_header->prev_size = chunk_size;
-	next_header->executable_offset = executable_offset;
-	SLJIT_ALLOCATOR_UNLOCK();
-	return MEM_START(header);
-}
-
-SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr)
-{
-	struct block_header *header;
-	struct free_block* free_block;
-
-	SLJIT_ALLOCATOR_LOCK();
-	header = AS_BLOCK_HEADER(ptr, -(sljit_sw)sizeof(struct block_header));
-	header = AS_BLOCK_HEADER(header, -header->executable_offset);
-	allocated_size -= header->size;
-
-	/* Connecting free blocks together if possible. */
-
-	/* If header->prev_size == 0, free_block will equal to header.
-	   In this case, free_block->header.size will be > 0. */
-	free_block = AS_FREE_BLOCK(header, -(sljit_sw)header->prev_size);
-	if (SLJIT_UNLIKELY(!free_block->header.size)) {
-		free_block->size += header->size;
-		header = AS_BLOCK_HEADER(free_block, free_block->size);
-		header->prev_size = free_block->size;
-	}
-	else {
-		free_block = (struct free_block*)header;
-		sljit_insert_free_block(free_block, header->size);
-	}
-
-	header = AS_BLOCK_HEADER(free_block, free_block->size);
-	if (SLJIT_UNLIKELY(!header->size)) {
-		free_block->size += ((struct free_block*)header)->size;
-		sljit_remove_free_block((struct free_block*)header);
-		header = AS_BLOCK_HEADER(free_block, free_block->size);
-		header->prev_size = free_block->size;
-	}
-
-	/* The whole chunk is free. */
-	if (SLJIT_UNLIKELY(!free_block->header.prev_size && header->size == 1)) {
-		/* If this block is freed, we still have (allocated_size / 2) free space. */
-		if (total_size - free_block->size > (allocated_size * 3 / 2)) {
-			total_size -= free_block->size;
-			sljit_remove_free_block(free_block);
-			free_chunk(free_block, free_block->size +
-				sizeof(struct chunk_header) +
-				sizeof(struct block_header));
-		}
-	}
-
-	SLJIT_ALLOCATOR_UNLOCK();
-}
-
-SLJIT_API_FUNC_ATTRIBUTE void sljit_free_unused_memory_exec(void)
-{
-	struct free_block* free_block;
-	struct free_block* next_free_block;
-
-	SLJIT_ALLOCATOR_LOCK();
-
-	free_block = free_blocks;
-	while (free_block) {
-		next_free_block = free_block->next;
-		if (!free_block->header.prev_size && 
-				AS_BLOCK_HEADER(free_block, free_block->size)->size == 1) {
-			total_size -= free_block->size;
-			sljit_remove_free_block(free_block);
-			free_chunk(free_block, free_block->size +
-				sizeof(struct chunk_header) +
-				sizeof(struct block_header));
-		}
-		free_block = next_free_block;
-	}
-
-	SLJIT_ASSERT((total_size && free_blocks) || (!total_size && !free_blocks));
-	SLJIT_ALLOCATOR_UNLOCK();
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr)
-{
-	return ((struct block_header *)(ptr))[-1].executable_offset;
-}
diff --git a/waterbox/ares64/ares/thirdparty/sljit/test_src/sljitTest.c b/waterbox/ares64/ares/thirdparty/sljit/test_src/sljitTest.c
index d20695e9ce..1a133a7a4d 100644
--- a/waterbox/ares64/ares/thirdparty/sljit/test_src/sljitTest.c
+++ b/waterbox/ares64/ares/thirdparty/sljit/test_src/sljitTest.c
@@ -74,6 +74,9 @@ union executable_code {
 	void (SLJIT_FUNC *test73_f2)(sljit_sw a, sljit_sw b, sljit_s32 c, sljit_s32 d);
 	void (SLJIT_FUNC *test73_f3)(sljit_f64 a, sljit_f64 b, sljit_f64 c, sljit_sw d);
 	void (SLJIT_FUNC *test73_f4)(sljit_f64 a, sljit_f64 b, sljit_sw c, sljit_sw d);
+
+	sljit_f32 (SLJIT_FUNC *test81_f1)(sljit_sw a);
+	sljit_f64 (SLJIT_FUNC *test81_f2)(sljit_sw a);
 };
 typedef union executable_code executable_code;
 
@@ -94,6 +97,12 @@ static sljit_s32 silent = 0;
 		return; \
 	}
 
+#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
+#define WCONST(const64, const32) ((sljit_sw)SLJIT_W(const64))
+#else /* !SLJIT_64BIT_ARCHITECTURE */
+#define WCONST(const64, const32) ((sljit_sw)const32)
+#endif /* SLJIT_64BIT_ARCHITECTURE */
+
 static void cond_set(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_s32 type)
 {
 	/* Testing both sljit_emit_op_flags and sljit_emit_jump. */
@@ -213,7 +222,7 @@ static void test2(void)
 	/* Test mov. */
 	executable_code code;
 	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
-	sljit_sw buf[8];
+	sljit_sw buf[10];
 	static sljit_sw data[2] = { 0, -9876 };
 
 	if (verbose)
@@ -232,24 +241,38 @@ static void test2(void)
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, P), 3, 2, 0, 0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 9999);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S1, 0, SLJIT_S0, 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_sw), SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, sizeof(sljit_sw));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_MEM2(SLJIT_S1, SLJIT_R1), 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, 2);
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM2(SLJIT_S1, SLJIT_S0), SLJIT_WORD_SHIFT, SLJIT_MEM2(SLJIT_S1, SLJIT_R1), 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, 3);
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM2(SLJIT_S1, SLJIT_S0), SLJIT_WORD_SHIFT, SLJIT_MEM0(), (sljit_sw)&buf);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, sizeof(sljit_sw));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_R0), (sljit_sw)&data);
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 4 * sizeof(sljit_sw), SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&buf - 0x12345678);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_R0), 0x12345678);
+	/* buf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 5 * sizeof(sljit_sw), SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 3456);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)&buf - 0xff890 + 6 * (sljit_sw)sizeof(sljit_sw));
+	/* buf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 0xff890, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)&buf + 0xff890 + 7 * (sljit_sw)sizeof(sljit_sw));
+	/* buf[7] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), -0xff890, SLJIT_R0, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&buf - 0xfff0ff + 8 * (sljit_sw)sizeof(sljit_sw));
+	/* buf[8] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 0xfff0ff, SLJIT_IMM, 7896);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&buf + 0xfff100 + 9 * (sljit_sw)sizeof(sljit_sw));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, -2450);
+	/* buf[9] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), -0xfff100, SLJIT_R1, 0);
 	sljit_emit_return(compiler, SLJIT_MOV, SLJIT_R2, 0);
 
 	code.code = sljit_generate_code(compiler);
@@ -262,8 +285,10 @@ static void test2(void)
 	FAILED(buf[3] != 5678, "test2 case 4 failed\n");
 	FAILED(buf[4] != -9876, "test2 case 5 failed\n");
 	FAILED(buf[5] != 5678, "test2 case 6 failed\n");
-	FAILED(buf[6] != 3456, "test2 case 6 failed\n");
-	FAILED(buf[7] != 3456, "test2 case 6 failed\n");
+	FAILED(buf[6] != 3456, "test2 case 7 failed\n");
+	FAILED(buf[7] != 3456, "test2 case 8 failed\n");
+	FAILED(buf[8] != 7896, "test2 case 9 failed\n");
+	FAILED(buf[9] != -2450, "test2 case 10 failed\n");
 
 	sljit_free_code(code.code, NULL);
 	successful_tests++;
@@ -287,13 +312,16 @@ static void test3(void)
 	buf[4] = 0x12345678;
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, P), 3, 1, 0, 0, 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_MEM1(SLJIT_S0), 0);
-	sljit_emit_op1(compiler, SLJIT_NOT, SLJIT_MEM0(), (sljit_sw)&buf[1], SLJIT_MEM0(), (sljit_sw)&buf[1]);
-	sljit_emit_op1(compiler, SLJIT_NOT, SLJIT_RETURN_REG, 0, SLJIT_MEM1(SLJIT_S0), 0);
-	sljit_emit_op1(compiler, SLJIT_NOT, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2);
+	sljit_emit_op2(compiler, SLJIT_XOR, SLJIT_MEM0(), (sljit_sw)&buf[1], SLJIT_MEM0(), (sljit_sw)&buf[1], SLJIT_IMM, -1);
+	/* buf[3] */
+	sljit_emit_op2(compiler, SLJIT_XOR, SLJIT_RETURN_REG, 0, SLJIT_IMM, -1, SLJIT_MEM1(SLJIT_S0), 0);
+	sljit_emit_op2(compiler, SLJIT_XOR, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2, SLJIT_IMM, -1);
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)&buf[4] - 0xff0000 - 0x20);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, (sljit_sw)&buf[4] - 0xff0000);
-	sljit_emit_op1(compiler, SLJIT_NOT, SLJIT_MEM1(SLJIT_R1), 0xff0000 + 0x20, SLJIT_MEM1(SLJIT_R2), 0xff0000);
+	sljit_emit_op2(compiler, SLJIT_XOR, SLJIT_MEM1(SLJIT_R1), 0xff0000 + 0x20, SLJIT_IMM, -1, SLJIT_MEM1(SLJIT_R2), 0xff0000);
 	sljit_emit_return(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0);
 
 	code.code = sljit_generate_code(compiler);
@@ -326,11 +354,14 @@ static void test4(void)
 	buf[3] = 0;
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(W, P, W), 3, 2, 0, 0, 0);
-	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2, SLJIT_IMM, 0, SLJIT_S1, 0);
+	/* buf[0] */
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_MEM0(), (sljit_sw)&buf[0], SLJIT_IMM, 0, SLJIT_MEM0(), (sljit_sw)&buf[1]);
+	/* buf[2] */
+	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2, SLJIT_IMM, 0, SLJIT_S1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 299);
+	/* buf[3] */
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3, SLJIT_IMM, 0, SLJIT_R1, 0);
-	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0, SLJIT_S1, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2);
 	sljit_emit_return(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0);
 
 	code.code = sljit_generate_code(compiler);
@@ -370,22 +401,32 @@ static void test5(void)
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, P), 3, 2, 0, 0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, sizeof(sljit_sw));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 50);
+	/* buf[2] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), 1, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), 1, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), 0);
+	/* buf[0] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, sizeof(sljit_sw) + 2);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 50);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S0), 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
+	/* buf[5] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_IMM, 4, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_IMM, 50, SLJIT_R1, 0);
+	/* buf[4] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_IMM, 50, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw));
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_R1, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_R1, 0, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw));
+	/* buf[5] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw));
+	/* buf[3] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_R1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 0x1e7d39f2);
+	/* buf[6] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), SLJIT_R1, 0, SLJIT_IMM, 0x23de7c06);
+	/* buf[7] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_sw), SLJIT_IMM, 0x3d72e452, SLJIT_R1, 0);
+	/* buf[8] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw), SLJIT_IMM, -43, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw));
+	/* Return value */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_IMM, 1000, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 1430);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_IMM, -99, SLJIT_R0, 0);
@@ -436,32 +477,44 @@ static void test6(void)
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, P), 3, 1, 0, 0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -1);
 	sljit_emit_op2(compiler, SLJIT_ADD | SLJIT_SET_CARRY, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, -1);
+	/* buf[0] */
 	sljit_emit_op2(compiler, SLJIT_ADDC, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_IMM, 0, SLJIT_IMM, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD | SLJIT_SET_CARRY, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_R0, 0);
+	/* buf[1] */
 	sljit_emit_op2(compiler, SLJIT_ADDC, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_IMM, 4);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 100);
+	/* buf[2] */
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2, SLJIT_R0, 0, SLJIT_IMM, 50);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_CARRY, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 6000);
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3, SLJIT_IMM, 10);
 	sljit_emit_op2(compiler, SLJIT_SUBC, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3, SLJIT_IMM, 5);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 100);
-	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2);
+	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_CARRY, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2);
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 4, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 5000);
+	sljit_emit_op2(compiler, SLJIT_SUBC, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 4, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 4, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_R1, 0);
+	/* buf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 5, SLJIT_R1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 5000);
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R0, 0, SLJIT_IMM, 6000, SLJIT_R0, 0);
+	/* buf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 6, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 100);
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, 32768);
+	/* buf[7] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 7, SLJIT_R1, 0);
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, -32767);
+	/* buf[8] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 8, SLJIT_R1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0x52cd3bf4);
+	/* buf[9] */
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 9, SLJIT_R0, 0, SLJIT_IMM, 0x3da297c6);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 6000);
+	/* buf[10] */
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 10, SLJIT_R1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 10, SLJIT_R1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, 10);
@@ -513,23 +566,33 @@ static void test7(void)
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, P), 3, 1, 0, 0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0xf0C000);
+	/* buf[2] */
 	sljit_emit_op2(compiler, SLJIT_OR, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2, SLJIT_R0, 0, SLJIT_IMM, 0x308f);
+	/* buf[0] */
 	sljit_emit_op2(compiler, SLJIT_XOR, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw));
+	/* buf[3] */
 	sljit_emit_op2(compiler, SLJIT_AND, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3, SLJIT_IMM, 0xf0f0f0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3);
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0xC0F0);
 	sljit_emit_op2(compiler, SLJIT_XOR, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 5);
 	sljit_emit_op2(compiler, SLJIT_OR, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 0xff0000);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 4, SLJIT_R0, 0);
+	/* buf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 0xC0F0);
 	sljit_emit_op2(compiler, SLJIT_AND, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 5);
 	sljit_emit_op2(compiler, SLJIT_OR, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 5, SLJIT_R2, 0, SLJIT_IMM, 0xff0000);
+	/* buf[1] */
 	sljit_emit_op2(compiler, SLJIT_XOR, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_IMM, 0xFFFFFF, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw));
+	/* buf[6] */
 	sljit_emit_op2(compiler, SLJIT_OR, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 6, SLJIT_IMM, (sljit_sw)0xa56c82c0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 6);
+	/* buf[7] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 7);
 	sljit_emit_op2(compiler, SLJIT_XOR, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 7, SLJIT_IMM, (sljit_sw)0xff00ff00, SLJIT_R0, 0);
+	/* Return value */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)0xff00ff00);
 	sljit_emit_op2(compiler, SLJIT_OR, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, 0x0f);
 	sljit_emit_op2(compiler, SLJIT_AND, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0x888888, SLJIT_R1, 0);
+
 	sljit_emit_return(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0);
 
 	code.code = sljit_generate_code(compiler);
@@ -593,10 +656,10 @@ static void test8(void)
 	/* buf[6] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 6, SLJIT_OVERFLOW);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -1);
-	sljit_emit_op1(compiler, SLJIT_NOT | SLJIT_SET_Z, SLJIT_R1, 0, SLJIT_R0, 0);
+	sljit_emit_op2(compiler, SLJIT_XOR | SLJIT_SET_Z, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, -1);
 	/* buf[7] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 7, SLJIT_ZERO);
-	sljit_emit_op1(compiler, SLJIT_NOT | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_R1, 0);
+	sljit_emit_op2(compiler, SLJIT_XOR | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_IMM, -1, SLJIT_R1, 0);
 	/* buf[8] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 8, SLJIT_ZERO);
 	sljit_emit_op2u(compiler, SLJIT_AND | SLJIT_SET_Z, SLJIT_IMM, 0xffff, SLJIT_R0, 0);
@@ -688,7 +751,8 @@ static void test9(void)
 	/* Test shift. */
 	executable_code code;
 	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
-	sljit_sw buf[13];
+	sljit_sw buf[15];
+	sljit_s32 i;
 #ifdef SLJIT_PREF_SHIFT_REG
 	sljit_s32 shift_reg = SLJIT_PREF_SHIFT_REG;
 #else
@@ -701,57 +765,61 @@ static void test9(void)
 		printf("Run test9\n");
 
 	FAILED(!compiler, "cannot create compiler\n");
-	buf[0] = 0;
-	buf[1] = 0;
-	buf[2] = 0;
-	buf[3] = 0;
+
+	for (i = 0; i < 15; i++)
+		buf[i] = -1;
+
 	buf[4] = 1 << 10;
-	buf[5] = 0;
-	buf[6] = 0;
-	buf[7] = 0;
-	buf[8] = 0;
 	buf[9] = 3;
-	buf[10] = 0;
-	buf[11] = 0;
-	buf[12] = 0;
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, P), 4, 2, 0, 0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0xf);
 	sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 3);
 	sljit_emit_op2(compiler, SLJIT_LSHR, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 1);
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, 1);
+	/* buf[1] */
 	sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R1, 0, SLJIT_IMM, 1);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -64);
 	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, 2);
+	/* buf[2] */
 	sljit_emit_op2(compiler, SLJIT_ASHR, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2, SLJIT_R0, 0, shift_reg, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, 0xff);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 4);
 	sljit_emit_op2(compiler, SLJIT_SHL, shift_reg, 0, shift_reg, 0, SLJIT_R0, 0);
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3, shift_reg, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, 0xff);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 8);
+	/* buf[4] */
 	sljit_emit_op2(compiler, SLJIT_LSHR, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 4, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 4, SLJIT_R0, 0);
+	/* buf[5] */
 	sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 5, shift_reg, 0, SLJIT_R0, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S1, 0, SLJIT_IMM, 0xf);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 2);
 	sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_S1, 0, SLJIT_S1, 0, SLJIT_R0, 0);
+	/* buf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 6, SLJIT_S1, 0);
 	sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_R0, 0, SLJIT_S1, 0, SLJIT_R0, 0);
+	/* buf[7] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 7, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 0xf00);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 4);
 	sljit_emit_op2(compiler, SLJIT_LSHR, SLJIT_R1, 0, SLJIT_R2, 0, SLJIT_R0, 0);
+	/* buf[8] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 8, SLJIT_R1, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)buf);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 9);
+	/* buf[9] */
 	sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_MEM2(SLJIT_R0, SLJIT_R1), SLJIT_WORD_SHIFT, SLJIT_MEM2(SLJIT_R0, SLJIT_R1), SLJIT_WORD_SHIFT, SLJIT_MEM2(SLJIT_R0, SLJIT_R1), SLJIT_WORD_SHIFT);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, 4);
 	sljit_emit_op2(compiler, SLJIT_SHL, shift_reg, 0, SLJIT_IMM, 2, shift_reg, 0);
+	/* buf[10] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 10, shift_reg, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0xa9);
@@ -774,6 +842,7 @@ static void test9(void)
 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
 	sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_R0, 0, SLJIT_R0, 0);
 #endif
+	/* buf[11] */
 	sljit_emit_op2(compiler, SLJIT_OR, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 11, SLJIT_R1, 0, SLJIT_R0, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, 0);
@@ -788,8 +857,22 @@ static void test9(void)
 	sljit_emit_op2(compiler, SLJIT_OR, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0x630000);
 	sljit_emit_op2(compiler, SLJIT_ASHR, SLJIT_R0, 0, SLJIT_R0, 0, shift_reg, 0);
+	/* buf[12] */
 	sljit_emit_op2(compiler, SLJIT_OR, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 12, SLJIT_R1, 0, SLJIT_R0, 0);
 
+	/* Test shift_reg keeps 64 bit value after 32 bit operation. */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, -3062);
+	sljit_emit_op2(compiler, SLJIT_ASHR32, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_R1, 0);
+	/* buf[13] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 13, shift_reg, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, -4691);
+	sljit_emit_op2(compiler, SLJIT_LSHR32, SLJIT_R0, 0, SLJIT_R1, 0, SLJIT_R0, 0);
+	/* buf[14] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 14, shift_reg, 0);
+
 	sljit_emit_return_void(compiler);
 
 	code.code = sljit_generate_code(compiler);
@@ -810,6 +893,8 @@ static void test9(void)
 	FAILED(buf[10] != 32, "test9 case 11 failed\n");
 	FAILED(buf[11] != 0x4ae37da9, "test9 case 12 failed\n");
 	FAILED(buf[12] != 0x63f65c, "test9 case 13 failed\n");
+	FAILED(buf[13] != -3062, "test9 case 14 failed\n");
+	FAILED(buf[14] != -4691, "test9 case 15 failed\n");
 
 	sljit_free_code(code.code, NULL);
 	successful_tests++;
@@ -837,23 +922,30 @@ static void test10(void)
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, P), 3, 1, 0, 0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 5);
 	sljit_emit_op2(compiler, SLJIT_MUL, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 7);
 	sljit_emit_op2(compiler, SLJIT_MUL, SLJIT_R0, 0, SLJIT_R2, 0, SLJIT_IMM, 8);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_MUL, SLJIT_R0, 0, SLJIT_IMM, -3, SLJIT_IMM, -4);
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -2);
+	/* buf[3] */
 	sljit_emit_op2(compiler, SLJIT_MUL, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, sizeof(sljit_sw) / 2);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)&buf[3]);
+	/* buf[4] */
 	sljit_emit_op2(compiler, SLJIT_MUL, SLJIT_MEM2(SLJIT_R1, SLJIT_R0), 1, SLJIT_MEM2(SLJIT_R1, SLJIT_R0), 1, SLJIT_MEM2(SLJIT_R1, SLJIT_R0), 1);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 9);
 	sljit_emit_op2(compiler, SLJIT_MUL, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_R0, 0);
+	/* buf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 5, SLJIT_R0, 0);
 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 3);
 	sljit_emit_op2(compiler, SLJIT_MUL, SLJIT_R0, 0, SLJIT_R1, 0, SLJIT_IMM, SLJIT_W(0x123456789));
+	/* buf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 6, SLJIT_R0, 0);
 #endif
 	sljit_emit_op2(compiler, SLJIT_MUL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 11, SLJIT_IMM, 10);
@@ -915,6 +1007,7 @@ static void test11(void)
 	SLJIT_ASSERT(!sljit_alloc_memory(compiler, 0));
 	SLJIT_ASSERT(!sljit_alloc_memory(compiler, 16 * sizeof(sljit_sw) + 1));
 
+	/* buf[0] */
 	const1 = sljit_emit_const(compiler, SLJIT_MEM0(), (sljit_sw)&buf[0], -0x81b9);
 
 	value = sljit_alloc_memory(compiler, 16 * sizeof(sljit_sw));
@@ -924,8 +1017,10 @@ static void test11(void)
 		memset(value, 255, 16 * sizeof(sljit_sw));
 	}
 
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 2);
 	const2 = sljit_emit_const(compiler, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), SLJIT_WORD_SHIFT - 1, -65535);
+	/* buf[2] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&buf[0] + 2 * (sljit_sw)sizeof(sljit_sw) - 2);
 	const3 = sljit_emit_const(compiler, SLJIT_MEM1(SLJIT_R0), 0, word_value1);
 
@@ -936,6 +1031,7 @@ static void test11(void)
 		memset(value, 255, 16);
 	}
 
+	/* Return value */
 	const4 = sljit_emit_const(compiler, SLJIT_RETURN_REG, 0, (sljit_sw)0xf7afcdb7);
 
 	sljit_emit_return(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0);
@@ -954,9 +1050,13 @@ static void test11(void)
 	FAILED(buf[1] != -65535, "test11 case 3 failed\n");
 	FAILED(buf[2] != word_value1, "test11 case 4 failed\n");
 
+	/* buf[0] */
 	sljit_set_const(const1_addr, -1, executable_offset);
+	/* buf[1] */
 	sljit_set_const(const2_addr, word_value2, executable_offset);
+	/* buf[2] */
 	sljit_set_const(const3_addr, (sljit_sw)0xbab0fea1, executable_offset);
+	/* Return value */
 	sljit_set_const(const4_addr, -60089, executable_offset);
 
 	FAILED(code.func1((sljit_sw)&buf) != -60089, "test11 case 5 failed\n");
@@ -1061,7 +1161,7 @@ static void test13(void)
 {
 	/* Test fpu monadic functions. */
 	executable_code code;
-	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	struct sljit_compiler* compiler;
 	sljit_f64 buf[7];
 	sljit_sw buf2[6];
 
@@ -1072,12 +1172,12 @@ static void test13(void)
 		if (verbose)
 			printf("no fpu available, test13 skipped\n");
 		successful_tests++;
-		if (compiler)
-			sljit_free_compiler(compiler);
 		return;
 	}
 
+	compiler = sljit_create_compiler(NULL, NULL);
 	FAILED(!compiler, "cannot create compiler\n");
+
 	buf[0] = 7.75;
 	buf[1] = -4.5;
 	buf[2] = 0.0;
@@ -1094,30 +1194,41 @@ static void test13(void)
 	buf2[5] = 10;
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(VOID, P, P), 3, 2, 6, 0, 0);
+	/* buf[2] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM0(), (sljit_sw)&buf[2], SLJIT_MEM0(), (sljit_sw)&buf[1]);
+	/* buf[3] */
 	sljit_emit_fop1(compiler, SLJIT_ABS_F64, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_f64), SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64));
+	/* buf[4] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, SLJIT_MEM0(), (sljit_sw)&buf[0]);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 2 * sizeof(sljit_f64));
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR1, 0, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), 0);
 	sljit_emit_fop1(compiler, SLJIT_NEG_F64, SLJIT_FR2, 0, SLJIT_FR0, 0);
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR3, 0, SLJIT_FR2, 0);
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM0(), (sljit_sw)&buf[4], SLJIT_FR3, 0);
+	/* buf[5] */
 	sljit_emit_fop1(compiler, SLJIT_ABS_F64, SLJIT_FR4, 0, SLJIT_FR1, 0);
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_f64), SLJIT_FR4, 0);
+	/* buf[6] */
 	sljit_emit_fop1(compiler, SLJIT_NEG_F64, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_f64), SLJIT_FR4, 0);
 
+	/* buf2[0] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR5, 0, SLJIT_MEM1(SLJIT_S0), 0);
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_F_GREATER, SLJIT_FR5, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64));
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 0, SLJIT_F_GREATER);
+	/* buf2[1] */
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_F_GREATER, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64), SLJIT_FR5, 0);
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_sw), SLJIT_F_GREATER);
+	/* buf2[2] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR1, 0, SLJIT_FR5, 0);
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_F_EQUAL, SLJIT_FR1, 0, SLJIT_FR1, 0);
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 2 * sizeof(sljit_sw), SLJIT_F_EQUAL);
+	/* buf2[3] */
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_F_LESS, SLJIT_FR1, 0, SLJIT_FR1, 0);
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_sw), SLJIT_F_LESS);
+	/* buf2[4] */
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_F_EQUAL, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64));
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 4 * sizeof(sljit_sw), SLJIT_F_EQUAL);
+	/* buf2[5] */
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_F_NOT_EQUAL, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64));
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 5 * sizeof(sljit_sw), SLJIT_F_NOT_EQUAL);
 
@@ -1149,7 +1260,7 @@ static void test14(void)
 {
 	/* Test fpu diadic functions. */
 	executable_code code;
-	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	struct sljit_compiler* compiler;
 	sljit_f64 buf[15];
 
 	if (verbose)
@@ -1159,10 +1270,12 @@ static void test14(void)
 		if (verbose)
 			printf("no fpu available, test14 skipped\n");
 		successful_tests++;
-		if (compiler)
-			sljit_free_compiler(compiler);
 		return;
 	}
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
 	buf[0] = 7.25;
 	buf[1] = 3.5;
 	buf[2] = 1.75;
@@ -1179,45 +1292,56 @@ static void test14(void)
 	buf[13] = 4.0;
 	buf[14] = 0.0;
 
-	FAILED(!compiler, "cannot create compiler\n");
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, P), 3, 1, 6, 0, 0);
 
 	/* ADD */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, sizeof(sljit_f64));
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64));
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 2);
+	/* buf[3] */
 	sljit_emit_fop2(compiler, SLJIT_ADD_F64, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 3, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), 0, SLJIT_MEM1(SLJIT_S0), 0);
 	sljit_emit_fop2(compiler, SLJIT_ADD_F64, SLJIT_FR0, 0, SLJIT_FR0, 0, SLJIT_FR1, 0);
 	sljit_emit_fop2(compiler, SLJIT_ADD_F64, SLJIT_FR1, 0, SLJIT_FR0, 0, SLJIT_FR1, 0);
+	/* buf[4] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 4, SLJIT_FR0, 0);
+	/* buf[5] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 5, SLJIT_FR1, 0);
 
 	/* SUB */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR2, 0, SLJIT_MEM1(SLJIT_S0), 0);
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR3, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 2);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 2);
+	/* buf[6] */
 	sljit_emit_fop2(compiler, SLJIT_SUB_F64, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 6, SLJIT_FR3, 0, SLJIT_MEM2(SLJIT_S0, SLJIT_R1), SLJIT_F64_SHIFT);
 	sljit_emit_fop2(compiler, SLJIT_SUB_F64, SLJIT_FR2, 0, SLJIT_FR2, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 2);
 	sljit_emit_fop2(compiler, SLJIT_SUB_F64, SLJIT_FR3, 0, SLJIT_FR2, 0, SLJIT_FR3, 0);
+	/* buf[7] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 7, SLJIT_FR2, 0);
+	/* buf[8] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 8, SLJIT_FR3, 0);
 
 	/* MUL */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 1);
+	/* buf[9] */
 	sljit_emit_fop2(compiler, SLJIT_MUL_F64, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 9, SLJIT_MEM2(SLJIT_S0, SLJIT_R1), SLJIT_F64_SHIFT, SLJIT_FR1, 0);
 	sljit_emit_fop2(compiler, SLJIT_MUL_F64, SLJIT_FR1, 0, SLJIT_FR1, 0, SLJIT_FR2, 0);
 	sljit_emit_fop2(compiler, SLJIT_MUL_F64, SLJIT_FR5, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 2, SLJIT_FR2, 0);
+	/* buf[10] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 10, SLJIT_FR1, 0);
+	/* buf[11] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 11, SLJIT_FR5, 0);
 
 	/* DIV */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR5, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 12);
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 13);
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR4, 0, SLJIT_FR5, 0);
+	/* buf[12] */
 	sljit_emit_fop2(compiler, SLJIT_DIV_F64, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 12, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 12, SLJIT_FR1, 0);
 	sljit_emit_fop2(compiler, SLJIT_DIV_F64, SLJIT_FR5, 0, SLJIT_FR5, 0, SLJIT_FR1, 0);
 	sljit_emit_fop2(compiler, SLJIT_DIV_F64, SLJIT_FR4, 0, SLJIT_FR1, 0, SLJIT_FR4, 0);
+	/* buf[13] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 13, SLJIT_FR5, 0);
+	/* buf[14] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64) * 14, SLJIT_FR4, 0);
 
 	sljit_emit_return_void(compiler);
@@ -1249,13 +1373,18 @@ static sljit_sw func(sljit_sw a, sljit_sw b, sljit_sw c)
 	return a + b + c + 5;
 }
 
+static sljit_sw func4(sljit_sw a, sljit_sw b, sljit_sw c, sljit_sw d)
+{
+	return func(a, b, c) + d;
+}
+
 static void test15(void)
 {
 	/* Test function call. */
 	executable_code code;
 	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
 	struct sljit_jump* jump = NULL;
-	sljit_sw buf[7];
+	sljit_sw buf[9];
 
 	if (verbose)
 		printf("Run test15\n");
@@ -1267,16 +1396,20 @@ static void test15(void)
 	buf[3] = 0;
 	buf[4] = 0;
 	buf[5] = 0;
-	buf[6] = SLJIT_FUNC_ADDR(func);
+	buf[6] = 0;
+	buf[7] = 0;
+	buf[8] = SLJIT_FUNC_ADDR(func);
 
-	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, P), 4, 1, 0, 0, 0);
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, P), 4, 2, 0, 0, 0);
 
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 5);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 7);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, -3);
 	sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3(W, W, W, W), SLJIT_IMM, SLJIT_FUNC_ADDR(func));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_RETURN_REG, 0);
 
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -5);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, -10);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 2);
@@ -1284,24 +1417,28 @@ static void test15(void)
 	sljit_set_target(jump, (sljit_uw)-1);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_RETURN_REG, 0);
 
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, SLJIT_FUNC_ADDR(func));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 40);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, -3);
 	sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3(W, W, W, W), SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_RETURN_REG, 0);
 
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -60);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, SLJIT_FUNC_ADDR(func));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, -30);
 	sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3(W, W, W, W), SLJIT_R1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_RETURN_REG, 0);
 
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 10);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 16);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, SLJIT_FUNC_ADDR(func));
 	sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3(W, W, W, W), SLJIT_R2, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_RETURN_REG, 0);
 
+	/* buf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 100);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 110);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 120);
@@ -1309,11 +1446,29 @@ static void test15(void)
 	sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3(W, W, W, W), SLJIT_R3, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_RETURN_REG, 0);
 
+	/* buf[6] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 1);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 2);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 3);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S1, 0, SLJIT_IMM, SLJIT_FUNC_ADDR(func));
+	sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3(W, W, W, W), SLJIT_S1, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), SLJIT_RETURN_REG, 0);
+
+	/* buf[7] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 1);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 2);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 3);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R3, 0, SLJIT_IMM, -6);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S1, 0, SLJIT_IMM, SLJIT_FUNC_ADDR(func4));
+	sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W), SLJIT_S1, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_sw), SLJIT_RETURN_REG, 0);
+
+	/* buf[8] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -10);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, -16);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 6);
-	sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3(W, W, W, W), SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw));
-	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), SLJIT_RETURN_REG, 0);
+	sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3(W, W, W, W), SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw), SLJIT_RETURN_REG, 0);
 
 	sljit_emit_return(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0);
 
@@ -1329,7 +1484,9 @@ static void test15(void)
 	FAILED(buf[3] != SLJIT_FUNC_ADDR(func) - 85, "test15 case 5 failed\n");
 	FAILED(buf[4] != SLJIT_FUNC_ADDR(func) + 31, "test15 case 6 failed\n");
 	FAILED(buf[5] != 335, "test15 case 7 failed\n");
-	FAILED(buf[6] != -15, "test15 case 8 failed\n");
+	FAILED(buf[6] != 11, "test15 case 8 failed\n");
+	FAILED(buf[7] != 5, "test15 case 9 failed\n");
+	FAILED(buf[8] != -15, "test15 case 10 failed\n");
 
 	sljit_free_code(code.code, NULL);
 	successful_tests++;
@@ -1418,11 +1575,12 @@ static void test17(void)
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, P), 3, 1, 0, 0, 0);
 	for (i = 0; i <= 0xfff; i++) {
-		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)0x81818000 | i);
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)0x81818000 | i);
 		if ((i & 0x3ff) == 0)
+			/* buf[0-3] */
 			sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), (i >> 10) * (sljit_sw)sizeof(sljit_sw), SLJIT_R0, 0);
 	}
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_R0, 0);
 	sljit_emit_return_void(compiler);
 
@@ -1471,42 +1629,51 @@ static void test18(void)
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, P), 3, 2, 0, 0, 0);
 
 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_IMM, SLJIT_W(0x1122334455667788));
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_IMM, SLJIT_W(0x1122334455667788));
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, SLJIT_W(1000000000000));
+	/* buf[2] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2, SLJIT_R0, 0, SLJIT_IMM, SLJIT_W(1000000000000));
+	/* buf[3] */
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3, SLJIT_IMM, SLJIT_W(5000000000000), SLJIT_R0, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, SLJIT_W(0x1108080808));
+	/* buf[4] */
 	sljit_emit_op2(compiler, SLJIT_ADD32, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 4, SLJIT_R1, 0, SLJIT_IMM, SLJIT_W(0x1120202020));
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, SLJIT_W(0x1108080808));
 	sljit_emit_op2u(compiler, SLJIT_AND | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_IMM, SLJIT_W(0x1120202020));
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_S1, 0, SLJIT_ZERO);
+	/* buf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 5, SLJIT_S1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_R0, 0);
 	sljit_emit_op2u(compiler, SLJIT_AND32 | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_IMM, SLJIT_W(0x1120202020));
+	/* buf[6] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 6, SLJIT_ZERO);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, SLJIT_W(0x1108080808));
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_LESS, SLJIT_R0, 0, SLJIT_IMM, SLJIT_W(0x2208080808));
+	/* buf[7] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 7, SLJIT_LESS);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_R0, 0);
 	sljit_emit_op2u(compiler, SLJIT_AND32 | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_IMM, SLJIT_W(0x1104040404));
+	/* buf[8] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 8, SLJIT_NOT_ZERO);
 
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 4);
+	/* buf[9] */
 	sljit_emit_op2(compiler, SLJIT_SHL32, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 9, SLJIT_IMM, SLJIT_W(0xffff0000), SLJIT_R0, 0);
-
+	/* buf[10] */
 	sljit_emit_op2(compiler, SLJIT_MUL32, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 10, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 10, SLJIT_IMM, -1);
-#else
-	/* 32 bit operations. */
-
+#else /* !SLJIT_64BIT_ARCHITECTURE */
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_IMM, 0x11223344);
+	/* buf[1] */
 	sljit_emit_op2(compiler, SLJIT_ADD32, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_IMM, 0x44332211);
-
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
 	sljit_emit_return_void(compiler);
 
@@ -1519,16 +1686,16 @@ static void test18(void)
 	FAILED(buf[0] != SLJIT_W(0x1122334455667788), "test18 case 1 failed\n");
 #if (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN)
 	FAILED(buf[1] != 0x55667788, "test18 case 2 failed\n");
-#else
+#else /* !SLJIT_LITTLE_ENDIAN */
 	FAILED(buf[1] != SLJIT_W(0x5566778800000000), "test18 case 2 failed\n");
-#endif
+#endif /* SLJIT_LITTLE_ENDIAN */
 	FAILED(buf[2] != SLJIT_W(2000000000000), "test18 case 3 failed\n");
 	FAILED(buf[3] != SLJIT_W(4000000000000), "test18 case 4 failed\n");
 #if (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN)
 	FAILED(buf[4] != 0x28282828, "test18 case 5 failed\n");
-#else
+#else /* !SLJIT_LITTLE_ENDIAN */
 	FAILED(buf[4] != SLJIT_W(0x2828282800000000), "test18 case 5 failed\n");
-#endif
+#endif /* SLJIT_LITTLE_ENDIAN */
 	FAILED(buf[5] != 0, "test18 case 6 failed\n");
 	FAILED(buf[6] != 1, "test18 case 7 failed\n");
 	FAILED(buf[7] != 1, "test18 case 8 failed\n");
@@ -1536,14 +1703,14 @@ static void test18(void)
 #if (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN)
 	FAILED(buf[9] != (sljit_sw)0xfff00000, "test18 case 10 failed\n");
 	FAILED(buf[10] != (sljit_sw)0xffffffff, "test18 case 11 failed\n");
-#else
+#else /* !SLJIT_LITTLE_ENDIAN */
 	FAILED(buf[9] != (sljit_sw)SLJIT_W(0xfff0000000000000), "test18 case 10 failed\n");
 	FAILED(buf[10] != (sljit_sw)SLJIT_W(0xffffffff00000000), "test18 case 11 failed\n");
-#endif
-#else
+#endif /* SLJIT_LITTLE_ENDIAN */
+#else /* !SLJIT_64BIT_ARCHITECTURE */
 	FAILED(buf[0] != 0x11223344, "test18 case 1 failed\n");
 	FAILED(buf[1] != 0x44332211, "test18 case 2 failed\n");
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
 	sljit_free_code(code.code, NULL);
 	successful_tests++;
@@ -1570,14 +1737,21 @@ static void test19(void)
 	buf[7] = 0;
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, P), 3, 1, 0, 0, 0);
+	/* buf[0] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw));
+	/* buf[2] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM0(), (sljit_sw)&buf[2], SLJIT_MEM0(), (sljit_sw)&buf[1], SLJIT_MEM0(), (sljit_sw)&buf[0]);
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, sizeof(sljit_sw));
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3, SLJIT_MEM1(SLJIT_R0), (sljit_sw)&buf[0], SLJIT_MEM1(SLJIT_R1), (sljit_sw)&buf[0]);
+	/* buf[4] */
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 4, SLJIT_MEM0(), (sljit_sw)&buf[0], SLJIT_IMM, 2);
+	/* buf[5] */
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 5, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_MEM1(SLJIT_R0), (sljit_sw)&buf[0] + 4 * (sljit_sw)sizeof(sljit_sw));
+	/* buf[7] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 7, SLJIT_IMM, 10);
+	/* buf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 7);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_R1), (sljit_sw)&buf[5], SLJIT_MEM2(SLJIT_S0, SLJIT_R0), SLJIT_WORD_SHIFT, SLJIT_MEM1(SLJIT_R1), (sljit_sw)&buf[5]);
 
@@ -1627,18 +1801,24 @@ static void test20(void)
 	buf[5] = -12345;
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, P), 5, 5, 0, 0, 4 * sizeof(sljit_sw));
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_uw), SLJIT_MEM1(SLJIT_S0), 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_uw));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R3, 0, SLJIT_IMM, -1);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, -1);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S3, 0, SLJIT_IMM, -1);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S4, 0, SLJIT_IMM, -1);
+	/* buf[2] */
 	sljit_emit_op2(compiler, SLJIT_MUL, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_uw), SLJIT_MEM1(SLJIT_SP), 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_uw));
+	/* buf[3] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_uw), SLJIT_MEM1(SLJIT_SP), sizeof(sljit_uw), SLJIT_MEM1(SLJIT_SP), 0);
 	sljit_get_local_base(compiler, SLJIT_R0, 0, -offset_value);
 	sljit_get_local_base(compiler, SLJIT_MEM1(SLJIT_S0), 0, -0x1234);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_S0), 0);
+	/* buf[4] */
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_uw), SLJIT_MEM1(SLJIT_R0), offset_value, SLJIT_MEM1(SLJIT_R1), 0x1234 + sizeof(sljit_sw));
+	/* buf[5] */
 	sljit_emit_return(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_uw));
 	/* Dummy last instructions. */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, 23);
@@ -1706,6 +1886,7 @@ static void test21(void)
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, P), 3, 2, 0, 0, 2 * sizeof(sljit_sw));
 
+	/* Return value */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_IMM, 10);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw), SLJIT_MEM1(SLJIT_S0), 0, SLJIT_MEM1(SLJIT_SP), 0);
 
@@ -1727,7 +1908,9 @@ static void test21(void)
 	sljit_set_context(compiler, 0, 1, 3, 2, 0, 0, 2 * sizeof(sljit_sw));
 
 	sljit_emit_op0(compiler, SLJIT_ENDBR);
+	/* buf[2] */
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_MEM1(SLJIT_SP), 0);
+	/* buf[3] */
 	sljit_emit_op2(compiler, SLJIT_MUL, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_MEM1(SLJIT_SP), 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw));
 
@@ -1782,39 +1965,55 @@ static void test22(void)
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS3(VOID, P, P, P), 3, 3, 0, 0, 0);
 
+	/* sbuf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S16, SLJIT_MEM1(SLJIT_S1), 0, SLJIT_IMM, -13);
+	/* sbuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV_U16, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_s16), SLJIT_IMM, 0x1234);
 	sljit_emit_op1(compiler, SLJIT_MOV_S16, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S1), 2 * sizeof(sljit_s16));
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_S1, 0, SLJIT_S1, 0, SLJIT_IMM, 2 * sizeof(sljit_s16));
+	/* sbuf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV_U16, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_s16), SLJIT_MEM1(SLJIT_S1), -(sljit_sw)sizeof(sljit_s16));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0xff0000 + 8000);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 2);
+	/* sbuf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S16, SLJIT_MEM2(SLJIT_S1, SLJIT_R1), 1, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R2, 0, SLJIT_S1, 0, SLJIT_IMM, 0x1234 - 3 * sizeof(sljit_s16));
+	/* sbuf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S16, SLJIT_MEM1(SLJIT_R2), 0x1234, SLJIT_IMM, -9317);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R2, 0, SLJIT_S1, 0, SLJIT_IMM, 0x1234 + 4 * sizeof(sljit_s16));
+	/* sbuf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S16, SLJIT_MEM1(SLJIT_R2), -0x1234, SLJIT_IMM, -9317);
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R2, 0, SLJIT_S1, 0, SLJIT_IMM, 0x12348 - 5 * sizeof(sljit_s16));
+	/* sbuf[7] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S16, SLJIT_MEM1(SLJIT_R2), 0x12348, SLJIT_IMM, -8888);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R2, 0, SLJIT_S1, 0, SLJIT_IMM, 0x12348 + 6 * sizeof(sljit_s16));
+	/* sbuf[8] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S16, SLJIT_MEM1(SLJIT_R2), -0x12348, SLJIT_IMM, -8888);
 
+	/* bbuf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S8, SLJIT_MEM1(SLJIT_S2), 0, SLJIT_IMM, -45);
+	/* bbuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM1(SLJIT_S2), sizeof(sljit_s8), SLJIT_IMM, 0x12);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 4 * sizeof(sljit_s8));
 	sljit_emit_op1(compiler, SLJIT_MOV_S8, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_S2), 2 * sizeof(sljit_s8));
 	sljit_emit_op1(compiler, SLJIT_MOV_S8, SLJIT_S1, 0, SLJIT_R1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_S1, 0, SLJIT_S1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV_S8, SLJIT_R2, 0, SLJIT_S1, 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R2, 0);
+	/* bbuf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM1(SLJIT_S2), 3 * sizeof(sljit_s8), SLJIT_S1, 0);
+	/* bbuf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM2(SLJIT_S2, SLJIT_R0), 0, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 1);
 	sljit_emit_op1(compiler, SLJIT_MOV_U16, SLJIT_R0, 0, SLJIT_IMM, 0);
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 1);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0);
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_R0, 0);
 
 	sljit_emit_return_void(compiler);
@@ -1886,40 +2085,54 @@ static void test23(void)
 	ibuf[4] = 658923;
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(W, P, P), 3, 3, 0, 0, 0);
+	/* ibuf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_MEM1(SLJIT_S1), 0, SLJIT_IMM, 34567);
+	/* ibuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 4);
 	sljit_emit_op1(compiler, SLJIT_MOV_S32, SLJIT_MEM2(SLJIT_S1, SLJIT_R0), 0, SLJIT_IMM, -7654);
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, garbage);
 	sljit_emit_op1(compiler, SLJIT_MOV_S32, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S1), 2 * sizeof(sljit_s32));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, garbage);
 	sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_s32));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R0, 0);
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, garbage);
 	sljit_emit_op1(compiler, SLJIT_MOV_S32, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S1), 4 * sizeof(sljit_s32));
 	sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_R0, 0, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_R0, 0);
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0x0f00f00);
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R1, 0, SLJIT_S0, 0, SLJIT_IMM, 0x7777);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 0x7777 + 3 * sizeof(sljit_sw), SLJIT_R0, 0);
+	/* buf[4] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_S0, 0, SLJIT_IMM, 0x7777);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), -0x7777 + 4 * (sljit_sw)sizeof(sljit_sw), SLJIT_R0, 0);
+	/* buf[5] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_S0, 0, SLJIT_IMM, 5 * sizeof(sljit_sw));
 	sljit_emit_op2(compiler, SLJIT_LSHR, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM2(SLJIT_R1, SLJIT_R1), 0, SLJIT_IMM, 16);
+	/* buf[5], buf[7] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0);
 	sljit_emit_op2(compiler, SLJIT_OR, SLJIT_MEM2(SLJIT_R0, SLJIT_R1), 1, SLJIT_IMM, 64, SLJIT_MEM2(SLJIT_R0, SLJIT_R1), 1);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM0(), (sljit_sw)&buf[7], SLJIT_IMM, 0x123456);
+	/* buf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), (sljit_sw)&buf[6], SLJIT_MEM0(), (sljit_sw)&buf[7]);
+	/* buf[7] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_S0, 0, SLJIT_IMM, 5 * sizeof(sljit_sw));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 2 * sizeof(sljit_sw), SLJIT_R1, 0);
+	/* buf[8] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R2, 0, SLJIT_S0, 0, SLJIT_IMM, 7 * sizeof(sljit_sw));
 	sljit_emit_op2(compiler, SLJIT_LSHR, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, 1);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_MEM2(SLJIT_R2, SLJIT_R2), 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&buf[8] - 0x12340);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 0x12340, SLJIT_R2, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_R0), 0x12340, SLJIT_MEM1(SLJIT_R2), 3 * sizeof(sljit_sw), SLJIT_IMM, 6);
+	/* ibuf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S32, SLJIT_MEM1(SLJIT_S1), 4 * sizeof(sljit_s32), SLJIT_IMM, 0x12345678);
+
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 0x2bd700 | 243);
 	sljit_emit_return(compiler, SLJIT_MOV_S8, SLJIT_R1, 0);
 
@@ -1998,47 +2211,62 @@ static void test24(void)
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS3(VOID, P, P, P), 3, 3, 0, 0, 0);
 
 	/* Nothing should be updated. */
+	/* sbuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S16, SLJIT_MEM0(), (sljit_sw)&sbuf[1], SLJIT_MEM0(), (sljit_sw)&sbuf[0]);
+	/* bbuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S8, SLJIT_MEM0(), (sljit_sw)&bbuf[1], SLJIT_MEM0(), (sljit_sw)&bbuf[0]);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 2);
+	/* sbuf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV_U16, SLJIT_MEM2(SLJIT_S1, SLJIT_R0), 1, SLJIT_MEM0(), (sljit_sw)&sbuf[3]);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&buf[0]);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, sizeof(sljit_sw));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 2);
+	/* buf[2] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM2(SLJIT_R0, SLJIT_R2), SLJIT_WORD_SHIFT, SLJIT_MEM0(), (sljit_sw)&buf[0], SLJIT_MEM2(SLJIT_R1, SLJIT_R0), 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, sizeof(sljit_s8));
+	/* bbuf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM1(SLJIT_R0), (sljit_sw)&bbuf[1], SLJIT_MEM1(SLJIT_R0), (sljit_sw)&bbuf[2]);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, sizeof(sljit_s16));
+	/* sbuf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S16, SLJIT_MEM1(SLJIT_R1), (sljit_sw)&sbuf[3], SLJIT_R1, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 3);
+	/* buf[3] */
 	sljit_emit_op2(compiler, SLJIT_MUL, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), SLJIT_WORD_SHIFT, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), SLJIT_WORD_SHIFT, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), SLJIT_WORD_SHIFT);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 4);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_S0, 0);
+	/* buf[4] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), SLJIT_WORD_SHIFT, SLJIT_MEM2(SLJIT_R1, SLJIT_R0), SLJIT_WORD_SHIFT, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), SLJIT_WORD_SHIFT);
 
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_S0, 0, SLJIT_IMM, 9 * sizeof(sljit_sw));
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_S0, 0, SLJIT_IMM, 4 * sizeof(sljit_sw));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, -(4 << SLJIT_WORD_SHIFT));
+	/* buf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM2(SLJIT_R0, SLJIT_R2), 0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&buf - 0x7fff8000 + 6 * (sljit_sw)sizeof(sljit_sw));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 952467);
+	/* buf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 0x7fff8000, SLJIT_R1, 0);
+	/* buf[7] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 0x7fff8000 + sizeof(sljit_sw), SLJIT_MEM1(SLJIT_R0), 0x7fff8000);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&buf + 0x7fff7fff + 6 * (sljit_sw)sizeof(sljit_sw));
+	/* buf[8] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_R0), -0x7fff7fff + 2 * (sljit_sw)sizeof(sljit_sw), SLJIT_MEM1(SLJIT_R0), -0x7fff7fff + (sljit_sw)sizeof(sljit_sw), SLJIT_MEM1(SLJIT_R0), -0x7fff7fff);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&bbuf - 0x7fff7ffe + 3 * (sljit_sw)sizeof(sljit_s8));
+	/* bbuf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S8, SLJIT_MEM1(SLJIT_R0), 0x7fff7fff, SLJIT_MEM1(SLJIT_R0), 0x7fff7ffe);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&bbuf + 0x7fff7fff + 5 * (sljit_sw)sizeof(sljit_s8));
+	/* bbuf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S8, SLJIT_MEM1(SLJIT_R0), -0x7fff7fff, SLJIT_MEM1(SLJIT_R0), -0x7fff8000);
 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&bbuf - SLJIT_W(0x123456123456));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)&bbuf - SLJIT_W(0x123456123456));
+	/* bbuf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S8, SLJIT_MEM1(SLJIT_R0), SLJIT_W(0x123456123456) + 6 * sizeof(sljit_s8), SLJIT_MEM1(SLJIT_R1), SLJIT_W(0x123456123456));
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
 	sljit_emit_return_void(compiler);
 
@@ -2138,7 +2366,7 @@ static void test25(void)
 	FAILED(buf[13] != SLJIT_W(0x07fff00ffff00000), "test25 case 14 failed\n");
 
 	sljit_free_code(code.code, NULL);
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 	successful_tests++;
 }
 
@@ -2176,22 +2404,29 @@ static void test26(void)
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_S0, 0, SLJIT_S0, 0, SLJIT_IMM, 3);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_S1, 0, SLJIT_S1, 0, SLJIT_IMM, 1);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S0), -3);
+	/* ibuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S32, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_s32) - 1, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV_S32, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S1), -1);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) - 3, SLJIT_R0, 0);
 
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_S0, 0, SLJIT_IMM, 100);
+	/* buf[2] */
 	sljit_emit_op2(compiler, SLJIT_MUL, SLJIT_MEM1(SLJIT_R0), (sljit_sw)sizeof(sljit_sw) * 2 - 103, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2 - 3, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 3 - 3);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_S1, 0, SLJIT_IMM, 100);
+	/* ibuf[2] */
 	sljit_emit_op2(compiler, SLJIT_MUL32, SLJIT_MEM1(SLJIT_R0), (sljit_sw)sizeof(sljit_s32) * 2 - 101, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_s32) * 2 - 1, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_s32) * 3 - 1);
 
 	if (sljit_has_cpu_feature(SLJIT_HAS_FPU)) {
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_S2, 0, SLJIT_S2, 0, SLJIT_IMM, 3);
+		/* dbuf[1] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S2), sizeof(sljit_f64) - 3, SLJIT_MEM1(SLJIT_S2), -3);
+		/* dbuf[2] */
 		sljit_emit_fop2(compiler, SLJIT_ADD_F64, SLJIT_MEM1(SLJIT_S2), sizeof(sljit_f64) * 2 - 3, SLJIT_MEM1(SLJIT_S2), -3, SLJIT_MEM1(SLJIT_S2), sizeof(sljit_f64) - 3);
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_S2, 0, SLJIT_IMM, 2);
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sizeof(sljit_f64) * 3 - 4) >> 1);
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R2, 0, SLJIT_S2, 0, SLJIT_IMM, 1);
+		/* dbuf[3] */
 		sljit_emit_fop2(compiler, SLJIT_DIV_F64, SLJIT_MEM1(SLJIT_R0), sizeof(sljit_f64) * 3 - 5, SLJIT_MEM1(SLJIT_S2), sizeof(sljit_f64) * 2 - 3, SLJIT_MEM2(SLJIT_R2, SLJIT_R1), 1);
 	}
 
@@ -2254,6 +2489,7 @@ static void test27(void)
 	/* 3 arguments passed, 3 arguments used. */
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, P), 4, 3, 0, 0, 0);
 
+	/* buf[0] */
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_S0, 0, SLJIT_S0, 0, SLJIT_IMM, 1);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0x1001);
@@ -2264,6 +2500,7 @@ static void test27(void)
 	sljit_emit_op0(compiler, SLJIT_ENDBR); /* ENDBR should keep the flags. */
 	sljit_emit_op0(compiler, SLJIT_NOP); /* Nop should keep the flags. */
 	SET_NEXT_BYTE(SLJIT_GREATER);
+	/* buf[2] */
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_LESS, SLJIT_R0, 0, SLJIT_R1, 0);
 	SET_NEXT_BYTE(SLJIT_LESS);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_R0, 0);
@@ -2272,6 +2509,7 @@ static void test27(void)
 	sljit_emit_op0(compiler, SLJIT_ENDBR); /* ENDBR should keep the flags. */
 	sljit_emit_op0(compiler, SLJIT_NOP); /* Nop should keep the flags. */
 	SET_NEXT_BYTE(SLJIT_GREATER);
+	/* buf[4] */
 	sljit_emit_op2u(compiler, SLJIT_SUB32 | SLJIT_SET_LESS, SLJIT_R0, 0, SLJIT_R1, 0);
 	SET_NEXT_BYTE(SLJIT_LESS);
 
@@ -2281,37 +2519,47 @@ static void test27(void)
 	/* 0x100000010 on 64 bit machines, 0x10 on 32 bit machines. */
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_R0, 0, SLJIT_IMM, 0x80);
 	SET_NEXT_BYTE(SLJIT_GREATER);
+	/* buf[6] */
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_LESS, SLJIT_R0, 0, SLJIT_IMM, 0x80);
 	SET_NEXT_BYTE(SLJIT_LESS);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_R0, 0);
 	sljit_emit_op2u(compiler, SLJIT_SUB32 | SLJIT_SET_GREATER, SLJIT_R0, 0, SLJIT_IMM, 0x80);
 	SET_NEXT_BYTE(SLJIT_GREATER);
+	/* buf[7] */
 	sljit_emit_op2u(compiler, SLJIT_SUB32 | SLJIT_SET_LESS, SLJIT_R0, 0, SLJIT_IMM, 0x80);
 	SET_NEXT_BYTE(SLJIT_LESS);
 
+	/* buf[8] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0);
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 1);
 	/* 0xff..ff on all machines. */
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, 1);
 	SET_NEXT_BYTE(SLJIT_LESS_EQUAL);
+	/* buf[9] */
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, 1);
 	SET_NEXT_BYTE(SLJIT_GREATER_EQUAL);
+	/* buf[10] */
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_SIG_GREATER, SLJIT_R2, 0, SLJIT_R1, 0, SLJIT_IMM, -1);
 	SET_NEXT_BYTE(SLJIT_SIG_GREATER);
+	/* buf[12] */
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_SIG_LESS, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, -1);
 	SET_NEXT_BYTE(SLJIT_SIG_LESS);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_R2, 0, SLJIT_R1, 0, SLJIT_R0, 0);
 	SET_NEXT_BYTE(SLJIT_EQUAL);
+	/* buf[14] */
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_R1, 0, SLJIT_R0, 0);
 	SET_NEXT_BYTE(SLJIT_NOT_EQUAL);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_OVERFLOW, SLJIT_R0, 0, SLJIT_R1, 0, SLJIT_IMM, -2);
 	SET_NEXT_BYTE(SLJIT_OVERFLOW);
+	/* buf[16] */
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_OVERFLOW, SLJIT_R0, 0, SLJIT_R1, 0, SLJIT_IMM, -2);
 	SET_NEXT_BYTE(SLJIT_NOT_OVERFLOW);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_R0, 0, SLJIT_R1, 0, SLJIT_IMM, -2);
 	SET_NEXT_BYTE(SLJIT_GREATER_EQUAL);
+	/* buf[17] */
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_R0, 0, SLJIT_R1, 0, SLJIT_IMM, -2);
 	SET_NEXT_BYTE(SLJIT_LESS_EQUAL);
+	/* buf[20] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)((sljit_uw)1 << ((8 * sizeof(sljit_uw)) - 1)));
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_SIG_LESS, SLJIT_R0, 0, SLJIT_R1, 0, SLJIT_IMM, 1);
 	SET_NEXT_BYTE(SLJIT_SIG_LESS);
@@ -2320,9 +2568,11 @@ static void test27(void)
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R0, 0, SLJIT_R1, 0, SLJIT_IMM, 1);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_SIG_GREATER_EQUAL, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, -2);
 	SET_NEXT_BYTE(SLJIT_SIG_GREATER_EQUAL);
+	/* buf[21] */
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_SIG_GREATER, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 2);
 	SET_NEXT_BYTE(SLJIT_SIG_GREATER);
 
+	/* buf[22] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)0x80000000);
 	sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 16);
 	sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 16);
@@ -2330,15 +2580,18 @@ static void test27(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)0xffffffff);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_OVERFLOW, SLJIT_R0, 0, SLJIT_R1, 0);
 	SET_NEXT_BYTE(SLJIT_OVERFLOW);
+	/* buf[24] */
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_OVERFLOW, SLJIT_R0, 0, SLJIT_R1, 0);
 	SET_NEXT_BYTE(SLJIT_NOT_OVERFLOW);
 	sljit_emit_op1(compiler, SLJIT_MOV_S32, SLJIT_R0, 0, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_R1, 0, SLJIT_R1, 0);
 	sljit_emit_op2u(compiler, SLJIT_SUB32 | SLJIT_SET_OVERFLOW, SLJIT_R0, 0, SLJIT_R1, 0);
 	SET_NEXT_BYTE(SLJIT_OVERFLOW);
+	/* buf[25] */
 	sljit_emit_op2u(compiler, SLJIT_SUB32 | SLJIT_SET_OVERFLOW, SLJIT_R0, 0, SLJIT_R1, 0);
 	SET_NEXT_BYTE(SLJIT_NOT_OVERFLOW);
 
+	/* buf[26] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_CARRY, SLJIT_R0, 0, SLJIT_IMM, 1);
 	sljit_emit_op2u(compiler, SLJIT_SUBC | SLJIT_SET_CARRY, SLJIT_R0, 0, SLJIT_IMM, 0);
@@ -2346,6 +2599,7 @@ static void test27(void)
 	sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM1(SLJIT_S0), 1, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_S0, 0, SLJIT_S0, 0, SLJIT_IMM, 1);
 
+	/* buf[27] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -1);
 	sljit_emit_op2u(compiler, SLJIT_ADD | SLJIT_SET_CARRY, SLJIT_R0, 0, SLJIT_IMM, 1);
 	sljit_emit_op2u(compiler, SLJIT_ADDC | SLJIT_SET_CARRY, SLJIT_R0, 0, SLJIT_IMM, 1);
@@ -2353,10 +2607,12 @@ static void test27(void)
 	sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM1(SLJIT_S0), 1, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_S0, 0, SLJIT_S0, 0, SLJIT_IMM, 1);
 
+	/* buf[28] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 1);
 	sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, (8 * sizeof(sljit_sw)) - 1);
 	sljit_emit_op2u(compiler, SLJIT_ADD | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_IMM, 0);
 	SET_NEXT_BYTE(SLJIT_EQUAL);
+	/* buf[34] */
 	sljit_emit_op2u(compiler, SLJIT_ADD | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_R0, 0);
 	SET_NEXT_BYTE(SLJIT_EQUAL);
 
@@ -2380,16 +2636,20 @@ static void test27(void)
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_CARRY, SLJIT_R0, 0, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_SUBC | SLJIT_SET_CARRY, SLJIT_R2, 0, SLJIT_IMM, 1, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM1(SLJIT_S0), 1, SLJIT_R2, 0);
+	/* buf[35] */
 	sljit_emit_op2u(compiler, SLJIT_SUBC | SLJIT_SET_CARRY, SLJIT_R0, 0, SLJIT_R1, 0);
 	sljit_emit_op2(compiler, SLJIT_SUBC, SLJIT_R2, 0, SLJIT_IMM, 1, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM1(SLJIT_S0), 2, SLJIT_R2, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_S0, 0, SLJIT_S0, 0, SLJIT_IMM, 2);
 
+	/* buf[36] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -34);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_LESS, SLJIT_R0, 0, SLJIT_IMM, 0x1234);
 	SET_NEXT_BYTE(SLJIT_LESS);
+	/* buf[37] */
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_SIG_LESS, SLJIT_R0, 0, SLJIT_IMM, 0x1234);
 	SET_NEXT_BYTE(SLJIT_SIG_LESS);
+	/* buf[38] */
 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, SLJIT_W(0x12300000000) - 43);
 #else
@@ -2398,6 +2658,7 @@ static void test27(void)
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, -96);
 	sljit_emit_op2u(compiler, SLJIT_SUB32 | SLJIT_SET_LESS, SLJIT_R0, 0, SLJIT_R1, 0);
 	SET_NEXT_BYTE(SLJIT_LESS);
+	/* buf[39] */
 	sljit_emit_op2u(compiler, SLJIT_SUB32 | SLJIT_SET_SIG_GREATER, SLJIT_R0, 0, SLJIT_R1, 0);
 	SET_NEXT_BYTE(SLJIT_SIG_GREATER);
 
@@ -2490,12 +2751,15 @@ static void test28(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R3, 0, SLJIT_IMM, -234);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R4, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw));
 	sljit_emit_op2(compiler, SLJIT_MUL, SLJIT_S3, 0, SLJIT_R3, 0, SLJIT_R4, 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_S3, 0);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_S3, 0, SLJIT_IMM, 0);
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_S3, 0, SLJIT_NOT_ZERO);
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_S3, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S4, 0, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw));
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_S4, 0, SLJIT_S4, 0, SLJIT_R4, 0);
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_S4, 0);
 
 	const1 = sljit_emit_const(compiler, SLJIT_S3, 0, 0);
@@ -2503,6 +2767,7 @@ static void test28(void)
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_S3, 0, SLJIT_S3, 0, SLJIT_IMM, 100);
 	label = sljit_emit_label(compiler);
 	sljit_emit_op0(compiler, SLJIT_ENDBR);
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_S3, 0);
 
 	sljit_emit_return(compiler, SLJIT_MOV, SLJIT_R4, 0);
@@ -2569,7 +2834,7 @@ static void test29(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 10 * sizeof(sljit_uw), SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_R4, 0, SLJIT_IMM, SLJIT_W(0xcef97a70b5));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 11 * sizeof(sljit_uw), SLJIT_R4, 0);
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, -187);
 	sljit_emit_op1(compiler, SLJIT_MOV_S8, SLJIT_R0, 0, SLJIT_R1, 0);
@@ -2610,7 +2875,7 @@ static void test29(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R3, 0, SLJIT_IMM, SLJIT_W(0xcef97a70b5));
 	sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_R4, 0, SLJIT_R3, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 23 * sizeof(sljit_uw), SLJIT_R4, 0);
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S2, 0, SLJIT_IMM, 0x9faa5);
 	sljit_emit_op1(compiler, SLJIT_MOV_S8, SLJIT_S2, 0, SLJIT_S2, 0);
@@ -2637,7 +2902,7 @@ static void test29(void)
 	FAILED(buf[9] != SLJIT_W(-1938520854), "test29 case 10 failed\n");
 	FAILED(buf[10] != SLJIT_W(3236202668), "test29 case 11 failed\n");
 	FAILED(buf[11] != SLJIT_W(0xf97a70b5), "test29 case 12 failed\n");
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
 	FAILED(buf[12] != 69, "test29 case 13 failed\n");
 	FAILED(buf[13] != -93, "test29 case 14 failed\n");
@@ -2653,7 +2918,7 @@ static void test29(void)
 	FAILED(buf[21] != SLJIT_W(-1938520854), "test29 case 22 failed\n");
 	FAILED(buf[22] != SLJIT_W(3236202668), "test29 case 23 failed\n");
 	FAILED(buf[23] != SLJIT_W(0xf97a70b5), "test29 case 24 failed\n");
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
 	FAILED(buf[24] != -91, "test29 case 25 failed\n");
 
@@ -2706,6 +2971,7 @@ static void test30(void)
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_S1, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_S2, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_S3, 0);
+	/* buf[0] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0, SLJIT_S4, 0);
 
 	sljit_emit_return_void(compiler);
@@ -2744,37 +3010,47 @@ static void test31(void)
 
 	FAILED(!compiler, "cannot create compiler\n");
 
+	/* buf[0] */
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, P), 3, 5, 0, 0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 0);
 	sljit_emit_op2u(compiler, SLJIT_MUL | SLJIT_SET_OVERFLOW, SLJIT_R1, 0, SLJIT_IMM, -45);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_NOT_OVERFLOW);
+	/* buf[1] */
 	sljit_emit_op2u(compiler, SLJIT_MUL | SLJIT_SET_OVERFLOW, SLJIT_R1, 0, SLJIT_IMM, -45);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_OVERFLOW);
 
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S2, 0, SLJIT_IMM, big_word);
 	sljit_emit_op2(compiler, SLJIT_MUL | SLJIT_SET_OVERFLOW, SLJIT_R2, 0, SLJIT_S2, 0, SLJIT_IMM, -2);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 33); /* Should not change flags. */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 0); /* Should not change flags. */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_OVERFLOW);
+	/* buf[3] */
 	sljit_emit_op2(compiler, SLJIT_MUL | SLJIT_SET_OVERFLOW, SLJIT_R2, 0, SLJIT_S2, 0, SLJIT_IMM, -2);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_NOT_OVERFLOW);
 
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_S3, 0, SLJIT_IMM, 0x3f6b0);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_S4, 0, SLJIT_IMM, 0x2a783);
 	sljit_emit_op2(compiler, SLJIT_MUL32 | SLJIT_SET_OVERFLOW, SLJIT_R1, 0, SLJIT_S3, 0, SLJIT_S4, 0);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_OVERFLOW);
+	/* buf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_R1, 0);
 
+	/* buf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, big_word2);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R2, 0, SLJIT_R1, 0);
 	sljit_emit_op2(compiler, SLJIT_MUL32 | SLJIT_SET_OVERFLOW, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 23);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), SLJIT_OVERFLOW);
 
+	/* buf[7] */
 	sljit_emit_op2u(compiler, SLJIT_MUL32 | SLJIT_SET_OVERFLOW, SLJIT_R2, 0, SLJIT_IMM, -23);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_sw), SLJIT_NOT_OVERFLOW);
+	/* buf[8] */
 	sljit_emit_op2u(compiler, SLJIT_MUL | SLJIT_SET_OVERFLOW, SLJIT_R2, 0, SLJIT_IMM, -23);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw), SLJIT_NOT_OVERFLOW);
 
+	/* buf[9] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 67);
 	sljit_emit_op2(compiler, SLJIT_MUL | SLJIT_SET_OVERFLOW, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, -23);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 9 * sizeof(sljit_sw), SLJIT_R1, 0);
@@ -2811,7 +3087,7 @@ static void test32(void)
 {
 	/* Floating point set flags. */
 	executable_code code;
-	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	struct sljit_compiler* compiler;
 	sljit_s32 i;
 
 	sljit_sw buf[16];
@@ -2826,6 +3102,16 @@ static void test32(void)
 	if (verbose)
 		printf("Run test32\n");
 
+	if (!sljit_has_cpu_feature(SLJIT_HAS_FPU)) {
+		if (verbose)
+			printf("no fpu available, test32 skipped\n");
+		successful_tests++;
+		return;
+	}
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
 	for (i = 0; i < 16; i++)
 		buf[i] = 5;
 
@@ -2837,16 +3123,6 @@ static void test32(void)
 	dbuf[2].value = -13.0;
 	dbuf[3].value = 27.0;
 
-	if (!sljit_has_cpu_feature(SLJIT_HAS_FPU)) {
-		if (verbose)
-			printf("no fpu available, test32 skipped\n");
-		successful_tests++;
-		if (compiler)
-			sljit_free_compiler(compiler);
-		return;
-	}
-
-	FAILED(!compiler, "cannot create compiler\n");
 	SLJIT_ASSERT(sizeof(sljit_f64) == 8 && sizeof(sljit_s32) == 4 && sizeof(dbuf[0]) == 8);
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(VOID, P, P), 1, 2, 4, 0, 0);
@@ -2854,39 +3130,53 @@ static void test32(void)
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, SLJIT_MEM1(SLJIT_S1), 0);
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_UNORDERED, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_f64), SLJIT_FR0, 0);
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_S1), 2 * sizeof(sljit_f64));
+	/* buf[0] */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_UNORDERED);
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_ORDERED, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_f64), SLJIT_FR0, 0);
+	/* buf[1] */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_ORDERED);
 
 	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR2, 0, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_f64));
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_UNORDERED, SLJIT_FR1, 0, SLJIT_FR2, 0);
+	/* buf[2] */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_UNORDERED);
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_ORDERED, SLJIT_FR1, 0, SLJIT_FR2, 0);
+	/* buf[3] */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_ORDERED);
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_F_LESS, SLJIT_FR1, 0, SLJIT_FR2, 0);
+	/* buf[4] */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_F_LESS);
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_F_GREATER_EQUAL, SLJIT_FR1, 0, SLJIT_FR2, 0);
+	/* buf[5] */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_F_GREATER_EQUAL);
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_F_GREATER, SLJIT_FR1, 0, SLJIT_FR2, 0);
+	/* buf[6] */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), SLJIT_F_GREATER);
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_F_LESS_EQUAL, SLJIT_FR1, 0, SLJIT_FR2, 0);
+	/* buf[7] */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_sw), SLJIT_F_LESS_EQUAL);
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_F_EQUAL, SLJIT_FR1, 0, SLJIT_FR2, 0);
+	/* buf[8] */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw), SLJIT_F_EQUAL);
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_F_NOT_EQUAL, SLJIT_FR1, 0, SLJIT_FR2, 0);
+	/* buf[9] */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 9 * sizeof(sljit_sw), SLJIT_F_NOT_EQUAL);
 
 	sljit_emit_fop2(compiler, SLJIT_ADD_F64, SLJIT_FR3, 0, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_f64));
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_UNORDERED, SLJIT_FR2, 0, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_f64));
+	/* buf[10] */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 10 * sizeof(sljit_sw), SLJIT_UNORDERED);
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_F_EQUAL, SLJIT_FR2, 0, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_f64));
+	/* buf[11] */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 11 * sizeof(sljit_sw), SLJIT_F_EQUAL);
 
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_ORDERED, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_f64), SLJIT_FR0, 0);
+	/* buf[12] */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 12 * sizeof(sljit_sw), SLJIT_ORDERED);
 
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | SLJIT_SET_UNORDERED, SLJIT_FR3, 0, SLJIT_FR2, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S1), 0);
+	/* buf[13] */
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 13 * sizeof(sljit_sw), SLJIT_UNORDERED);
 
 	sljit_emit_return_void(compiler);
@@ -2942,22 +3232,27 @@ static void test33(void)
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, P), 3, 3, 0, 0, 0);
 
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 20);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 10);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_LESS, SLJIT_R2, 0, SLJIT_R0, 0, SLJIT_R1, 0);
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_ZERO);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, -10);
 	jump = sljit_emit_jump(compiler, SLJIT_LESS);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_IMM, 11);
 	sljit_set_label(jump, sljit_emit_label(compiler));
 
+	/* buf[2] */
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_SIG_GREATER, SLJIT_R2, 0, SLJIT_R0, 0, SLJIT_R1, 0);
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_SIG_GREATER);
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_IMM, 45);
 	jump = sljit_emit_jump(compiler, SLJIT_NOT_EQUAL);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_IMM, 55);
 	sljit_set_label(jump, sljit_emit_label(compiler));
 
+	/* buf[4-5] */
 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)SLJIT_W(0x8000000000000000));
 #else
@@ -2967,20 +3262,25 @@ static void test33(void)
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_OVERFLOW, SLJIT_R2, 0, SLJIT_R0, 0, SLJIT_R1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_IMM, 33);
 	jump = sljit_emit_jump(compiler, SLJIT_NOT_OVERFLOW);
+	/* buf[5] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_ZERO);
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_IMM, 13);
 	sljit_set_label(jump, sljit_emit_label(compiler));
 
+	/* buf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)0x80000000);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_SUB32 | SLJIT_SET_Z | SLJIT_SET_OVERFLOW, SLJIT_R2, 0, SLJIT_R0, 0, SLJIT_R1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, 0);
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), SLJIT_NOT_ZERO);
+	/* buf[7] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_sw), SLJIT_IMM, 78);
 	jump = sljit_emit_jump(compiler, SLJIT_OVERFLOW);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_sw), SLJIT_IMM, 48);
 	sljit_set_label(jump, sljit_emit_label(compiler));
 
+	/* buf[8] */
 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)SLJIT_W(0x8000000000000000));
 #else
@@ -2991,6 +3291,7 @@ static void test33(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw), SLJIT_IMM, 30);
 	jump = sljit_emit_jump(compiler, SLJIT_NOT_OVERFLOW);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw), SLJIT_IMM, 50);
+	/* buf[9] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 9 * sizeof(sljit_sw), SLJIT_ZERO);
 	sljit_set_label(jump, sljit_emit_label(compiler));
 
@@ -3044,7 +3345,7 @@ static void test34(void)
 	sljit_set_context(compiler, 0, 1, 5, 5, 0, 0, 2 * sizeof(sljit_p));
 
 	sljit_emit_op0(compiler, SLJIT_ENDBR);
-	sljit_emit_fast_enter(compiler, SLJIT_R1, 0);
+	sljit_emit_op_dst(compiler, SLJIT_FAST_ENTER, SLJIT_R1, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 4);
 	sljit_emit_op_src(compiler, SLJIT_FAST_RETURN, SLJIT_R1, 0);
 
@@ -3058,7 +3359,7 @@ static void test34(void)
 	sljit_set_context(compiler, 0, 1, 5, 5, 0, 0, 2 * sizeof(sljit_p));
 
 	sljit_emit_op0(compiler, SLJIT_ENDBR);
-	sljit_emit_fast_enter(compiler, SLJIT_R4, 0);
+	sljit_emit_op_dst(compiler, SLJIT_FAST_ENTER, SLJIT_R4, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 6);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, SLJIT_FUNC_ADDR(codeA.code));
 	sljit_emit_ijump(compiler, SLJIT_FAST_CALL, SLJIT_R1, 0);
@@ -3074,7 +3375,7 @@ static void test34(void)
 	sljit_set_context(compiler, 0, 1, 5, 5, 0, 0, 2 * sizeof(sljit_p));
 
 	sljit_emit_op0(compiler, SLJIT_ENDBR);
-	sljit_emit_fast_enter(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_p));
+	sljit_emit_op_dst(compiler, SLJIT_FAST_ENTER, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_p));
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 8);
 	jump = sljit_emit_jump(compiler, SLJIT_FAST_CALL | SLJIT_REWRITABLE_JUMP);
 	sljit_set_target(jump, SLJIT_FUNC_UADDR(codeB.code));
@@ -3090,7 +3391,7 @@ static void test34(void)
 	sljit_set_context(compiler, 0, 1, 5, 5, 0, 0, 2 * sizeof(sljit_p));
 
 	sljit_emit_op0(compiler, SLJIT_ENDBR);
-	sljit_emit_fast_enter(compiler, SLJIT_MEM1(SLJIT_SP), 0);
+	sljit_emit_op_dst(compiler, SLJIT_FAST_ENTER, SLJIT_MEM1(SLJIT_SP), 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 10);
 	sljit_emit_ijump(compiler, SLJIT_FAST_CALL, SLJIT_IMM, SLJIT_FUNC_ADDR(codeC.code));
 	sljit_emit_op_src(compiler, SLJIT_FAST_RETURN, SLJIT_MEM1(SLJIT_SP), 0);
@@ -3104,7 +3405,7 @@ static void test34(void)
 	FAILED(!compiler, "cannot create compiler\n");
 	sljit_set_context(compiler, 0, 1, 5, 5, 0, 0, 2 * sizeof(sljit_p));
 
-	sljit_emit_fast_enter(compiler, SLJIT_MEM1(SLJIT_S0), 0);
+	sljit_emit_op_dst(compiler, SLJIT_FAST_ENTER, SLJIT_MEM1(SLJIT_S0), 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 12);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_p), SLJIT_IMM, SLJIT_FUNC_ADDR(codeD.code));
 	sljit_emit_ijump(compiler, SLJIT_FAST_CALL, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_p));
@@ -3165,7 +3466,7 @@ static void test35(void)
 	FAILED(!compiler, "cannot create compiler\n");
 	sljit_set_context(compiler, 0, 0, 2, 2, 0, 0, 0);
 
-	sljit_emit_fast_enter(compiler, SLJIT_MEM0(), (sljit_sw)&buf[0]);
+	sljit_emit_op_dst(compiler, SLJIT_FAST_ENTER, SLJIT_MEM0(), (sljit_sw)&buf[0]);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 5);
 
 	jump = sljit_emit_jump(compiler, SLJIT_FAST_CALL | SLJIT_REWRITABLE_JUMP);
@@ -3186,7 +3487,7 @@ static void test35(void)
 	sljit_set_context(compiler, 0, 0, 2, 2, 0, 0, 0);
 
 	sljit_emit_op0(compiler, SLJIT_ENDBR);
-	sljit_emit_fast_enter(compiler, SLJIT_R1, 0);
+	sljit_emit_op_dst(compiler, SLJIT_FAST_ENTER, SLJIT_R1, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 7);
 	sljit_emit_op_src(compiler, SLJIT_FAST_RETURN, SLJIT_R1, 0);
 
@@ -3269,16 +3570,24 @@ static void test36(void)
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 13);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 15);
+	/* buf[0], compare_buf[0-6] */
 	cmp_test(compiler, SLJIT_EQUAL, SLJIT_IMM, 9, SLJIT_R0, 0);
+	/* buf[1] */
 	cmp_test(compiler, SLJIT_EQUAL, SLJIT_R0, 0, SLJIT_R1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 3);
+	/* buf[2] */
 	cmp_test(compiler, SLJIT_EQUAL, SLJIT_MEM2(SLJIT_S1, SLJIT_R0), SLJIT_WORD_SHIFT, SLJIT_IMM, -13);
+	/* buf[3] */
 	cmp_test(compiler, SLJIT_NOT_EQUAL, SLJIT_IMM, 0, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0);
+	/* buf[4] */
 	cmp_test(compiler, SLJIT_NOT_EQUAL | SLJIT_REWRITABLE_JUMP, SLJIT_IMM, 0, SLJIT_R0, 0);
+	/* buf[5] */
 	cmp_test(compiler, SLJIT_EQUAL, SLJIT_MEM2(SLJIT_S1, SLJIT_R0), SLJIT_WORD_SHIFT, SLJIT_MEM2(SLJIT_S1, SLJIT_R0), SLJIT_WORD_SHIFT);
+	/* buf[6] */
 	cmp_test(compiler, SLJIT_EQUAL | SLJIT_REWRITABLE_JUMP, SLJIT_R0, 0, SLJIT_IMM, 0);
 
+	/* buf[7-16], compare_buf[7-16] */
 	cmp_test(compiler, SLJIT_SIG_LESS, SLJIT_MEM1(SLJIT_S1), 0, SLJIT_IMM, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -8);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 0);
@@ -3292,6 +3601,7 @@ static void test36(void)
 	cmp_test(compiler, SLJIT_SIG_LESS, SLJIT_IMM, 0, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_sw));
 	cmp_test(compiler, SLJIT_SIG_LESS | SLJIT_REWRITABLE_JUMP, SLJIT_IMM, 0, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_sw));
 
+	/* buf[17-28], compare_buf[17-28] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 8);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 0);
 	cmp_test(compiler, SLJIT_LESS, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_sw));
@@ -3307,6 +3617,7 @@ static void test36(void)
 	cmp_test(compiler, SLJIT_GREATER, SLJIT_R0, 0, SLJIT_R1, 0);
 	cmp_test(compiler, SLJIT_GREATER | SLJIT_REWRITABLE_JUMP, SLJIT_R0, 0, SLJIT_R1, 0);
 
+	/* buf[29-39], compare_buf[29-39] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -3);
 	cmp_test(compiler, SLJIT_SIG_LESS, SLJIT_R0, 0, SLJIT_R1, 0);
 	cmp_test(compiler, SLJIT_SIG_GREATER_EQUAL, SLJIT_R0, 0, SLJIT_R1, 0);
@@ -3324,20 +3635,22 @@ static void test36(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, SLJIT_W(0xf00000004));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_R1, 0);
+	/* buf[40-43] */
 	cmp_test(compiler, SLJIT_LESS | SLJIT_32, SLJIT_R1, 0, SLJIT_IMM, 5);
 	cmp_test(compiler, SLJIT_LESS, SLJIT_R0, 0, SLJIT_IMM, 5);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, SLJIT_W(0xff0000004));
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_R0, 0);
 	cmp_test(compiler, SLJIT_SIG_GREATER | SLJIT_32, SLJIT_R1, 0, SLJIT_IMM, 5);
 	cmp_test(compiler, SLJIT_SIG_GREATER, SLJIT_R0, 0, SLJIT_IMM, 5);
-#else
+#else /* !SLJIT_64BIT_ARCHITECTURE */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 4);
+	/* buf[40-43] */
 	cmp_test(compiler, SLJIT_LESS | SLJIT_32, SLJIT_R0, 0, SLJIT_IMM, 5);
 	cmp_test(compiler, SLJIT_GREATER | SLJIT_32, SLJIT_R0, 0, SLJIT_IMM, 5);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)0xf0000004);
 	cmp_test(compiler, SLJIT_SIG_GREATER | SLJIT_32, SLJIT_R0, 0, SLJIT_IMM, 5);
 	cmp_test(compiler, SLJIT_SIG_LESS | SLJIT_32, SLJIT_R0, 0, SLJIT_IMM, 5);
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
 	sljit_emit_return_void(compiler);
 
@@ -3360,11 +3673,9 @@ static void test36(void)
 
 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
 #define BITN(n) (SLJIT_W(1) << (63 - (n)))
-#define RESN(n) (n)
-#else
+#else /* !SLJIT_64BIT_ARCHITECTURE */
 #define BITN(n) (1 << (31 - ((n) & 0x1f)))
-#define RESN(n) ((n) & 0x1f)
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
 static void test37(void)
 {
@@ -3372,7 +3683,7 @@ static void test37(void)
 	executable_code code;
 	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
 	sljit_sw buf[9];
-	sljit_s32 ibuf[2];
+	sljit_s32 ibuf[3];
 	sljit_s32 i;
 
 	if (verbose)
@@ -3382,45 +3693,55 @@ static void test37(void)
 
 	for (i = 0; i < 9; i++)
 		buf[i] = -1;
+	for (i = 0; i < 3; i++)
+		ibuf[i] = -1;
 	buf[2] = 0;
 	buf[4] = BITN(13);
-	ibuf[0] = -1;
-	ibuf[1] = -1;
-	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(VOID, P, P), 1, 3, 0, 0, 0);
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(VOID, P, P), 2, 3, 0, 0, 0);
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, BITN(27));
 	sljit_emit_op1(compiler, SLJIT_CLZ, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S2, 0, SLJIT_IMM, BITN(47));
 	sljit_emit_op1(compiler, SLJIT_CLZ, SLJIT_R0, 0, SLJIT_S2, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R0, 0);
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_CLZ, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw));
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -1);
 	sljit_emit_op1(compiler, SLJIT_CLZ, SLJIT_R0, 0, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_R0, 0);
+	/* ibuf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 0);
 	sljit_emit_op1(compiler, SLJIT_CLZ32, SLJIT_MEM1(SLJIT_S1), 0, SLJIT_R0, 0);
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -1);
 	sljit_emit_op1(compiler, SLJIT_CLZ, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_R0, 0);
+	/* buf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, BITN(58));
 	sljit_emit_op1(compiler, SLJIT_CLZ, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_R0, 0);
+	/* buf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0);
 	sljit_emit_op1(compiler, SLJIT_CLZ, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), SLJIT_R0, 0);
-#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
-	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, SLJIT_W(0xff08a00000));
-#else
-	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 0x08a00000);
-#endif
+	/* ibuf[1] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, WCONST(0xff08a00000, 0x08a00000));
 	sljit_emit_op1(compiler, SLJIT_CLZ32, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_s32), SLJIT_R0, 0);
+	/* buf[7] */
 	sljit_emit_op1(compiler, SLJIT_CLZ32, SLJIT_R0, 0, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_sw), SLJIT_R0, 0);
-#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
-	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, SLJIT_W(0xffc8a00000));
-#else
-	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)0xc8a00000);
-#endif
+	/* buf[8] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, WCONST(0xffc8a00000, 0xc8a00000));
 	sljit_emit_op1(compiler, SLJIT_CLZ32, SLJIT_R0, 0, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw), SLJIT_R0, 0);
 
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -1);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, 0xa00a);
+	sljit_emit_op2(compiler, SLJIT_SHL32, SLJIT_R0, 0, SLJIT_R1, 0, SLJIT_IMM, 8);
+	/* ibuf[2] */
+	sljit_emit_op1(compiler, SLJIT_CLZ32, SLJIT_MEM1(SLJIT_S1), 2 * sizeof(sljit_s32), SLJIT_R0, 0);
+
 	sljit_emit_return_void(compiler);
 
 	code.code = sljit_generate_code(compiler);
@@ -3428,32 +3749,24 @@ static void test37(void)
 	sljit_free_compiler(compiler);
 
 	code.func2((sljit_sw)&buf, (sljit_sw)&ibuf);
-	FAILED(buf[0] != RESN(27), "test37 case 1 failed\n");
-	FAILED(buf[1] != RESN(47), "test37 case 2 failed\n");
-#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
-	FAILED(buf[2] != 64, "test37 case 3 failed\n");
-#else
-	FAILED(buf[2] != 32, "test37 case 3 failed\n");
-#endif
+	FAILED(buf[0] != 27, "test37 case 1 failed\n");
+	FAILED(buf[1] != WCONST(47, 15), "test37 case 2 failed\n");
+	FAILED(buf[2] != WCONST(64, 32), "test37 case 3 failed\n");
 	FAILED(buf[3] != 0, "test37 case 4 failed\n");
 	FAILED(ibuf[0] != 32, "test37 case 5 failed\n");
-	FAILED(buf[4] != RESN(13), "test37 case 6 failed\n");
-	FAILED(buf[5] != RESN(58), "test37 case 7 failed\n");
-#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
-	FAILED(buf[6] != 64, "test37 case 8 failed\n");
-#else
-	FAILED(buf[6] != 32, "test37 case 8 failed\n");
-#endif
+	FAILED(buf[4] != 13, "test37 case 6 failed\n");
+	FAILED(buf[5] != WCONST(58, 26), "test37 case 7 failed\n");
+	FAILED(buf[6] != WCONST(64, 32), "test37 case 8 failed\n");
 	FAILED(ibuf[1] != 4, "test37 case 9 failed\n");
-
 	FAILED((buf[7] & (sljit_sw)0xffffffff) != 4, "test37 case 10 failed\n");
 	FAILED((buf[8] & (sljit_sw)0xffffffff) != 0, "test37 case 11 failed\n");
+	FAILED(ibuf[2] != 8, "test37 case 12 failed\n");
 
 	sljit_free_code(code.code, NULL);
 	successful_tests++;
 }
+
 #undef BITN
-#undef RESN
 
 static void test38(void)
 {
@@ -3644,22 +3957,26 @@ static void test40(void)
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_SIG_LESS, SLJIT_IMM, -6, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 0x123456);
 	sljit_emit_op_flags(compiler, SLJIT_OR, SLJIT_R1, 0, SLJIT_SIG_LESS);
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R1, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -13);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_IMM, -13, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_IMM, 0);
 	sljit_emit_op_flags(compiler, SLJIT_OR | SLJIT_SET_Z, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_EQUAL);
+	/* buf[1] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_NOT_EQUAL);
 	sljit_emit_op2(compiler, SLJIT_OR, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_MEM1(SLJIT_SP), 0);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_IMM, -13, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 0);
 	sljit_emit_op_flags(compiler, SLJIT_OR | SLJIT_SET_Z, SLJIT_R1, 0, SLJIT_EQUAL);
+	/* buf[2] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 2, SLJIT_EQUAL);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -13);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 3);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_SIG_LESS, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_R1, 0);
+	/* buf[3] */
 	sljit_emit_op_flags(compiler, SLJIT_OR, SLJIT_MEM2(SLJIT_S0, SLJIT_R1), SLJIT_WORD_SHIFT, SLJIT_SIG_LESS);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -8);
@@ -3671,22 +3988,28 @@ static void test40(void)
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_R1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S3, 0, SLJIT_IMM, 0x88);
 	sljit_emit_op_flags(compiler, SLJIT_OR, SLJIT_S3, 0, SLJIT_NOT_EQUAL);
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 4, SLJIT_S1, 0);
+	/* buf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 5, SLJIT_S3, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0x84);
 	sljit_emit_op2u(compiler, SLJIT_AND | SLJIT_SET_Z, SLJIT_IMM, 0x180, SLJIT_R0, 0);
+	/* buf[6] */
 	sljit_emit_op_flags(compiler, SLJIT_OR | SLJIT_SET_Z, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 6, SLJIT_EQUAL);
+	/* buf[7] */
 	sljit_emit_op_flags(compiler, SLJIT_OR, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 7, SLJIT_EQUAL);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 1);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_IMM, 1);
 	sljit_emit_op_flags(compiler, SLJIT_OR | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_NOT_EQUAL);
+	/* buf[8] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 8, SLJIT_NOT_EQUAL);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0x123456);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_R0, 0, SLJIT_IMM, 1);
 	sljit_emit_op_flags(compiler, SLJIT_OR, SLJIT_R0, 0, SLJIT_GREATER);
+	/* buf[9] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw) * 9, SLJIT_R0, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_IMM, 0xbaddead);
@@ -3975,6 +4298,7 @@ static void test42(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S4, 0, SLJIT_IMM, 0x9a3b06d);
 
 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
+	/* buf[7-26] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, SLJIT_W(-0x5dc4f897b8cd67f5));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, SLJIT_W(0x3f8b5c026cb088df));
 	sljit_emit_op0(compiler, SLJIT_LMUL_UW);
@@ -4043,7 +4367,7 @@ static void test42(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 25 * sizeof(sljit_sw), SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 26 * sizeof(sljit_sw), SLJIT_R1, 0);
 
-#else
+#else /* !SLJIT_64BIT_ARCHITECTURE */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -0x58cd67f5);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 0x3cb088df);
 	sljit_emit_op0(compiler, SLJIT_LMUL_UW);
@@ -4107,8 +4431,9 @@ static void test42(void)
 	sljit_emit_op1(compiler, SLJIT_MOV_S32, SLJIT_R1, 0, SLJIT_R1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 25 * sizeof(sljit_sw), SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 26 * sizeof(sljit_sw), SLJIT_R1, 0);
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
+	/* buf[0-6] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R2, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R3, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_R4, 0);
@@ -4142,7 +4467,7 @@ static void test42(void)
 	FAILED(buf[12] != SLJIT_W(2532236178951865933), "test42 case 13 failed\n");
 	FAILED(buf[13] != SLJIT_W(-1), "test42 case 14 failed\n");
 	FAILED(buf[14] != SLJIT_W(-2177944059851366166), "test42 case 15 failed\n");
-#else
+#else /* !SLJIT_64BIT_ARCHITECTURE */
 	FAILED(buf[7] != -1587000939, "test42 case 8 failed\n");
 	FAILED(buf[8] != 665003983, "test42 case 9 failed\n");
 	FAILED(buf[9] != -1587000939, "test42 case 10 failed\n");
@@ -4151,7 +4476,7 @@ static void test42(void)
 	FAILED(buf[12] != 768706125, "test42 case 13 failed\n");
 	FAILED(buf[13] != -1, "test42 case 14 failed\n");
 	FAILED(buf[14] != -471654166, "test42 case 15 failed\n");
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
 	FAILED(buf[15] != 56, "test42 case 16 failed\n");
 	FAILED(buf[16] != 58392872, "test42 case 17 failed\n");
@@ -4163,12 +4488,12 @@ static void test42(void)
 	FAILED(buf[20] != SLJIT_W(0x3d4af2c543), "test42 case 21 failed\n");
 	FAILED(buf[21] != SLJIT_W(-0xaf978), "test42 case 22 failed\n");
 	FAILED(buf[22] != SLJIT_W(0xa64ae42b7d6), "test42 case 23 failed\n");
-#else
+#else /* !SLJIT_64BIT_ARCHITECTURE */
 	FAILED(buf[19] != SLJIT_W(0xda5), "test42 case 20 failed\n");
 	FAILED(buf[20] != SLJIT_W(0xb86d0), "test42 case 21 failed\n");
 	FAILED(buf[21] != SLJIT_W(-0x6b6e), "test42 case 22 failed\n");
 	FAILED(buf[22] != SLJIT_W(0xd357), "test42 case 23 failed\n");
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
 	FAILED(buf[23] != 0x0, "test42 case 24 failed\n");
 	FAILED(buf[24] != (sljit_sw)0xf2906b14, "test42 case 25 failed\n");
@@ -4183,7 +4508,7 @@ static void test43(void)
 {
 	/* Test floating point compare. */
 	executable_code code;
-	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	struct sljit_compiler* compiler;
 	struct sljit_jump* jump;
 
 	union {
@@ -4201,11 +4526,10 @@ static void test43(void)
 		if (verbose)
 			printf("no fpu available, test43 skipped\n");
 		successful_tests++;
-		if (compiler)
-			sljit_free_compiler(compiler);
 		return;
 	}
 
+	compiler = sljit_create_compiler(NULL, NULL);
 	FAILED(!compiler, "cannot create compiler\n");
 
 	dbuf[0].value = 12.125;
@@ -4274,16 +4598,20 @@ static void test44(void)
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(P, P), 3, 2, 0, 0, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV_P, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S0), 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV_P, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_p), SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, sizeof(sljit_p));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 2);
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV_P, SLJIT_MEM2(SLJIT_S0, SLJIT_R1), SLJIT_POINTER_SHIFT, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, sizeof(sljit_p));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 3 << SLJIT_POINTER_SHIFT);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_S0, 0);
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV_P, SLJIT_MEM2(SLJIT_R2, SLJIT_R1), 0, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 2 * sizeof(sljit_p));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 1 << SLJIT_POINTER_SHIFT);
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV_P, SLJIT_MEM2(SLJIT_R2, SLJIT_R1), 2, SLJIT_R0, 0);
 
 	sljit_emit_return(compiler, SLJIT_MOV_P, SLJIT_R0, 0);
@@ -4307,7 +4635,7 @@ static void test45(void)
 	/* Test single precision floating point. */
 
 	executable_code code;
-	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	struct sljit_compiler* compiler;
 	sljit_f32 buf[12];
 	sljit_sw buf2[6];
 	struct sljit_jump* jump;
@@ -4319,11 +4647,10 @@ static void test45(void)
 		if (verbose)
 			printf("no fpu available, test45 skipped\n");
 		successful_tests++;
-		if (compiler)
-			sljit_free_compiler(compiler);
 		return;
 	}
 
+	compiler = sljit_create_compiler(NULL, NULL);
 	FAILED(!compiler, "cannot create compiler\n");
 
 	buf[0] = 5.5;
@@ -4348,42 +4675,58 @@ static void test45(void)
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(VOID, P, P), 3, 2, 6, 0, 0);
 
+	/* buf[2] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR0, 0, SLJIT_MEM1(SLJIT_S0), 0);
 	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR5, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f32));
 	sljit_emit_fop1(compiler, SLJIT_NEG_F32, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_f32), SLJIT_FR0, 0);
+	/* buf[3] */
 	sljit_emit_fop1(compiler, SLJIT_ABS_F32, SLJIT_FR1, 0, SLJIT_FR5, 0);
 	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_f32), SLJIT_FR1, 0);
+	/* buf[4] */
 	sljit_emit_fop1(compiler, SLJIT_ABS_F32, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_f32), SLJIT_FR5, 0);
+	/* buf[5] */
 	sljit_emit_fop1(compiler, SLJIT_NEG_F32, SLJIT_FR4, 0, SLJIT_MEM1(SLJIT_S0), 0);
 	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_f32), SLJIT_FR4, 0);
 
+	/* buf[6] */
 	sljit_emit_fop2(compiler, SLJIT_ADD_F32, SLJIT_FR0, 0, SLJIT_FR0, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f32));
 	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_f32), SLJIT_FR0, 0);
+	/* buf[7] */
 	sljit_emit_fop2(compiler, SLJIT_SUB_F32, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_f32), SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_f32), SLJIT_FR5, 0);
+	/* buf[8] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR0, 0, SLJIT_MEM1(SLJIT_S0), 0);
 	sljit_emit_fop2(compiler, SLJIT_MUL_F32, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_f32), SLJIT_FR0, 0, SLJIT_FR0, 0);
+	/* buf[9] */
 	sljit_emit_fop2(compiler, SLJIT_DIV_F32, SLJIT_FR2, 0, SLJIT_MEM1(SLJIT_S0), 9 * sizeof(sljit_f32), SLJIT_FR0, 0);
 	sljit_emit_fop1(compiler, SLJIT_ABS_F32, SLJIT_MEM1(SLJIT_S0), 9 * sizeof(sljit_f32), SLJIT_FR2, 0);
+	/* buf[10] */
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R0, 0, SLJIT_S0, 0, SLJIT_IMM, 0x3d0ac);
 	sljit_emit_fop1(compiler, SLJIT_NEG_F32, SLJIT_MEM1(SLJIT_S0), 10 * sizeof(sljit_f32), SLJIT_MEM1(SLJIT_R0), 0x3d0ac);
+	/* buf[11] */
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_S0, 0, SLJIT_IMM, 0x3d0ac + sizeof(sljit_f32));
 	sljit_emit_fop1(compiler, SLJIT_ABS_F32, SLJIT_MEM1(SLJIT_S0), 11 * sizeof(sljit_f32), SLJIT_MEM1(SLJIT_R0), -0x3d0ac);
 
+	/* buf2[0] */
 	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_S0), 0);
 	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR2, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f32));
 	sljit_emit_fop1(compiler, SLJIT_CMP_F32 | SLJIT_SET_F_EQUAL, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_S0), 0);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S1), 0, SLJIT_F_EQUAL);
+	/* buf2[1] */
 	sljit_emit_fop1(compiler, SLJIT_CMP_F32 | SLJIT_SET_F_LESS, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_S0), 0);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_sw), SLJIT_F_LESS);
+	/* buf2[2] */
 	sljit_emit_fop1(compiler, SLJIT_CMP_F32 | SLJIT_SET_F_EQUAL, SLJIT_FR1, 0, SLJIT_FR2, 0);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S1), 2 * sizeof(sljit_sw), SLJIT_F_EQUAL);
+	/* buf2[3] */
 	sljit_emit_fop1(compiler, SLJIT_CMP_F32 | SLJIT_SET_F_GREATER_EQUAL, SLJIT_FR1, 0, SLJIT_FR2, 0);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_sw), SLJIT_F_GREATER_EQUAL);
 
+	/* buf2[4] */
 	jump = sljit_emit_fcmp(compiler, SLJIT_F_LESS_EQUAL | SLJIT_32, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f32));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 4 * sizeof(sljit_sw), SLJIT_IMM, 7);
 	sljit_set_label(jump, sljit_emit_label(compiler));
 
+	/* buf2[5] */
 	jump = sljit_emit_fcmp(compiler, SLJIT_F_GREATER | SLJIT_32, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_FR2, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 5 * sizeof(sljit_sw), SLJIT_IMM, 6);
 	sljit_set_label(jump, sljit_emit_label(compiler));
@@ -4440,47 +4783,65 @@ static void test46(void)
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, -7);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_LESS, SLJIT_R2, 0, SLJIT_IMM, 13);
+	/* buf[0] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV32, SLJIT_MEM0(), (sljit_sw)&buf, SLJIT_LESS);
+	/* buf[2] */
 	sljit_emit_op_flags(compiler, SLJIT_AND32, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_s32), SLJIT_NOT_ZERO);
 
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_R2, 0, SLJIT_IMM, -7);
+	/* buf[4] */
 	sljit_emit_op_flags(compiler, SLJIT_AND32 | SLJIT_SET_Z, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_s32), SLJIT_EQUAL);
+	/* buf[6] */
 	sljit_emit_op_flags(compiler, SLJIT_AND32, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_s32), SLJIT_NOT_EQUAL);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0x1235);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_IMM, 0x1235);
 	sljit_emit_op_flags(compiler, SLJIT_AND32 | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_ZERO);
+	/* buf[8] */
 	sljit_emit_op_flags(compiler, SLJIT_AND32, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_s32), SLJIT_ZERO);
+	/* buf[10] */
 	sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_MEM1(SLJIT_S0), 10 * sizeof(sljit_s32), SLJIT_R0, 0);
 
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_R2, 0, SLJIT_IMM, -7);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 12);
+	/* buf[12] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV32, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), 2, SLJIT_EQUAL);
 	sljit_emit_op_flags(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_EQUAL);
+	/* buf[14] */
 	sljit_emit_op1(compiler, SLJIT_MOV_S32, SLJIT_MEM1(SLJIT_S0), 14 * sizeof(sljit_s32), SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 16);
+	/* buf[16] */
 	sljit_emit_op_flags(compiler, SLJIT_AND32 | SLJIT_SET_Z, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), 2, SLJIT_EQUAL);
+	/* buf[18] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S0), 18 * sizeof(sljit_s32), SLJIT_NOT_EQUAL);
 
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_R2, 0, SLJIT_IMM, -7);
+	/* buf[20] */
 	sljit_emit_op_flags(compiler, SLJIT_XOR32 | SLJIT_SET_Z, SLJIT_MEM1(SLJIT_S0), 20 * sizeof(sljit_s32), SLJIT_ZERO);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 39);
 	sljit_emit_op_flags(compiler, SLJIT_XOR32, SLJIT_R0, 0, SLJIT_NOT_ZERO);
+	/* buf[22] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S0), 22 * sizeof(sljit_s32), SLJIT_R0, 0);
 
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_R2, 0, SLJIT_IMM, -7);
+	/* buf2[0] */
 	sljit_emit_op_flags(compiler, SLJIT_AND, SLJIT_MEM0(), (sljit_sw)&buf2, SLJIT_GREATER);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_SIG_LESS, SLJIT_R2, 0, SLJIT_IMM, 5);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 1);
+	/* buf2[1] */
 	sljit_emit_op_flags(compiler, SLJIT_AND | SLJIT_SET_Z, SLJIT_MEM2(SLJIT_S1, SLJIT_R0), SLJIT_WORD_SHIFT, SLJIT_SIG_LESS);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 2);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_LESS, SLJIT_R2, 0, SLJIT_IMM, 5);
+	/* buf2[2] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM2(SLJIT_S1, SLJIT_R0), SLJIT_WORD_SHIFT, SLJIT_LESS);
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_S2, 0, SLJIT_NOT_EQUAL);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_SIG_LESS, SLJIT_R2, 0, SLJIT_IMM, 5);
+	/* buf2[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_sw), SLJIT_S2, 0);
 	sljit_emit_op_flags(compiler, SLJIT_AND | SLJIT_SET_Z, SLJIT_MEM2(SLJIT_S1, SLJIT_R0), SLJIT_WORD_SHIFT, SLJIT_SIG_LESS);
+	/* buf2[4] */
 	sljit_emit_op_flags(compiler, SLJIT_OR, SLJIT_MEM1(SLJIT_S1), 4 * sizeof(sljit_sw), SLJIT_ZERO);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_R2, 0, SLJIT_IMM, 0);
+	/* buf2[5] */
 	sljit_emit_op_flags(compiler, SLJIT_XOR, SLJIT_MEM1(SLJIT_S1), 5 * sizeof(sljit_sw), SLJIT_GREATER);
 
 	sljit_emit_return_void(compiler);
@@ -4542,15 +4903,18 @@ static void test47(void)
 	buf[2] = 0;
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, P), 3, 1, 0, 0, 0);
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0x3a5c6f);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_LESS, SLJIT_R0, 0, SLJIT_IMM, 3);
 	sljit_set_target(sljit_emit_jump(compiler, SLJIT_LESS), 0x11223344);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0xd37c10);
 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
 	sljit_set_target(sljit_emit_jump(compiler, SLJIT_LESS), SLJIT_W(0x112233445566));
 #endif
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R0, 0);
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0x59b48e);
 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
 	sljit_set_target(sljit_emit_jump(compiler, SLJIT_LESS), SLJIT_W(0x1122334455667788));
@@ -4575,7 +4939,7 @@ static void test48(void)
 {
 	/* Test floating point conversions. */
 	executable_code code;
-	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	struct sljit_compiler* compiler;
 	int i;
 	sljit_f64 dbuf[10];
 	sljit_f32 sbuf[10];
@@ -4589,12 +4953,12 @@ static void test48(void)
 		if (verbose)
 			printf("no fpu available, test48 skipped\n");
 		successful_tests++;
-		if (compiler)
-			sljit_free_compiler(compiler);
 		return;
 	}
 
+	compiler = sljit_create_compiler(NULL, NULL);
 	FAILED(!compiler, "cannot create compiler\n");
+
 	for (i = 0; i < 10; i++) {
 		dbuf[i] = 0.0;
 		sbuf[i] = 0.0;
@@ -4728,7 +5092,7 @@ static void test49(void)
 {
 	/* Test floating point conversions. */
 	executable_code code;
-	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	struct sljit_compiler* compiler;
 	int i;
 	sljit_f64 dbuf[10];
 	sljit_f32 sbuf[9];
@@ -4744,11 +5108,10 @@ static void test49(void)
 		if (verbose)
 			printf("no fpu available, test49 skipped\n");
 		successful_tests++;
-		if (compiler)
-			sljit_free_compiler(compiler);
 		return;
 	}
 
+	compiler = sljit_create_compiler(NULL, NULL);
 	FAILED(!compiler, "cannot create compiler\n");
 
 	for (i = 0; i < 9; i++) {
@@ -4809,7 +5172,7 @@ static void test49(void)
 	sljit_emit_fop1(compiler, SLJIT_CONV_F64_FROM_SW, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_f64), SLJIT_R0, 0);
 	/* dbuf[9] */
 	sljit_emit_fop1(compiler, SLJIT_CONV_F64_FROM_S32, SLJIT_MEM1(SLJIT_S0), 9 * sizeof(sljit_f64), SLJIT_IMM, SLJIT_W(0x7766554433));
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
 	sljit_emit_return_void(compiler);
 
@@ -4856,7 +5219,7 @@ static void test49(void)
 	FAILED(dbuf[9] != (sljit_f64)SLJIT_W(0x66554433), "test49 case 30 failed\n");
 	FAILED(wbuf[8] != SLJIT_W(0x1122334455), "test48 case 31 failed\n");
 	FAILED(ibuf[8] == 0x4455, "test48 case 32 failed\n");
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
 	sljit_free_code(code.code, NULL);
 	successful_tests++;
@@ -4866,7 +5229,7 @@ static void test50(void)
 {
 	/* Test stack and floating point operations. */
 	executable_code code;
-	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	struct sljit_compiler* compiler;
 #if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
 	sljit_uw size1, size2, size3;
 	int result;
@@ -4880,11 +5243,10 @@ static void test50(void)
 		if (verbose)
 			printf("no fpu available, test50 skipped\n");
 		successful_tests++;
-		if (compiler)
-			sljit_free_compiler(compiler);
 		return;
 	}
 
+	compiler = sljit_create_compiler(NULL, NULL);
 	FAILED(!compiler, "cannot create compiler\n");
 
 	sbuf[0] = 245.5;
@@ -5098,8 +5460,6 @@ static void test52(void)
 		return;
 	}
 
-	/* Next test. */
-
 	compiler = sljit_create_compiler(NULL, NULL);
 	FAILED(!compiler, "cannot create compiler\n");
 	buf[0] = 6.25;
@@ -5193,7 +5553,7 @@ static void test53(void)
 		buf[i] = 0;
 
 	if (verbose)
-		printf("Run test78\n");
+		printf("Run test53\n");
 
 	FAILED(!compiler, "cannot create compiler\n");
 
@@ -5203,15 +5563,15 @@ static void test53(void)
 	for (i = 0; i < SLJIT_NUMBER_OF_REGISTERS; i++, addr++) {
 		if (sljit_get_register_index(SLJIT_R(i)) == -1)
 			continue;
-
+		/* buf_start[i * 3] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R(i), 0, SLJIT_IMM, (sljit_sw)addr);
 		sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM2(SLJIT_R(i), SLJIT_R(i)), 1, SLJIT_IMM, 88 + i);
-
+		/* buf_start[i * 3 + 1] */
 		if (i != 0) {
 			sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)(addr * 2 + 1));
 			sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM2(SLJIT_R(i), SLJIT_R0), 0, SLJIT_IMM, 147 + i);
 		}
-
+		/* buf_start[i * 3 + 2] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R(i), 0, SLJIT_IMM, (sljit_sw)(addr * 3 + 2));
 		sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM1(SLJIT_R(i)), 0, SLJIT_IMM, 191 + i);
 	}
@@ -5279,18 +5639,23 @@ static void test54(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 34);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_SIG_LESS, SLJIT_R0, 0, SLJIT_IMM, -10);
 	sljit_emit_cmov(compiler, SLJIT_SIG_LESS, SLJIT_R0, SLJIT_R1, 0);
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_SIG_GREATER, SLJIT_R0, 0, SLJIT_IMM, -10);
 	sljit_emit_cmov(compiler, SLJIT_SIG_GREATER, SLJIT_R0, SLJIT_R1, 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R0, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 24);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_IMM, 24);
 	sljit_emit_cmov(compiler, SLJIT_NOT_EQUAL, SLJIT_R0, SLJIT_IMM, 66);
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_R0, 0);
 	sljit_emit_cmov(compiler, SLJIT_EQUAL, SLJIT_R0, SLJIT_IMM, 78);
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_R0, 0);
 	sljit_emit_cmov(compiler, SLJIT_EQUAL, SLJIT_R0, SLJIT_IMM, large_num);
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_R0, 0);
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
@@ -5300,22 +5665,26 @@ static void test54(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R3, 0, SLJIT_IMM, -45);
 	sljit_emit_op2(compiler, SLJIT_MUL | SLJIT_SET_OVERFLOW, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, 8);
 	sljit_emit_cmov(compiler, SLJIT_OVERFLOW, SLJIT_R3, SLJIT_IMM, 35);
+	/* buf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_R3, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, large_num);
 	sljit_emit_op2u(compiler, SLJIT_MUL | SLJIT_SET_OVERFLOW, SLJIT_R0, 0, SLJIT_IMM, large_num);
 	sljit_emit_cmov(compiler, SLJIT_OVERFLOW, SLJIT_R3, SLJIT_IMM, 35);
+	/* buf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), SLJIT_R3, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 71);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R3, 0, SLJIT_IMM, 13);
 	sljit_emit_op2(compiler, SLJIT_LSHR | SLJIT_SET_Z, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, 8);
 	sljit_emit_cmov(compiler, SLJIT_EQUAL, SLJIT_R3, SLJIT_R0, 0);
+	/* buf[7] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_sw), SLJIT_R3, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 12);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R3, 0, SLJIT_IMM, -29);
 	sljit_emit_op2(compiler, SLJIT_MUL | SLJIT_SET_OVERFLOW, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 8);
 	sljit_emit_cmov(compiler, SLJIT_NOT_OVERFLOW, SLJIT_R0, SLJIT_R3, 0);
+	/* buf[8] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw), SLJIT_R3, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 16);
@@ -5323,8 +5692,10 @@ static void test54(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, 21);
 	sljit_emit_op2u(compiler, SLJIT_AND | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_IMM, 8);
 	sljit_emit_cmov(compiler, SLJIT_NOT_EQUAL, SLJIT_R3, SLJIT_R4, 0);
+	/* buf[9] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 9 * sizeof(sljit_sw), SLJIT_R3, 0);
 	sljit_emit_cmov(compiler, SLJIT_EQUAL, SLJIT_R3, SLJIT_R4, 0);
+	/* buf[10] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 10 * sizeof(sljit_sw), SLJIT_R3, 0);
 
 	if (sljit_has_cpu_feature(SLJIT_HAS_FPU)) {
@@ -5335,53 +5706,65 @@ static void test54(void)
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 16);
 		sljit_emit_fop1(compiler, SLJIT_CMP_F32 | SLJIT_SET_F_EQUAL, SLJIT_FR1, 0, SLJIT_FR2, 0);
 		sljit_emit_cmov(compiler, SLJIT_F_EQUAL, SLJIT_R0, SLJIT_IMM, -45);
+		/* buf[11] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 11 * sizeof(sljit_sw), SLJIT_R0, 0);
 		sljit_emit_fop1(compiler, SLJIT_CMP_F32 | SLJIT_SET_F_GREATER, SLJIT_FR1, 0, SLJIT_FR2, 0);
 		sljit_emit_cmov(compiler, SLJIT_F_GREATER, SLJIT_R0, SLJIT_IMM, -45);
+		/* buf[12] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 12 * sizeof(sljit_sw), SLJIT_R0, 0);
 		sljit_emit_fop1(compiler, SLJIT_CMP_F32 | SLJIT_SET_F_GREATER_EQUAL, SLJIT_FR1, 0, SLJIT_FR2, 0);
 		sljit_emit_cmov(compiler, SLJIT_F_GREATER_EQUAL, SLJIT_R0, SLJIT_IMM, 33);
+		/* buf[13] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 13 * sizeof(sljit_sw), SLJIT_R0, 0);
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 8);
 		sljit_emit_fop1(compiler, SLJIT_CMP_F32 | SLJIT_SET_F_LESS, SLJIT_FR1, 0, SLJIT_FR2, 0);
 		sljit_emit_cmov(compiler, SLJIT_F_LESS, SLJIT_R0, SLJIT_IMM, -70);
+		/* buf[14] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 14 * sizeof(sljit_sw), SLJIT_R0, 0);
 		sljit_emit_fop1(compiler, SLJIT_CMP_F32 | SLJIT_SET_F_LESS_EQUAL, SLJIT_FR2, 0, SLJIT_FR1, 0);
 		sljit_emit_cmov(compiler, SLJIT_F_LESS_EQUAL, SLJIT_R0, SLJIT_IMM, -60);
+		/* buf[15] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 15 * sizeof(sljit_sw), SLJIT_R0, 0);
 		sljit_emit_fop1(compiler, SLJIT_CMP_F32 | SLJIT_SET_F_NOT_EQUAL, SLJIT_FR1, 0, SLJIT_FR2, 0);
 		sljit_emit_cmov(compiler, SLJIT_F_NOT_EQUAL, SLJIT_R0, SLJIT_IMM, 31);
+		/* buf[16] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 16 * sizeof(sljit_sw), SLJIT_R0, 0);
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 53);
 		sljit_emit_fop1(compiler, SLJIT_CMP_F32 | SLJIT_SET_ORDERED, SLJIT_FR1, 0, SLJIT_FR0, 0);
 		sljit_emit_cmov(compiler, SLJIT_ORDERED, SLJIT_R0, SLJIT_IMM, 17);
+		/* buf[17] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 17 * sizeof(sljit_sw), SLJIT_R0, 0);
 		sljit_emit_fop1(compiler, SLJIT_CMP_F32 | SLJIT_SET_UNORDERED, SLJIT_FR1, 0, SLJIT_FR0, 0);
 		sljit_emit_cmov(compiler, SLJIT_UNORDERED, SLJIT_R0, SLJIT_IMM, 59);
+		/* buf[18] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 18 * sizeof(sljit_sw), SLJIT_R0, 0);
 	}
 
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 177);
 	sljit_emit_op2u(compiler, SLJIT_SUB32 | SLJIT_SET_LESS, SLJIT_R0, 0, SLJIT_IMM, 178);
-	sljit_emit_cmov(compiler, SLJIT_LESS, SLJIT_R0 | SLJIT_32, SLJIT_IMM, 200);
+	sljit_emit_cmov(compiler, SLJIT_LESS | SLJIT_32, SLJIT_R0, SLJIT_IMM, 200);
+	/* ibuf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 0, SLJIT_R0, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 95);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R3, 0, SLJIT_IMM, 177);
 	sljit_emit_op2u(compiler, SLJIT_SUB32 | SLJIT_SET_LESS_EQUAL, SLJIT_R0, 0, SLJIT_IMM, 95);
-	sljit_emit_cmov(compiler, SLJIT_LESS_EQUAL, SLJIT_R3 | SLJIT_32, SLJIT_R0, 0);
+	sljit_emit_cmov(compiler, SLJIT_LESS_EQUAL | SLJIT_32, SLJIT_R3, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 0);
+	/* ibuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_s32), SLJIT_R3, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R3, 0, SLJIT_IMM, 56);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R4, 0, SLJIT_IMM, -63);
 	sljit_emit_op2u(compiler, SLJIT_SUB32 | SLJIT_SET_SIG_LESS, SLJIT_R3, 0, SLJIT_R4, 0);
-	sljit_emit_cmov(compiler, SLJIT_SIG_LESS, SLJIT_R3 | SLJIT_32, SLJIT_R4, 0);
+	sljit_emit_cmov(compiler, SLJIT_SIG_LESS | SLJIT_32, SLJIT_R3, SLJIT_R4, 0);
+	/* ibuf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 2 * sizeof(sljit_s32), SLJIT_R3, 0);
 	sljit_emit_op2u(compiler, SLJIT_SUB32 | SLJIT_SET_SIG_GREATER, SLJIT_R3, 0, SLJIT_R4, 0);
-	sljit_emit_cmov(compiler, SLJIT_SIG_GREATER, SLJIT_R3 | SLJIT_32, SLJIT_R4, 0);
+	sljit_emit_cmov(compiler, SLJIT_SIG_GREATER | SLJIT_32, SLJIT_R3, SLJIT_R4, 0);
+	/* ibuf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_s32), SLJIT_R3, 0);
 
 	sljit_emit_return_void(compiler);
@@ -5488,7 +5871,7 @@ static void test55(void)
 
 static void test56(void)
 {
-	/* Check integer substraction with negative immediate. */
+	/* Check integer subtraction with negative immediate. */
 	executable_code code;
 	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
 	sljit_sw buf[13];
@@ -5506,34 +5889,47 @@ static void test56(void)
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 90 << 12);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_SIG_GREATER, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, -(91 << 12));
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R1, 0);
+	/* buf[1] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_SIG_GREATER);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_LESS, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, -(91 << 12));
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_R1, 0);
+	/* buf[3] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_LESS);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_SIG_GREATER_EQUAL, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, -(91 << 12));
+	/* buf[4] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_SIG_GREATER_EQUAL);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, -(91 << 12));
+	/* buf[5] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_LESS_EQUAL);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, -(91 << 12));
+	/* buf[6] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), SLJIT_GREATER);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_SIG_LESS, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, -(91 << 12));
+	/* buf[7] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_sw), SLJIT_SIG_LESS);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 90);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_SIG_GREATER, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, -91);
+	/* buf[8] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw), SLJIT_R0, 0);
+	/* buf[9] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 9 * sizeof(sljit_sw), SLJIT_SIG_GREATER);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 90);
 	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, -91);
+	/* buf[10] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 10 * sizeof(sljit_sw), SLJIT_LESS_EQUAL);
 
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, -0x7fffffff);
 	sljit_emit_op2(compiler, SLJIT_ADD32 | SLJIT_SET_OVERFLOW, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, -(91 << 12));
+	/* buf[11] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 11 * sizeof(sljit_sw), SLJIT_OVERFLOW);
 
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, -0x7fffffff-1);
 	sljit_emit_op2(compiler, SLJIT_SUB32 | SLJIT_SET_OVERFLOW, SLJIT_R0, 0, SLJIT_IMM, 0, SLJIT_R0, 0);
+	/* buf[12] */
 	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 12 * sizeof(sljit_sw), SLJIT_OVERFLOW);
 
 	sljit_emit_return_void(compiler);
@@ -5657,7 +6053,7 @@ static void test58(void)
 {
 	/* Check function calls with floating point arguments. */
 	executable_code code;
-	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	struct sljit_compiler* compiler;
 	struct sljit_jump* jump = NULL;
 	sljit_f64 dbuf[7];
 	sljit_f32 sbuf[7];
@@ -5670,11 +6066,12 @@ static void test58(void)
 		if (verbose)
 			printf("no fpu available, test58 skipped\n");
 		successful_tests++;
-		if (compiler)
-			sljit_free_compiler(compiler);
 		return;
 	}
 
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
 	dbuf[0] = 5.25;
 	dbuf[1] = 0.0;
 	dbuf[2] = 2.5;
@@ -5692,8 +6089,6 @@ static void test58(void)
 	wbuf[0] = 0;
 	wbuf[1] = 0;
 
-	FAILED(!compiler, "cannot create compiler\n");
-
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS3(VOID, P, P, P), 3, 3, 4, 0, sizeof(sljit_sw));
 
 	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR0, 0, SLJIT_MEM1(SLJIT_S1), 0);
@@ -5949,97 +6344,113 @@ static void test60(void)
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS3(VOID, P, P, P), 4, 3, 4, 0, sizeof(sljit_sw));
 
-	supported[0] = sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_R1, SLJIT_MEM1(SLJIT_R0), 2 * sizeof(sljit_sw));
+	supported[0] = sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_SUPP, SLJIT_R1, SLJIT_MEM1(SLJIT_R0), 2 * sizeof(sljit_sw));
 	if (supported[0] == SLJIT_SUCCESS) {
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_S0, 0);
-		sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_PRE, SLJIT_R1, SLJIT_MEM1(SLJIT_R0), 2 * sizeof(sljit_sw));
+		sljit_emit_mem_update(compiler, SLJIT_MOV, SLJIT_R1, SLJIT_MEM1(SLJIT_R0), 2 * sizeof(sljit_sw));
+		/* buf[0] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R1, 0);
+		/* buf[1] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R0, 0);
 	}
 
-	supported[1] = sljit_emit_mem(compiler, SLJIT_MOV_S8 | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_R0, SLJIT_MEM1(SLJIT_R2), -2 * (sljit_sw)sizeof(sljit_s8));
+	supported[1] = sljit_emit_mem_update(compiler, SLJIT_MOV_S8 | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_R0, SLJIT_MEM1(SLJIT_R2), -2 * (sljit_sw)sizeof(sljit_s8));
 	if (supported[1] == SLJIT_SUCCESS) {
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R2, 0, SLJIT_S1, 0, SLJIT_IMM, 2 * sizeof(sljit_s8));
-		sljit_emit_mem(compiler, SLJIT_MOV_S8 | SLJIT_MEM_POST, SLJIT_R0, SLJIT_MEM1(SLJIT_R2), -2 * (sljit_sw)sizeof(sljit_s8));
+		sljit_emit_mem_update(compiler, SLJIT_MOV_S8 | SLJIT_MEM_POST, SLJIT_R0, SLJIT_MEM1(SLJIT_R2), -2 * (sljit_sw)sizeof(sljit_s8));
+		/* buf[3] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_R0, 0);
+		/* buf[4] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_R2, 0);
 	}
 
-	supported[2] = sljit_emit_mem(compiler, SLJIT_MOV_S32 | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_R2, SLJIT_MEM1(SLJIT_R1), -2 * (sljit_sw)sizeof(sljit_s32));
+	supported[2] = sljit_emit_mem_update(compiler, SLJIT_MOV_S32 | SLJIT_MEM_SUPP, SLJIT_R2, SLJIT_MEM1(SLJIT_R1), -2 * (sljit_sw)sizeof(sljit_s32));
 	if (supported[2] == SLJIT_SUCCESS) {
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_S2, 0, SLJIT_IMM, 2 * sizeof(sljit_s32));
-		sljit_emit_mem(compiler, SLJIT_MOV_S32 | SLJIT_MEM_PRE, SLJIT_R2, SLJIT_MEM1(SLJIT_R1), -2 * (sljit_sw)sizeof(sljit_s32));
+		sljit_emit_mem_update(compiler, SLJIT_MOV_S32, SLJIT_R2, SLJIT_MEM1(SLJIT_R1), -2 * (sljit_sw)sizeof(sljit_s32));
+		/* buf[5] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_R2, 0);
+		/* buf[6] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), SLJIT_R1, 0);
 	}
 
-	supported[3] = sljit_emit_mem(compiler, SLJIT_MOV32 | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R1, SLJIT_MEM1(SLJIT_R2), 2 * sizeof(sljit_s32));
+	supported[3] = sljit_emit_mem_update(compiler, SLJIT_MOV32 | SLJIT_MEM_SUPP | SLJIT_MEM_STORE, SLJIT_R1, SLJIT_MEM1(SLJIT_R2), 2 * sizeof(sljit_s32));
 	if (supported[3] == SLJIT_SUCCESS) {
 		sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, -8765);
 		sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R2, 0, SLJIT_S2, 0, SLJIT_IMM, sizeof(sljit_s32));
-		sljit_emit_mem(compiler, SLJIT_MOV32 | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R1, SLJIT_MEM1(SLJIT_R2), 2 * sizeof(sljit_s32));
+		sljit_emit_mem_update(compiler, SLJIT_MOV32 | SLJIT_MEM_STORE, SLJIT_R1, SLJIT_MEM1(SLJIT_R2), 2 * sizeof(sljit_s32));
+		/* buf[7] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_sw), SLJIT_R2, 0);
 	}
 
-	supported[4] = sljit_emit_mem(compiler, SLJIT_MOV_S8 | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_POST, SLJIT_R1, SLJIT_MEM1(SLJIT_R2), -128 * (sljit_sw)sizeof(sljit_s8));
+	supported[4] = sljit_emit_mem_update(compiler, SLJIT_MOV_S8 | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_POST, SLJIT_R1, SLJIT_MEM1(SLJIT_R2), -128 * (sljit_sw)sizeof(sljit_s8));
 	if (supported[4] == SLJIT_SUCCESS) {
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, -121);
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_S1, 0);
-		sljit_emit_mem(compiler, SLJIT_MOV_S8 | SLJIT_MEM_STORE | SLJIT_MEM_POST, SLJIT_R1, SLJIT_MEM1(SLJIT_R2), -128 * (sljit_sw)sizeof(sljit_s8));
+		sljit_emit_mem_update(compiler, SLJIT_MOV_S8 | SLJIT_MEM_STORE | SLJIT_MEM_POST, SLJIT_R1, SLJIT_MEM1(SLJIT_R2), -128 * (sljit_sw)sizeof(sljit_s8));
+		/* buf[8] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw), SLJIT_R2, 0);
 	}
 
-	supported[5] = sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R1, SLJIT_MEM1(SLJIT_R0), 1);
+	supported[5] = sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_STORE, SLJIT_R1, SLJIT_MEM1(SLJIT_R0), 1);
 	if (supported[5] == SLJIT_SUCCESS) {
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_S0, 0, SLJIT_IMM, 9 * sizeof(sljit_sw) - 1);
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, -881199);
-		sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R1, SLJIT_MEM1(SLJIT_R0), 1);
+		sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_STORE, SLJIT_R1, SLJIT_MEM1(SLJIT_R0), 1);
+		/* buf[10] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 10 * sizeof(sljit_sw), SLJIT_R0, 0);
 	}
 
-	supported[6] = sljit_emit_mem(compiler, SLJIT_MOV_S32 | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
+	supported[6] = sljit_emit_mem_update(compiler, SLJIT_MOV_S32 | SLJIT_MEM_SUPP, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
 	if (supported[6] == SLJIT_SUCCESS) {
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_S2, 0, SLJIT_IMM, 213);
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, -213);
-		sljit_emit_mem(compiler, SLJIT_MOV_S32 | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
+		sljit_emit_mem_update(compiler, SLJIT_MOV_S32, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
+		/* buf[11] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 11 * sizeof(sljit_sw), SLJIT_R0, 0);
+		/* buf[12] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 12 * sizeof(sljit_sw), SLJIT_R1, 0);
 	}
 
-	supported[7] = sljit_emit_mem(compiler, SLJIT_MOV_S32 | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
+	supported[7] = sljit_emit_mem_update(compiler, SLJIT_MOV_S32 | SLJIT_MEM_SUPP | SLJIT_MEM_STORE, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
 	if (supported[7] == SLJIT_SUCCESS) {
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_S2, 0);
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 2 * sizeof(sljit_s32));
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -7890);
-		sljit_emit_mem(compiler, SLJIT_MOV_S32 | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
+		sljit_emit_mem_update(compiler, SLJIT_MOV_S32 | SLJIT_MEM_STORE, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
+		/* buf[13] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 13 * sizeof(sljit_sw), SLJIT_R1, 0);
 	}
 
-	supported[8] = sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 2);
+	supported[8] = sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 2);
 	if (supported[8] == SLJIT_SUCCESS) {
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_S0, 0, SLJIT_IMM, 2 * sizeof(sljit_sw));
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 2 * sizeof(sljit_sw));
-		sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_POST, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 2);
+		sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_POST, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 2);
+		/* buf[14] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 14 * sizeof(sljit_sw), SLJIT_R0, 0);
+		/* buf[15] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 15 * sizeof(sljit_sw), SLJIT_R1, 0);
 	}
 
-	supported[9] = sljit_emit_mem(compiler, SLJIT_MOV_S8 | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
+	supported[9] = sljit_emit_mem_update(compiler, SLJIT_MOV_S8 | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
 	if (supported[9] == SLJIT_SUCCESS) {
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_S1, 0, SLJIT_IMM, 2 * sizeof(sljit_s8));
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, -2 * (sljit_sw)sizeof(sljit_s8));
-		sljit_emit_mem(compiler, SLJIT_MOV_S8 | SLJIT_MEM_POST, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
+		sljit_emit_mem_update(compiler, SLJIT_MOV_S8 | SLJIT_MEM_POST, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
+		/* buf[16] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 16 * sizeof(sljit_sw), SLJIT_R0, 0);
+		/* buf[17] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 17 * sizeof(sljit_sw), SLJIT_R1, 0);
 	}
 
-	SLJIT_ASSERT(sljit_emit_mem(compiler, SLJIT_MOV_S8 | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 1) == SLJIT_ERR_UNSUPPORTED);
-	SLJIT_ASSERT(sljit_emit_mem(compiler, SLJIT_MOV_S8 | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 1) == SLJIT_ERR_UNSUPPORTED);
+	SLJIT_ASSERT(sljit_emit_mem_update(compiler, SLJIT_MOV_S8 | SLJIT_MEM_SUPP, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 1) == SLJIT_ERR_UNSUPPORTED);
+	SLJIT_ASSERT(sljit_emit_mem_update(compiler, SLJIT_MOV_S8 | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_R0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 1) == SLJIT_ERR_UNSUPPORTED); /* post-update form; the line above covers pre-update */
 
 #if (defined SLJIT_CONFIG_ARM_THUMB2 && SLJIT_CONFIG_ARM_THUMB2) || (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
 	/* TODO: at least for ARM (both V5 and V7) the range below needs further fixing */
-	SLJIT_ASSERT(sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_R1, SLJIT_MEM1(SLJIT_R0), 256) == SLJIT_ERR_UNSUPPORTED);
-	SLJIT_ASSERT(sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_R1, SLJIT_MEM1(SLJIT_R0), -257) == SLJIT_ERR_UNSUPPORTED);
+	SLJIT_ASSERT(sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_SUPP, SLJIT_R1, SLJIT_MEM1(SLJIT_R0), 256) == SLJIT_ERR_UNSUPPORTED);
+	SLJIT_ASSERT(sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_R1, SLJIT_MEM1(SLJIT_R0), -257) == SLJIT_ERR_UNSUPPORTED);
 #endif
 
 	sljit_emit_return_void(compiler);
@@ -6095,7 +6506,7 @@ static void test61(void)
 {
 	/* Test float memory accesses with pre/post updates. */
 	executable_code code;
-	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	struct sljit_compiler* compiler;
 	sljit_u32 i;
 	sljit_s32 supported[6];
 	sljit_sw wbuf[6];
@@ -6109,17 +6520,18 @@ static void test61(void)
 	static sljit_u8 expected[6] = { 0, 0, 0, 0, 0, 0 };
 #endif
 
+	if (verbose)
+		printf("Run test61\n");
+
 	if (!sljit_has_cpu_feature(SLJIT_HAS_FPU)) {
 		if (verbose)
 			printf("no fpu available, test61 skipped\n");
 		successful_tests++;
-		if (compiler)
-			sljit_free_compiler(compiler);
 		return;
 	}
 
-	if (verbose)
-		printf("Run test61\n");
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
 
 	for (i = 0; i < 6; i++)
 		wbuf[i] = 0;
@@ -6134,67 +6546,77 @@ static void test61(void)
 	sbuf[2] = 0.0;
 	sbuf[3] = 0.0;
 
-	FAILED(!compiler, "cannot create compiler\n");
-
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS3(VOID, P, P, P), 4, 3, 4, 0, sizeof(sljit_sw));
 
-	supported[0] = sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), 4 * sizeof(sljit_f64));
+	supported[0] = sljit_emit_fmem_update(compiler, SLJIT_MOV_F64 | SLJIT_MEM_SUPP, SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), 4 * sizeof(sljit_f64));
 	if (supported[0] == SLJIT_SUCCESS) {
+		/* dbuf[1] */
 		sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R0, 0, SLJIT_S1, 0, SLJIT_IMM, 4 * sizeof(sljit_f64));
-		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_PRE, SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), 4 * sizeof(sljit_f64));
+		sljit_emit_fmem_update(compiler, SLJIT_MOV_F64, SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), 4 * sizeof(sljit_f64));
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_f64), SLJIT_FR0, 0);
+		/* wbuf[0] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
 	}
 
-	supported[1] = sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_POST, SLJIT_FR2, SLJIT_MEM1(SLJIT_R0), -(sljit_sw)sizeof(sljit_f64));
+	supported[1] = sljit_emit_fmem_update(compiler, SLJIT_MOV_F64 | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_POST, SLJIT_FR2, SLJIT_MEM1(SLJIT_R0), -(sljit_sw)sizeof(sljit_f64));
 	if (supported[1] == SLJIT_SUCCESS) {
+		/* dbuf[2] */
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_S1, 0, SLJIT_IMM, 2 * sizeof(sljit_f64));
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR2, 0, SLJIT_MEM1(SLJIT_S1), 0);
-		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_STORE | SLJIT_MEM_POST, SLJIT_FR2, SLJIT_MEM1(SLJIT_R0), -(sljit_sw)sizeof(sljit_f64));
+		sljit_emit_fmem_update(compiler, SLJIT_MOV_F64 | SLJIT_MEM_STORE | SLJIT_MEM_POST, SLJIT_FR2, SLJIT_MEM1(SLJIT_R0), -(sljit_sw)sizeof(sljit_f64));
+		/* wbuf[1] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R0, 0);
 	}
 
-	supported[2] = sljit_emit_fmem(compiler, SLJIT_MOV_F32 | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_FR1, SLJIT_MEM1(SLJIT_R2), -4 * (sljit_sw)sizeof(sljit_f32));
+	supported[2] = sljit_emit_fmem_update(compiler, SLJIT_MOV_F32 | SLJIT_MEM_SUPP | SLJIT_MEM_STORE, SLJIT_FR1, SLJIT_MEM1(SLJIT_R2), -4 * (sljit_sw)sizeof(sljit_f32));
 	if (supported[2] == SLJIT_SUCCESS) {
+		/* sbuf[0] */
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R2, 0, SLJIT_S2, 0, SLJIT_IMM, 4 * sizeof(sljit_f32));
 		sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_S2), sizeof(sljit_f32));
-		sljit_emit_fmem(compiler, SLJIT_MOV_F32 | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_FR1, SLJIT_MEM1(SLJIT_R2), -4 * (sljit_sw)sizeof(sljit_f32));
+		sljit_emit_fmem_update(compiler, SLJIT_MOV_F32 | SLJIT_MEM_STORE, SLJIT_FR1, SLJIT_MEM1(SLJIT_R2), -4 * (sljit_sw)sizeof(sljit_f32));
+		/* wbuf[2] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_R2, 0);
 	}
 
-	supported[3] = sljit_emit_fmem(compiler, SLJIT_MOV_F32 | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_FR1, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_f32));
+	supported[3] = sljit_emit_fmem_update(compiler, SLJIT_MOV_F32 | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_FR1, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_f32));
 	if (supported[3] == SLJIT_SUCCESS) {
+		/* sbuf[2] */
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_S2, 0, SLJIT_IMM, sizeof(sljit_f32));
-		sljit_emit_fmem(compiler, SLJIT_MOV_F32 | SLJIT_MEM_POST, SLJIT_FR1, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_f32));
+		sljit_emit_fmem_update(compiler, SLJIT_MOV_F32 | SLJIT_MEM_POST, SLJIT_FR1, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_f32));
 		sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_MEM1(SLJIT_S2), 2 * sizeof(sljit_f32), SLJIT_FR1, 0);
+		/* wbuf[3] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_R1, 0);
 	}
 
-	supported[4] = sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_FR0, SLJIT_MEM2(SLJIT_R1, SLJIT_R0), 0);
+	supported[4] = sljit_emit_fmem_update(compiler, SLJIT_MOV_F64 | SLJIT_MEM_SUPP, SLJIT_FR0, SLJIT_MEM2(SLJIT_R1, SLJIT_R0), 0);
 	if (supported[4] == SLJIT_SUCCESS) {
+		/* dbuf[3] */
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_S1, 0, SLJIT_IMM, 8 * sizeof(sljit_f64));
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -8 * (sljit_sw)sizeof(sljit_f64));
-		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_PRE, SLJIT_FR0, SLJIT_MEM2(SLJIT_R1, SLJIT_R0), 0);
+		sljit_emit_fmem_update(compiler, SLJIT_MOV_F64, SLJIT_FR0, SLJIT_MEM2(SLJIT_R1, SLJIT_R0), 0);
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_f64), SLJIT_FR0, 0);
+		/* wbuf[4] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_R1, 0);
 	}
 
-	supported[5] = sljit_emit_fmem(compiler, SLJIT_MOV_F32 | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_FR2, SLJIT_MEM2(SLJIT_R2, SLJIT_R1), 0);
+	supported[5] = sljit_emit_fmem_update(compiler, SLJIT_MOV_F32 | SLJIT_MEM_SUPP | SLJIT_MEM_STORE, SLJIT_FR2, SLJIT_MEM2(SLJIT_R2, SLJIT_R1), 0);
 	if (supported[5] == SLJIT_SUCCESS) {
+		/* sbuf[3] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_S2, 0);
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 3 * sizeof(sljit_f32));
 		sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR2, 0, SLJIT_MEM1(SLJIT_S2), sizeof(sljit_f32));
-		sljit_emit_fmem(compiler, SLJIT_MOV_F32 | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_FR2, SLJIT_MEM2(SLJIT_R2, SLJIT_R1), 0);
+		sljit_emit_fmem_update(compiler, SLJIT_MOV_F32 | SLJIT_MEM_STORE, SLJIT_FR2, SLJIT_MEM2(SLJIT_R2, SLJIT_R1), 0);
+		/* wbuf[5] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_R2, 0);
 	}
 
-	SLJIT_ASSERT(sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_FR0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0) == SLJIT_ERR_UNSUPPORTED);
-	SLJIT_ASSERT(sljit_emit_fmem(compiler, SLJIT_MOV_F32 | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_POST, SLJIT_FR0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0) == SLJIT_ERR_UNSUPPORTED);
+	SLJIT_ASSERT(sljit_emit_fmem_update(compiler, SLJIT_MOV_F64 | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_FR0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0) == SLJIT_ERR_UNSUPPORTED);
+	SLJIT_ASSERT(sljit_emit_fmem_update(compiler, SLJIT_MOV_F32 | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_POST, SLJIT_FR0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0) == SLJIT_ERR_UNSUPPORTED);
 
 #if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
 	/* TODO: at least for ARM (both V5 and V7) the range below needs further fixing */
-	SLJIT_ASSERT(sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), 256) == SLJIT_ERR_UNSUPPORTED);
-	SLJIT_ASSERT(sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), -257) == SLJIT_ERR_UNSUPPORTED);
+	SLJIT_ASSERT(sljit_emit_fmem_update(compiler, SLJIT_MOV_F64 | SLJIT_MEM_SUPP, SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), 256) == SLJIT_ERR_UNSUPPORTED);
+	SLJIT_ASSERT(sljit_emit_fmem_update(compiler, SLJIT_MOV_F64 | SLJIT_MEM_SUPP | SLJIT_MEM_POST, SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), -257) == SLJIT_ERR_UNSUPPORTED);
 #endif
 
 	sljit_emit_return_void(compiler);
@@ -6253,7 +6675,7 @@ static void test62(void)
 	FAILED(!compiler, "cannot create compiler\n");
 	sljit_set_context(compiler, 0, SLJIT_ARGS1(W, W), 1, 1, 0, 0, 0);
 
-	sljit_emit_fast_enter(compiler, SLJIT_R0, 0);
+	sljit_emit_op_dst(compiler, SLJIT_FAST_ENTER, SLJIT_R0, 0);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_LESS, SLJIT_S0, 0, SLJIT_IMM, 42);
 	sljit_emit_op_src(compiler, SLJIT_FAST_RETURN, SLJIT_R0, 0);
 
@@ -6313,6 +6735,7 @@ static void test63(void)
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, P), 3, 1, 0, 0, 2 * sizeof(sljit_sw));
 
+	/* buf[0-1] */
 	put_label[0] = sljit_emit_put_label(compiler, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
 
@@ -6323,6 +6746,7 @@ static void test63(void)
 	sljit_set_put_label(put_label[0], label[0]);
 	sljit_set_put_label(put_label[1], label[0]);
 
+	/* buf[2-3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)(buf + 2) - offs);
 	put_label[2] = sljit_emit_put_label(compiler, SLJIT_MEM1(SLJIT_R0), offs);
 
@@ -6333,6 +6757,7 @@ static void test63(void)
 	sljit_set_put_label(put_label[2], label[1]);
 	sljit_set_put_label(put_label[3], label[1]);
 
+	/* Return value */
 	put_label[4] = sljit_emit_put_label(compiler, SLJIT_RETURN_REG, 0);
 	sljit_set_put_label(put_label[4], label[0]);
 	sljit_emit_return(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0);
@@ -6422,36 +6847,44 @@ static void test64(void)
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, P), 3, 1, 0, 0, 2 * sizeof(sljit_sw));
 
+	/* buf[0] */
 	put_label[0] = sljit_emit_put_label(compiler, SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
 
+	/* buf[1] */
 	put_label[1] = sljit_emit_put_label(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_uw));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_uw), SLJIT_MEM1(SLJIT_SP), sizeof(sljit_uw));
 
 	sljit_set_put_label(put_label[0], &label[0]);
 	sljit_set_put_label(put_label[1], &label[0]);
 
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)(buf + 2) - offs1);
 	put_label[0] = sljit_emit_put_label(compiler, SLJIT_MEM1(SLJIT_R0), offs1);
 
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (offs1 + (sljit_sw)sizeof(sljit_uw)) >> 1);
 	put_label[1] = sljit_emit_put_label(compiler, SLJIT_MEM2(SLJIT_R0, SLJIT_R1), 1);
 
 	sljit_set_put_label(put_label[0], &label[1]);
 	sljit_set_put_label(put_label[1], &label[1]);
 
+	/* buf[4] */
 	put_label[0] = sljit_emit_put_label(compiler, SLJIT_R1, 0);
 	sljit_set_put_label(put_label[0], &label[2]);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_uw), SLJIT_R1, 0);
 
+	/* buf[5] */
 	put_label[0] = sljit_emit_put_label(compiler, SLJIT_R2, 0);
 	sljit_set_put_label(put_label[0], &label[3]);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_uw), SLJIT_R2, 0);
 
+	/* buf[6] */
 	put_label[0] = sljit_emit_put_label(compiler, SLJIT_R1, 0);
 	sljit_set_put_label(put_label[0], &label[4]);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_uw), SLJIT_R1, 0);
 
+	/* buf[7] */
 	put_label[0] = sljit_emit_put_label(compiler, SLJIT_RETURN_REG, 0);
 	sljit_set_put_label(put_label[0], &label[5]);
 	sljit_emit_return(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0);
@@ -6590,7 +7023,7 @@ static void test67(void)
 	/* First function, never returns. */
 	label = sljit_emit_label(compiler);
 	sljit_set_label(call, label);
-	sljit_emit_fast_enter(compiler, SLJIT_R1, 0);
+	sljit_emit_op_dst(compiler, SLJIT_FAST_ENTER, SLJIT_R1, 0);
 
 	call = sljit_emit_jump(compiler, SLJIT_FAST_CALL);
 
@@ -6599,7 +7032,7 @@ static void test67(void)
 
 	/* Second function, skips the first function. */
 	sljit_set_label(call, sljit_emit_label(compiler));
-	sljit_emit_fast_enter(compiler, SLJIT_R2, 0);
+	sljit_emit_op_dst(compiler, SLJIT_FAST_ENTER, SLJIT_R2, 0);
 
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 1);
 
@@ -6653,7 +7086,7 @@ static void test68(void)
 		/* Recursive fast call. */
 		label = sljit_emit_label(compiler);
 		sljit_set_label(call, label);
-		sljit_emit_fast_enter(compiler, SLJIT_R1, 0);
+		sljit_emit_op_dst(compiler, SLJIT_FAST_ENTER, SLJIT_R1, 0);
 
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 1);
 
@@ -6697,50 +7130,50 @@ static void test69(void)
 	FAILED(!compiler, "cannot create compiler\n");
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, P), 3, 1, 0, 0, 0);
-
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)1 << ((sizeof (sljit_sw) * 8) - 2));
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD | SLJIT_SET_OVERFLOW, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_R1, 0);
 	sljit_emit_label(compiler);
 	sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW | SLJIT_CURRENT_FLAGS_ADD);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_OVERFLOW);
-
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 5);
 	sljit_emit_op2(compiler, SLJIT_ADD | SLJIT_SET_OVERFLOW, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_R1, 0);
 	sljit_emit_label(compiler);
 	sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW | SLJIT_CURRENT_FLAGS_ADD);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_OVERFLOW);
-
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_MUL | SLJIT_SET_OVERFLOW, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_R1, 0);
 	sljit_emit_label(compiler);
 	sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_OVERFLOW);
-
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 5);
 	sljit_emit_op2(compiler, SLJIT_MUL | SLJIT_SET_OVERFLOW, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_R1, 0);
 	sljit_emit_label(compiler);
 	sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_OVERFLOW);
-
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 6);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 5);
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_R1, 0, SLJIT_R2, 0);
 	sljit_emit_label(compiler);
 	sljit_set_current_flags(compiler, SLJIT_SET_GREATER | SLJIT_CURRENT_FLAGS_SUB | SLJIT_CURRENT_FLAGS_COMPARE);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_GREATER);
-
+	/* buf[5] */
 	sljit_emit_op2u(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_R1, 0, SLJIT_R2, 0);
 	sljit_emit_label(compiler);
 	sljit_set_current_flags(compiler, SLJIT_SET_Z | SLJIT_CURRENT_FLAGS_SUB | SLJIT_CURRENT_FLAGS_COMPARE);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_ZERO);
-
+	/* buf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)(1u << 31));
 	sljit_emit_op2u(compiler, SLJIT_ADD32 | SLJIT_SET_Z, SLJIT_R1, 0, SLJIT_R1, 0);
 	sljit_emit_label(compiler);
 	sljit_set_current_flags(compiler, SLJIT_SET_Z | SLJIT_CURRENT_FLAGS_32 | SLJIT_CURRENT_FLAGS_ADD);
 	cond_set(compiler, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), SLJIT_ZERO);
-
+	/* buf[7] */
 	sljit_emit_op2u(compiler, SLJIT_SHL32 | SLJIT_SET_Z, SLJIT_R1, 0, SLJIT_IMM, 1);
 	sljit_emit_label(compiler);
 	sljit_set_current_flags(compiler, SLJIT_SET_Z | SLJIT_CURRENT_FLAGS_32);
@@ -7357,7 +7790,7 @@ static void test72(void)
 {
 	/* Test using all fpu registers. */
 	executable_code code;
-	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	struct sljit_compiler* compiler;
 	sljit_f64 buf[SLJIT_NUMBER_OF_FLOAT_REGISTERS];
 	sljit_f64 buf2[2];
 	struct sljit_jump *jump;
@@ -7370,12 +7803,10 @@ static void test72(void)
 		if (verbose)
 			printf("no fpu available, test72 skipped\n");
 		successful_tests++;
-		if (compiler)
-			sljit_free_compiler(compiler);
 		return;
 	}
 
-	/* Next test. */
+	compiler = sljit_create_compiler(NULL, NULL);
 	FAILED(!compiler, "cannot create compiler\n");
 
 	buf2[0] = 7.75;
@@ -7498,11 +7929,15 @@ static void test73(void)
 	FAILED(!compiler, "cannot create compiler\n");
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS4(VOID, 32_R, W, W_R, 32), 3, 2, 0, 0, 0);
+	/* wbuf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)&wbuf);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 0, SLJIT_S0, 0);
+	/* wbuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw), SLJIT_R2, 0);
+	/* ibuf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)&ibuf);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_R1), 0, SLJIT_R0, 0);
+	/* ibuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_s32), SLJIT_S1, 0);
 	sljit_emit_return_void(compiler);
 
@@ -7524,11 +7959,15 @@ static void test73(void)
 	FAILED(!compiler, "cannot create compiler\n");
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS4(VOID, 32, W_R, W, 32_R), 4, 2, 0, 0, 8192);
+	/* wbuf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&wbuf);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 0, SLJIT_R1, 0);
+	/* wbuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), sizeof(sljit_sw), SLJIT_S1, 0);
+	/* ibuf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&ibuf);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_R0), 0, SLJIT_S0, 0);
+	/* ibuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_R0), sizeof(sljit_s32), SLJIT_R3, 0);
 	sljit_emit_return_void(compiler);
 
@@ -7550,11 +7989,15 @@ static void test73(void)
 	FAILED(!compiler, "cannot create compiler\n");
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS4(VOID, 32_R, W_R, W_R, 32_R), 4, 1, 0, 0, SLJIT_MAX_LOCAL_SIZE);
+	/* wbuf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, (sljit_sw)&wbuf);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R1, 0);
+	/* wbuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R2, 0);
+	/* ibuf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, (sljit_sw)&ibuf);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
+	/* ibuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_s32), SLJIT_R3, 0);
 	sljit_emit_return_void(compiler);
 
@@ -7577,11 +8020,15 @@ static void test73(void)
 
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS4(VOID, W_R, W_R, 32_R, 32_R), 4, 1, 0, 0, SLJIT_MAX_LOCAL_SIZE);
 	sljit_set_context(compiler, 0, SLJIT_ARGS4(VOID, W_R, W_R, 32_R, 32_R), 4, 1, 0, 0, SLJIT_MAX_LOCAL_SIZE);
+	/* wbuf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, (sljit_sw)&wbuf);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
+	/* wbuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R1, 0);
+	/* ibuf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, (sljit_sw)&ibuf);
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R2, 0);
+	/* ibuf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_s32), SLJIT_R3, 0);
 	sljit_emit_return_void(compiler);
 
@@ -7604,10 +8051,14 @@ static void test73(void)
 		FAILED(!compiler, "cannot create compiler\n");
 
 		sljit_emit_enter(compiler, 0, SLJIT_ARGS4(VOID, F64, F64, F64, W_R), 1, 0, 3, 0, SLJIT_MAX_LOCAL_SIZE);
+		/* wbuf[0] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM0(), (sljit_sw)&wbuf, SLJIT_R0, 0);
+		/* dbuf[0] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&dbuf);
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_R0), 0, SLJIT_FR0, 0);
+		/* dbuf[1] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_R0), sizeof(sljit_f64), SLJIT_FR1, 0);
+		/* dbuf[2] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_R0), 2 * sizeof(sljit_f64), SLJIT_FR2, 0);
 
 		sljit_emit_return_void(compiler);
@@ -7629,11 +8080,15 @@ static void test73(void)
 		FAILED(!compiler, "cannot create compiler\n");
 
 		sljit_emit_enter(compiler, 0, SLJIT_ARGS4(VOID, F64, F64, W, W_R), 2, 1, 2, 0, SLJIT_MAX_LOCAL_SIZE);
+		/* wbuf[0] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&wbuf);
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 0, SLJIT_S0, 0);
+		/* wbuf[1] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), sizeof(sljit_sw), SLJIT_R1, 0);
+		/* dbuf[0] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&dbuf);
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_R0), 0, SLJIT_FR0, 0);
+		/* dbuf[1] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_R0), sizeof(sljit_f64), SLJIT_FR1, 0);
 
 		sljit_emit_return_void(compiler);
@@ -7802,7 +8257,7 @@ static void test75_set(struct sljit_compiler *compiler, sljit_s32 compare, sljit
 	sljit_s32 is_ordered;
 
 	if (sljit_cmp_info(type)) {
-		sljit_emit_fop1(compiler, compare | SLJIT_SET(type), left_fr, 0, right_fr, 0);
+		sljit_emit_fop1(compiler, compare | SLJIT_SET(type & 0xfe), left_fr, 0, right_fr, 0);
 		sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_R0, 0, type);
 		jump1 = sljit_emit_jump(compiler, type);
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 2);
@@ -7817,7 +8272,7 @@ static void test75_set(struct sljit_compiler *compiler, sljit_s32 compare, sljit
 		}
 		SLJIT_ASSERT(sljit_cmp_info(type) && sljit_cmp_info(SLJIT_UNORDERED) && sljit_cmp_info(SLJIT_ORDERED));
 
-		sljit_emit_fop1(compiler, compare | SLJIT_SET(type), left_fr, 0, right_fr, 0);
+		sljit_emit_fop1(compiler, compare | SLJIT_SET(type & 0xfe), left_fr, 0, right_fr, 0);
 		sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_R0, 0, type);
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 0);
 
@@ -7871,15 +8326,6 @@ static void test75(void)
 		sljit_s32 value1;
 	} sbuf[3];
 
-	dbuf[0].u.value1 = 0x7fffffff;
-	dbuf[0].u.value2 = 0x7fffffff;
-	dbuf[1].value = -13.0;
-	dbuf[2].value = 27.0;
-
-	sbuf[0].value1 = 0x7fffffff;
-	sbuf[1].value = -13.0;
-	sbuf[2].value = 27.0;
-
 	if (verbose)
 		printf("Run test75\n");
 
@@ -7890,12 +8336,21 @@ static void test75(void)
 		return;
 	}
 
-	for (i = 0; i < 96; i++)
-		bbuf[i] = -3;
-
 	compiler = sljit_create_compiler(NULL, NULL);
 	FAILED(!compiler, "cannot create compiler\n");
 
+	dbuf[0].u.value1 = 0x7fffffff;
+	dbuf[0].u.value2 = 0x7fffffff;
+	dbuf[1].value = -13.0;
+	dbuf[2].value = 27.0;
+
+	sbuf[0].value1 = 0x7fffffff;
+	sbuf[1].value = -13.0;
+	sbuf[2].value = 27.0;
+
+	for (i = 0; i < 96; i++)
+		bbuf[i] = -3;
+
 	sljit_emit_enter(compiler, 0, SLJIT_ARGS3(VOID, P, P, P), 3, 3, 6, 0, 0);
 
 	i = SLJIT_CMP_F64;
@@ -8171,7 +8626,6 @@ static void test76(void)
 	if (verbose)
 		printf("Run test76\n");
 
-	/* Next test. */
 	for (i = 0; i < 9; i++)
 		buf[i] = -1;
 
@@ -8191,8 +8645,11 @@ static void test76(void)
 	jump = sljit_emit_call(compiler, SLJIT_CALL_REG_ARG, SLJIT_ARGS4(W, W, W, W, W));
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)&buf);
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 0, SLJIT_R0, 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw), SLJIT_S0, 0);
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 2 * sizeof(sljit_sw), SLJIT_S1, 0);
 	sljit_emit_return_void(compiler);
 
@@ -8200,6 +8657,7 @@ static void test76(void)
 	sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG, SLJIT_ARGS4(W, W_R, W_R, W_R, W_R), 4, 2, 0, 0, 32);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, (sljit_sw)&buf);
+	/* buf[3-6] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_R1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_R2, 0);
@@ -8246,8 +8704,11 @@ static void test76(void)
 	jump = sljit_emit_call(compiler, SLJIT_CALL_REG_ARG, SLJIT_ARGS4(W, W, W, W, W));
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)&buf);
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 0, SLJIT_R0, 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw), SLJIT_S0, 0);
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 2 * sizeof(sljit_sw), SLJIT_S1, 0);
 	sljit_emit_return_void(compiler);
 
@@ -8256,6 +8717,7 @@ static void test76(void)
 	sljit_set_context(compiler, SLJIT_ENTER_REG_ARG | SLJIT_ENTER_KEEP(1), SLJIT_ARGS4(W, W_R, W_R, W_R, W_R), 6, 2, 0, 0, SLJIT_MAX_LOCAL_SIZE);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S1, 0, SLJIT_IMM, (sljit_sw)&buf);
+	/* buf[3-7] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_sw), SLJIT_S0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 4 * sizeof(sljit_sw), SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 5 * sizeof(sljit_sw), SLJIT_R1, 0);
@@ -8305,7 +8767,9 @@ static void test76(void)
 	jump = sljit_emit_call(compiler, SLJIT_CALL_REG_ARG, SLJIT_ARGS4(VOID, W, W, W, W));
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&buf);
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 0, SLJIT_S0, 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), sizeof(sljit_sw), SLJIT_S1, 0);
 	sljit_emit_return_void(compiler);
 
@@ -8313,6 +8777,7 @@ static void test76(void)
 	sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG | SLJIT_ENTER_KEEP(2), SLJIT_ARGS4(W, W_R, W_R, W_R, W_R), 4, 3, 0, 0, SLJIT_MAX_LOCAL_SIZE);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S2, 0, SLJIT_IMM, (sljit_sw)&buf);
+	/* buf[2-7] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S2), 2 * sizeof(sljit_sw), SLJIT_S0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S2), 3 * sizeof(sljit_sw), SLJIT_S1, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S2), 4 * sizeof(sljit_sw), SLJIT_R0, 0);
@@ -8358,9 +8823,13 @@ static void test76(void)
 	jump = sljit_emit_call(compiler, SLJIT_CALL_REG_ARG, SLJIT_ARGS0(W));
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)&buf);
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 0, SLJIT_R0, 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw), SLJIT_S0, 0);
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 2 * sizeof(sljit_sw), SLJIT_S1, 0);
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 3 * sizeof(sljit_sw), SLJIT_S2, 0);
 	sljit_emit_return_void(compiler);
 
@@ -8368,7 +8837,9 @@ static void test76(void)
 	sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG | SLJIT_ENTER_KEEP(2), SLJIT_ARGS0(W), 4, 3, 0, 0, SLJIT_MAX_LOCAL_SIZE);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&buf);
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 4 * sizeof(sljit_sw), SLJIT_S0, 0);
+	/* buf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 5 * sizeof(sljit_sw), SLJIT_S1, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -6278);
@@ -8387,9 +8858,12 @@ static void test76(void)
 	sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG, SLJIT_ARGS4(W, W_R, W_R, W_R, W_R), 4, 2, 0, 0, 256);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, (sljit_sw)&buf);
+	/* buf[6] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), SLJIT_R0, 0);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_R1, 0);
+	/* buf[7] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_sw), SLJIT_R2, 0);
+	/* buf[8] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw), SLJIT_R3, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, -1);
@@ -8428,8 +8902,11 @@ static void test76(void)
 		sljit_emit_enter(compiler, 0, SLJIT_ARGS0(VOID), 2, 3, 3, 0, 0);
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)&dbuf);
+		/* dbuf[0] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, SLJIT_MEM1(SLJIT_R1), 0);
+		/* dbuf[1] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_f64));
+		/* dbuf[2] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR2, 0, SLJIT_MEM1(SLJIT_R1), 2 * sizeof(sljit_f64));
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 1706);
@@ -8440,9 +8917,13 @@ static void test76(void)
 		jump = sljit_emit_call(compiler, SLJIT_CALL_REG_ARG, SLJIT_ARGS4(W, F64, F64, F64, W));
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)&buf);
+		/* buf[0] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 0, SLJIT_R0, 0);
+		/* buf[1] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw), SLJIT_S0, 0);
+		/* buf[2] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 2 * sizeof(sljit_sw), SLJIT_S1, 0);
+		/* buf[3] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 3 * sizeof(sljit_sw), SLJIT_S2, 0);
 		sljit_emit_return_void(compiler);
 
@@ -8450,12 +8931,17 @@ static void test76(void)
 		sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG | SLJIT_ENTER_KEEP(1), SLJIT_ARGS4(W, F64, F64, F64, W_R), 1, 3, 3, 0, SLJIT_MAX_LOCAL_SIZE);
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S1, 0, SLJIT_IMM, (sljit_sw)&buf);
+		/* buf[4] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 4 * sizeof(sljit_sw), SLJIT_S0, 0);
+		/* buf[5] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), 5 * sizeof(sljit_sw), SLJIT_R0, 0);
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S1, 0, SLJIT_IMM, (sljit_sw)&dbuf);
+		/* dbuf[0] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S1), 0, SLJIT_FR2, 0);
+		/* dbuf[1] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_f64), SLJIT_FR0, 0);
+		/* dbuf[2] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S1), 2 * sizeof(sljit_f64), SLJIT_FR1, 0);
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 2784);
@@ -8506,8 +8992,11 @@ static void test76(void)
 		jump = sljit_emit_call(compiler, SLJIT_CALL_REG_ARG, SLJIT_ARGS4(VOID, F64, W, W, W));
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&buf);
+		/* buf[0] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 0, SLJIT_S0, 0);
+		/* buf[1] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), sizeof(sljit_sw), SLJIT_S1, 0);
+		/* buf[2] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 2 * sizeof(sljit_sw), SLJIT_S2, 0);
 		sljit_emit_return_void(compiler);
 
@@ -8515,10 +9004,15 @@ static void test76(void)
 		sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG | SLJIT_ENTER_KEEP(2), SLJIT_ARGS4(VOID, F64, W_R, W_R, W_R), 3, 3, 3, 0, SLJIT_MAX_LOCAL_SIZE);
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S2, 0, SLJIT_IMM, (sljit_sw)&buf);
+		/* buf[3] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S2), 3 * sizeof(sljit_sw), SLJIT_S0, 0);
+		/* buf[4] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S2), 4 * sizeof(sljit_sw), SLJIT_S1, 0);
+		/* buf[5] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S2), 5 * sizeof(sljit_sw), SLJIT_R0, 0);
+		/* buf[6] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S2), 6 * sizeof(sljit_sw), SLJIT_R1, 0);
+		/* buf[7] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S2), 7 * sizeof(sljit_sw), SLJIT_R2, 0);
 
 		sljit_emit_fop1(compiler, SLJIT_NEG_F64, SLJIT_MEM0(), (sljit_sw)&dbuf, SLJIT_FR0, 0);
@@ -8567,9 +9061,13 @@ static void test76(void)
 		jump = sljit_emit_call(compiler, SLJIT_CALL_REG_ARG, SLJIT_ARGS0(W));
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)&buf);
+		/* buf[0] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 0, SLJIT_R0, 0);
+		/* buf[1] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw), SLJIT_S0, 0);
+		/* buf[2] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 2 * sizeof(sljit_sw), SLJIT_S1, 0);
+		/* buf[3] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 3 * sizeof(sljit_sw), SLJIT_S2, 0);
 		sljit_emit_return_void(compiler);
 
@@ -8577,12 +9075,17 @@ static void test76(void)
 		sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG | SLJIT_ENTER_KEEP(2), SLJIT_ARGS0(W), 1, 3, 3, 0, SLJIT_MAX_LOCAL_SIZE);
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&buf);
+		/* buf[4] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 4 * sizeof(sljit_sw), SLJIT_S0, 0);
+		/* buf[5] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 5 * sizeof(sljit_sw), SLJIT_S1, 0);
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&dbuf);
+		/* dbuf[0] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, SLJIT_MEM1(SLJIT_R0), 0);
+		/* dbuf[1] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_R0), sizeof(sljit_f64));
+		/* dbuf[2] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR2, 0, SLJIT_MEM1(SLJIT_R0), 2 * sizeof(sljit_f64));
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 1706);
@@ -8597,11 +9100,15 @@ static void test76(void)
 		sljit_set_label(jump, sljit_emit_label(compiler));
 		sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG, SLJIT_ARGS4(W, F64, F64, F64, W_R), 1, 0, 3, 0, 256);
 
+		/* buf[6] */
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM0(), (sljit_sw)&buf[6], SLJIT_R0, 0);
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&dbuf);
+		/* dbuf[0] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_R0), 0, SLJIT_FR2, 0);
+		/* dbuf[1] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_R0), sizeof(sljit_f64), SLJIT_FR0, 0);
+		/* dbuf[2] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_R0), 2 * sizeof(sljit_f64), SLJIT_FR1, 0);
 
 		sljit_emit_return(compiler, SLJIT_MOV, SLJIT_IMM, 5074);
@@ -8755,31 +9262,31 @@ static void test77(void)
 #endif /* SLJIT_UNALIGNED */
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 18 >> 1);
-	sljit_emit_mem(compiler, SLJIT_MOV_S32 | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16, SLJIT_R0, SLJIT_MEM2(SLJIT_S1, SLJIT_R1), 1);
+	sljit_emit_mem(compiler, SLJIT_MOV_S32 | SLJIT_MEM_UNALIGNED_16, SLJIT_R0, SLJIT_MEM2(SLJIT_S1, SLJIT_R1), 1);
 	/* wbuf[8] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw), SLJIT_R0, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV_S32, SLJIT_R0, 0, SLJIT_MEM0(), (sljit_sw)&ibuf);
 	/* bbuf[18] */
-	sljit_emit_mem(compiler, SLJIT_MOV_S32 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16, SLJIT_R0, SLJIT_MEM2(SLJIT_S1, SLJIT_R1), 1);
+	sljit_emit_mem(compiler, SLJIT_MOV_S32 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED_16, SLJIT_R0, SLJIT_MEM2(SLJIT_S1, SLJIT_R1), 1);
 
-	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16, SLJIT_R0, SLJIT_MEM0(), (sljit_sw)bbuf + 22);
+	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_UNALIGNED_16, SLJIT_R0, SLJIT_MEM0(), (sljit_sw)bbuf + 22);
 	/* wbuf[9] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 9 * sizeof(sljit_sw), SLJIT_R0, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S0), 0);
 	/* bbuf[22] */
-	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16, SLJIT_R0, SLJIT_MEM0(), (sljit_sw)bbuf + 22);
+	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED_16, SLJIT_R0, SLJIT_MEM0(), (sljit_sw)bbuf + 22);
 
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_S1, 0, SLJIT_IMM, 128);
-	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_32, SLJIT_R0, SLJIT_MEM1(SLJIT_R0), -128 + 32);
+	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_UNALIGNED_32, SLJIT_R0, SLJIT_MEM1(SLJIT_R0), -128 + 32);
 	/* wbuf[10] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 10 * sizeof(sljit_sw), SLJIT_R0, 0);
 
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_S1, 0, SLJIT_IMM, 128);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S0), 0);
 	/* bbuf[32] */
-	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_32, SLJIT_R0, SLJIT_MEM1(SLJIT_R1), -128 + 32);
+	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED_32, SLJIT_R0, SLJIT_MEM1(SLJIT_R1), -128 + 32);
 
 	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_UNALIGNED, SLJIT_R0, SLJIT_MEM1(SLJIT_S0), 0);
 	/* wbuf[11] */
@@ -8869,35 +9376,35 @@ static void test77(void)
 		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED, SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), 100000 + 5);
 #endif /* SLJIT_FPU_UNALIGNED */
 
-		sljit_emit_fmem(compiler, SLJIT_MOV_F32 | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16, SLJIT_FR0, SLJIT_MEM1(SLJIT_S2), 14);
+		sljit_emit_fmem(compiler, SLJIT_MOV_F32 | SLJIT_MEM_UNALIGNED_16, SLJIT_FR0, SLJIT_MEM1(SLJIT_S2), 14);
 		/* sbuf[2] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_MEM1(SLJIT_S1), 2 * sizeof(sljit_f32), SLJIT_FR0, 0);
 
 		sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR0, 0, SLJIT_MEM1(SLJIT_S1), 0);
 		/* bbuf[14] */
-		sljit_emit_fmem(compiler, SLJIT_MOV_F32 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16, SLJIT_FR0, SLJIT_MEM1(SLJIT_S2), 14);
+		sljit_emit_fmem(compiler, SLJIT_MOV_F32 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED_16, SLJIT_FR0, SLJIT_MEM1(SLJIT_S2), 14);
 
 		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 18 >> 1);
-		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16, SLJIT_FR0, SLJIT_MEM2(SLJIT_S2, SLJIT_R0), 1);
+		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_UNALIGNED_16, SLJIT_FR0, SLJIT_MEM2(SLJIT_S2, SLJIT_R0), 1);
 		/* dbuf[2] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_f64), SLJIT_FR0, 0);
 
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, SLJIT_MEM1(SLJIT_S0), 0);
 		/* bbuf[18] */
-		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16, SLJIT_FR0, SLJIT_MEM2(SLJIT_S2, SLJIT_R0), 1);
+		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED_16, SLJIT_FR0, SLJIT_MEM2(SLJIT_S2, SLJIT_R0), 1);
 
 		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_S2, 0, SLJIT_IMM, 128);
-		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_32, SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), -128 + 28);
+		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_UNALIGNED_32, SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), -128 + 28);
 		/* dbuf[3] */
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_f64), SLJIT_FR0, 0);
 
 		sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, SLJIT_MEM1(SLJIT_S0), 0);
 		/* bbuf[28] */
-		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_32, SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), -128 + 28);
+		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED_32, SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), -128 + 28);
 
-		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_32, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 0);
+		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_UNALIGNED_32, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 0);
 		/* dbuf[4] */
-		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_32, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_f64));
+		sljit_emit_fmem(compiler, SLJIT_MOV_F64 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED_32, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_f64));
 
 		sljit_emit_return_void(compiler);
 
@@ -8951,8 +9458,11 @@ static void test78(void)
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM0(), (sljit_sw)(buf + 6), SLJIT_R0, 0);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)&buf);
+	/* buf[3] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 3 * sizeof(sljit_sw), SLJIT_S0, 0);
+	/* buf[4] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 4 * sizeof(sljit_sw), SLJIT_S1, 0);
+	/* buf[5] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 5 * sizeof(sljit_sw), SLJIT_S2, 0);
 
 	for (i = 1; i < SLJIT_NUMBER_OF_REGISTERS - 3; i++)
@@ -8964,8 +9474,11 @@ static void test78(void)
 	sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG | SLJIT_ENTER_KEEP(3), SLJIT_ARGS4(VOID, W_R, W_R, W_R, W_R), 4, 3, 0, 0, SLJIT_MAX_LOCAL_SIZE);
 	sljit_set_context(compiler, SLJIT_ENTER_REG_ARG | SLJIT_ENTER_KEEP(3), SLJIT_ARGS4(VOID, W_R, W_R, W_R, W_R), 4, 3, 0, 0, SLJIT_MAX_LOCAL_SIZE);
 
+	/* buf[0] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM0(), (sljit_sw)(buf + 0), SLJIT_S0, 0);
+	/* buf[1] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM0(), (sljit_sw)(buf + 1), SLJIT_S1, 0);
+	/* buf[2] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM0(), (sljit_sw)(buf + 2), SLJIT_S2, 0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, 6501);
@@ -9002,7 +9515,6 @@ static void test78(void)
 
 static void test79(void)
 {
-#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) || (defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM)
 	/* Test register pair movement. */
 	executable_code code;
 	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
@@ -9042,7 +9554,7 @@ static void test79(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 5814);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 7201);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R2, 0, SLJIT_S0, 0, SLJIT_IMM, 6 * sizeof(sljit_sw) + 77);
-	/* buf[6], buf[7] */
+	/* buf[6-7] */
 	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE, SLJIT_REG_PAIR(SLJIT_R0, SLJIT_R1), SLJIT_MEM1(SLJIT_R2), -77);
 
 	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R0, 0, SLJIT_S0, 0, SLJIT_IMM, 36);
@@ -9094,12 +9606,12 @@ static void test79(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R3, 0, SLJIT_IMM, 3065);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, 7481);
 	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_S2, 0, SLJIT_S0, 0, SLJIT_IMM, 0x7f7f0 + 20 * sizeof(sljit_sw));
-	/* buf[20], buf[21] */
+	/* buf[20-21] */
 	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE, SLJIT_REG_PAIR(SLJIT_R3, SLJIT_R4), SLJIT_MEM1(SLJIT_S2), -0x7f7f0);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S1, 0, SLJIT_IMM, 3275);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S3, 0, SLJIT_IMM, -8714);
-	/* buf[22], buf[23] */
+	/* buf[22-23] */
 	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE, SLJIT_REG_PAIR(SLJIT_S1, SLJIT_S3), SLJIT_MEM0(), (sljit_sw)(buf + 22));
 
 	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_LOAD | SLJIT_MEM_UNALIGNED, SLJIT_REG_PAIR(SLJIT_R0, SLJIT_R1), SLJIT_MEM0(), (sljit_sw)bbuf + 1);
@@ -9112,10 +9624,10 @@ static void test79(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S4, 0, SLJIT_IMM, -9035);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)bbuf + 18 - 0x7f0f);
 	/* bbuf[18], buf[18] + sizeof(sljit_sw) */
-	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16, SLJIT_REG_PAIR(SLJIT_R4, SLJIT_S4), SLJIT_MEM1(SLJIT_R0), 0x7f0f);
+	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED_16, SLJIT_REG_PAIR(SLJIT_R4, SLJIT_S4), SLJIT_MEM1(SLJIT_R0), 0x7f0f);
 
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S1, 0, SLJIT_IMM, (sljit_sw)bbuf + 34 - 0xfff);
-	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_LOAD | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16, SLJIT_REG_PAIR(SLJIT_S1, SLJIT_R0), SLJIT_MEM1(SLJIT_S1), 0xfff);
+	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_LOAD | SLJIT_MEM_UNALIGNED_16, SLJIT_REG_PAIR(SLJIT_S1, SLJIT_R0), SLJIT_MEM1(SLJIT_S1), 0xfff);
 	/* buf[26] */
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 26 * sizeof(sljit_sw), SLJIT_S1, 0);
 	/* buf[27] */
@@ -9138,7 +9650,7 @@ static void test79(void)
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, (sljit_sw)bbuf + 50 + 0xfff);
 	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R3, 0, SLJIT_IMM, -9035);
 	/* bbuf[50], buf[50] + sizeof(sljit_sw) */
-	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16, SLJIT_REG_PAIR(SLJIT_R2, SLJIT_R3), SLJIT_MEM1(SLJIT_R2), -0xfff);
+	sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED_16, SLJIT_REG_PAIR(SLJIT_R2, SLJIT_R3), SLJIT_MEM1(SLJIT_R2), -0xfff);
 
 	sljit_emit_return_void(compiler);
 
@@ -9184,7 +9696,1406 @@ static void test79(void)
 	FAILED(buf[32] != -1, "test79 case 33 failed\n");
 
 	sljit_free_code(code.code, NULL);
-#endif /* SLJIT_CONFIG_X86 || SLJIT_CONFIG_ARM */
+	successful_tests++;
+}
+
+static void test80(void)
+{
+	/* Test masked shift: SLJIT_MSHL / SLJIT_MLSHR / SLJIT_MASHR (and 32 bit forms) with shift amounts outside [0, operand width); the checks below expect the amount to act modulo the operand width. */
+	executable_code code;
+	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	sljit_sw buf[8];
+	sljit_s32 ibuf[8];
+	sljit_s32 i;
+
+	if (verbose)
+		printf("Run test80\n");
+
+	FAILED(!compiler, "cannot create compiler\n");
+
+	for (i = 0; i < 8; i++)
+		buf[i] = -1;
+	for (i = 0; i < 8; i++)
+		ibuf[i] = -1;
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(VOID, W, W), 5, 5, 0, 0, 2 * sizeof(sljit_sw));
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0x1234);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 8 * sizeof(sljit_sw) + 4);
+	sljit_emit_op2(compiler, SLJIT_MSHL, SLJIT_R2, 0, SLJIT_R0, 0, SLJIT_R1, 0);
+	/* buf[0]: 0x1234 shifted left by (bits in sljit_sw) + 4, amount in a register; case 1 expects 0x12340 (effective shift of 4) */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R2, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_IMM, 8 * sizeof(sljit_sw));
+	/* buf[1]: shift amount equal to the bit width of sljit_sw, read from the stack; case 2 expects 0x1234 unchanged */
+	sljit_emit_op2(compiler, SLJIT_MSHL, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 0x5678);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, 32 + 8);
+	sljit_emit_op2(compiler, SLJIT_MSHL32, SLJIT_R2, 0, SLJIT_R0, 0, SLJIT_R1, 0);
+	/* ibuf[0]: 32 bit left shift of 0x5678 by 32 + 8; case 3 expects 0x567800 (effective shift of 8) */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 0, SLJIT_R2, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R3, 0, SLJIT_IMM, 1);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R4, 0, SLJIT_IMM, -2);
+	/* ibuf[1]: 32 bit left shift of 1 by -2 with the zero flag requested; case 4 expects 1 << 30 */
+	sljit_emit_op2(compiler, SLJIT_MSHL32 | SLJIT_SET_Z, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_s32), SLJIT_R3, 0, SLJIT_R4, 0);
+	/* buf[2]: NOT_ZERO flag produced by the shift above; case 5 expects 1 */
+	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_NOT_ZERO);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, -1);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 8 * sizeof(sljit_sw) + 4);
+	sljit_emit_op2(compiler, SLJIT_MLSHR | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_R1, 0, SLJIT_R2, 0);
+	/* buf[3]: logical right shift of -1 by (bits in sljit_sw) + 4; case 6 expects (sljit_uw)-1 >> 4 */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_R0, 0);
+	/* buf[4]: NOT_ZERO flag read after the store above (flag was requested by the MLSHR); case 7 expects 1 */
+	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_NOT_ZERO);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_IMM, 0x5678);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), SLJIT_IMM, -32);
+	sljit_emit_op2(compiler, SLJIT_MLSHR32, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32));
+	/* ibuf[2]: 32 bit logical right shift by -32 with destination and both sources on the stack; case 8 expects 0x5678 unchanged */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 2 * sizeof(sljit_s32), SLJIT_MEM1(SLJIT_SP), 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 0x345678);
+	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R1, 0, SLJIT_S1, 0, SLJIT_IMM, 0x123000 - 3 * sizeof(sljit_s32));
+	/* ibuf[3]: immediate shift amount 32 + 4, destination reached as R1 + 0x123000 (R1 was biased above); case 9 expects 0x34567 */
+	sljit_emit_op2(compiler, SLJIT_MLSHR32, SLJIT_MEM1(SLJIT_R1), 0x123000, SLJIT_R0, 0, SLJIT_IMM, 32 + 4);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R3, 0, SLJIT_IMM, -0x100);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 8 * sizeof(sljit_sw) + 4);
+	sljit_emit_op2(compiler, SLJIT_MASHR, SLJIT_R1, 0, SLJIT_R3, 0, SLJIT_R2, 0);
+	/* buf[5]: arithmetic right shift of -0x100 by (bits in sljit_sw) + 4; case 10 expects -0x10 */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_R1, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), SLJIT_IMM, -0x100);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R2, 0, SLJIT_IMM, -32 + 1);
+	sljit_emit_op2(compiler, SLJIT_MASHR32, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), SLJIT_R2, 0);
+	/* ibuf[4]: 32 bit arithmetic right shift of -0x100 by -31, performed in place on the stack; case 11 expects -0x80 */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 4 * sizeof(sljit_s32), SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32));
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 5 * sizeof(sljit_s32), SLJIT_IMM, 0x7fffffff);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, -1);
+	/* ibuf[5]: 0x7fffffff (stored just above) shifted right in place by -1 with the zero flag requested; case 12 expects 0 */
+	sljit_emit_op2(compiler, SLJIT_MLSHR32 | SLJIT_SET_Z, SLJIT_MEM1(SLJIT_S1), 5 * sizeof(sljit_s32), SLJIT_MEM1(SLJIT_S1), 5 * sizeof(sljit_s32), SLJIT_R0, 0);
+	/* buf[6]: NOT_ZERO flag produced by the shift above; case 13 expects 0 because the result is zero */
+	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), SLJIT_NOT_ZERO);
+
+#if (defined SLJIT_MASKED_SHIFT && SLJIT_MASKED_SHIFT)
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 12344321);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, (8 * sizeof(sljit_sw)) + 1);
+	/* buf[7]: plain SLJIT_SHL (not MSHL), emitted only when SLJIT_MASKED_SHIFT is set; case 14 expects 24688642 (12344321 << 1) */
+	sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_sw), SLJIT_R1, 0, SLJIT_R2, 0);
+#endif /* SLJIT_MASKED_SHIFT */
+#if (defined SLJIT_MASKED_SHIFT32 && SLJIT_MASKED_SHIFT32)
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, 24688643);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R2, 0, SLJIT_IMM, (8 * sizeof(sljit_s32)) + 1);
+	/* ibuf[6]: plain SLJIT_LSHR32 (not MLSHR32), emitted only when SLJIT_MASKED_SHIFT32 is set; case 15 expects 12344321 (24688643 >> 1) */
+	sljit_emit_op2(compiler, SLJIT_LSHR32, SLJIT_MEM1(SLJIT_S1), 6 * sizeof(sljit_s32), SLJIT_R1, 0, SLJIT_R2, 0);
+#endif /* SLJIT_MASKED_SHIFT32 */
+
+	sljit_emit_return_void(compiler);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	sljit_free_compiler(compiler);
+
+	code.func2((sljit_sw)buf, (sljit_sw)ibuf);
+
+	FAILED(buf[0] != 0x12340, "test80 case 1 failed\n");
+	FAILED(buf[1] != 0x1234, "test80 case 2 failed\n");
+	FAILED(ibuf[0] != 0x567800, "test80 case 3 failed\n");
+	FAILED(ibuf[1] != (sljit_sw)1 << 30, "test80 case 4 failed\n");
+	FAILED(buf[2] != 1, "test80 case 5 failed\n");
+	FAILED(buf[3] != ((sljit_uw)-1 >> 4), "test80 case 6 failed\n");
+	FAILED(buf[4] != 1, "test80 case 7 failed\n");
+	FAILED(ibuf[2] != 0x5678, "test80 case 8 failed\n");
+	FAILED(ibuf[3] != 0x34567, "test80 case 9 failed\n");
+	FAILED(buf[5] != -0x10, "test80 case 10 failed\n");
+	FAILED(ibuf[4] != -0x80, "test80 case 11 failed\n");
+	FAILED(ibuf[5] != 0, "test80 case 12 failed\n");
+	FAILED(buf[6] != 0, "test80 case 13 failed\n");
+#if (defined SLJIT_MASKED_SHIFT && SLJIT_MASKED_SHIFT)
+	FAILED(buf[7] != 24688642, "test80 case 14 failed\n");
+#endif /* SLJIT_MASKED_SHIFT */
+#if (defined SLJIT_MASKED_SHIFT32 && SLJIT_MASKED_SHIFT32)
+	FAILED(ibuf[6] != 12344321, "test80 case 15 failed\n");
+#endif /* SLJIT_MASKED_SHIFT32 */
+
+	sljit_free_code(code.code, NULL);
+	successful_tests++;
+}
+
+static void test81(void)
+{
+	/* Test returning f32 / f64 values from generated code with sljit_emit_return. */
+	executable_code code;
+	struct sljit_compiler* compiler;
+	struct sljit_jump* jump;
+	sljit_f64 dbuf[2];
+	sljit_f32 sbuf[2];
+
+	if (verbose)
+		printf("Run test81\n");
+
+	if (!sljit_has_cpu_feature(SLJIT_HAS_FPU)) {
+		if (verbose)
+			printf("no fpu available, test81 skipped\n");
+		successful_tests++;
+		return;
+	}
+
+	/* Case 1: load an f64 from the pointer argument into FR2 and return FR2. */
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(F64, W), 0, 1, 3, 0, 0);
+	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR2, 0, SLJIT_MEM1(SLJIT_S0), 0);
+	sljit_emit_return(compiler, SLJIT_MOV_F64, SLJIT_FR2, 0);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	sljit_free_compiler(compiler);
+
+	dbuf[0] = 35.125;
+	FAILED(code.test81_f2((sljit_sw)dbuf) != 35.125, "test81 case 1 failed\n");
+
+	sljit_free_code(code.code, NULL);
+
+	/* Case 2: load an f32 into SLJIT_RETURN_FREG and return that same register. */
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(F32, W), 0, 1, 1, 0, 0);
+	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_RETURN_FREG, 0, SLJIT_MEM1(SLJIT_S0), 0);
+	sljit_emit_return(compiler, SLJIT_MOV_F32, SLJIT_RETURN_FREG, 0);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	sljit_free_compiler(compiler);
+
+	sbuf[0] = -9027.5;
+	FAILED(code.test81_f1((sljit_sw)sbuf) != -9027.5, "test81 case 2 failed\n");
+
+	sljit_free_code(code.code, NULL);
+
+	/* Case 3: copy an f32 to the local area at [SP] and return it from that memory operand. */
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(F32, W), 0, 1, 1, 0, sizeof(sljit_f32));
+	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_MEM1(SLJIT_S0), 0);
+	sljit_emit_return(compiler, SLJIT_MOV_F32, SLJIT_MEM1(SLJIT_SP), 0);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	sljit_free_compiler(compiler);
+
+	sbuf[0] = -6.75;
+	FAILED(code.test81_f1((sljit_sw)sbuf) != -6.75, "test81 case 3 failed\n");
+
+	sljit_free_code(code.code, NULL);
+
+	/* Case 4: copy an f64 to [SP + sizeof(sljit_f64)] and return it from that memory operand. */
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(F64, W), 0, 1, 1, 0, 2 * sizeof(sljit_f64));
+	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_f64), SLJIT_MEM1(SLJIT_S0), 0);
+	sljit_emit_return(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_f64));
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	sljit_free_compiler(compiler);
+
+	dbuf[0] = 45.125;
+	FAILED(code.test81_f2((sljit_sw)dbuf) != 45.125, "test81 case 4 failed\n");
+
+	sljit_free_code(code.code, NULL);
+
+	/* Case 5: inner REG_ARG function returns the f64 at [R0 + 33] (R0 = dbuf - 33, so dbuf[0]); the caller stores the result in dbuf[1]. */
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS0(VOID), 1, 0, 1, 0, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)dbuf - 33);
+	jump = sljit_emit_call(compiler, SLJIT_CALL_REG_ARG, SLJIT_ARGS1(F64, W));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)dbuf);
+	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_R0), sizeof(sljit_f64), SLJIT_RETURN_FREG, 0);
+	sljit_emit_return_void(compiler);
+
+	sljit_set_label(jump, sljit_emit_label(compiler));
+	sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG, SLJIT_ARGS1(F64, W_R), 1, 0, 1, 0, 0);
+	sljit_emit_return(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_R0), 33);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	sljit_free_compiler(compiler);
+
+	dbuf[0] = 2571.75;
+	dbuf[1] = 0;
+	code.func0();
+	FAILED(dbuf[1] != 2571.75, "test81 case 5 failed\n");
+
+	sljit_free_code(code.code, NULL);
+
+	/* Case 6: inner REG_ARG function returns sbuf[0] (loaded via its absolute address) as f32; the caller stores the result in sbuf[1]. */
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS0(VOID), 1, 0, 1, 0, 0);
+	jump = sljit_emit_call(compiler, SLJIT_CALL_REG_ARG, SLJIT_ARGS0(F32));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)sbuf);
+	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_MEM1(SLJIT_R0), sizeof(sljit_f32), SLJIT_RETURN_FREG, 0);
+	sljit_emit_return_void(compiler);
+
+	sljit_set_label(jump, sljit_emit_label(compiler));
+	sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG, SLJIT_ARGS0(F32), 0, 0, 1, 0, 0);
+	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_RETURN_FREG, 0, SLJIT_MEM0(), (sljit_sw)sbuf);
+	sljit_emit_return(compiler, SLJIT_MOV_F32, SLJIT_RETURN_FREG, 0);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	sljit_free_compiler(compiler);
+
+	sbuf[0] = 6310.25;
+	sbuf[1] = 0;
+	code.func0();
+	FAILED(sbuf[1] != 6310.25, "test81 case 6 failed\n");
+
+	sljit_free_code(code.code, NULL);
+
+	successful_tests++;
+}
+
+static void test82(void)
+{
+	/* Test return_to operation (sljit_emit_return_to) with memory, register and immediate targets. */
+	executable_code code, code2;
+	struct sljit_compiler* compiler;
+	struct sljit_jump* jump;
+	struct sljit_label* label;
+	sljit_s32 i;
+	sljit_sw buf[3];
+
+	if (verbose)
+		printf("Run test82\n");
+
+	/* Cases 1-2: callee with 256 bytes of locals; return_to target is read from [R1], i.e. buf[0], which holds the address of 'label'. */
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, P), 2, 1, 0, 0, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -7602);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_S0, 0);
+	jump = sljit_emit_call(compiler, SLJIT_CALL, SLJIT_ARGS2(W, W, W));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM0(), 0);
+	label = sljit_emit_label(compiler);
+	/* buf[0] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_RETURN_REG, 0);
+	sljit_emit_op0(compiler, SLJIT_SKIP_FRAMES_BEFORE_RETURN);
+	sljit_emit_return_void(compiler);
+
+	sljit_set_label(jump, sljit_emit_label(compiler));
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(VOID, W_R, W_R), 2, 0, 0, 0, 256);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_IMM, -1);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), 256 - sizeof(sljit_sw), SLJIT_IMM, -1);
+	/* buf[1] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw), SLJIT_R0, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, 8945);
+	sljit_emit_return_to(compiler, SLJIT_MEM1(SLJIT_R1), 0);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+
+	buf[0] = (sljit_sw)sljit_get_label_addr(label);
+	buf[1] = 0;
+
+	sljit_free_compiler(compiler);
+
+	code.func1((sljit_sw)buf);
+	FAILED(buf[0] != 8945, "test82 case 1 failed\n");
+	FAILED(buf[1] != -7602, "test82 case 2 failed\n");
+
+	sljit_free_code(code.code, NULL);
+
+	/* Cases 3-5: REG_ARG callee with SLJIT_ENTER_KEEP(1) and SLJIT_MAX_LOCAL_SIZE locals; target is in S(i) for i < 2, or in a stack slot for i == 2. */
+
+	for (i = 0; i < 3; i++) {
+		compiler = sljit_create_compiler(NULL, NULL);
+		FAILED(!compiler, "cannot create compiler\n");
+
+		sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, P), 2, 1, 0, 0, 0);
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 6032);
+		jump = sljit_emit_call(compiler, SLJIT_CALL_REG_ARG, SLJIT_ARGS1(W, W));
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM0(), 0);
+		label = sljit_emit_label(compiler);
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)buf);
+		/* buf[0] */
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 0, SLJIT_RETURN_REG, 0);
+		/* buf[2] */
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 2 * sizeof(sljit_sw), SLJIT_S0, 0);
+		sljit_emit_op0(compiler, SLJIT_SKIP_FRAMES_BEFORE_RETURN);
+		sljit_emit_return_void(compiler);
+
+		sljit_set_label(jump, sljit_emit_label(compiler));
+		sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG | SLJIT_ENTER_KEEP(1), SLJIT_ARGS1(VOID, W_R), 2, i == 1 ? 2 : 1, 0, 0, SLJIT_MAX_LOCAL_SIZE);
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_IMM, -1);
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw), SLJIT_R0, 0);
+		/* buf[1] */
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw));
+		if (i == 2)
+			sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), 2 * sizeof(sljit_sw), SLJIT_MEM1(SLJIT_S0), 0);
+		else
+			sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S(i), 0, SLJIT_MEM1(SLJIT_S0), 0);
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), SLJIT_MAX_LOCAL_SIZE - sizeof(sljit_sw), SLJIT_IMM, -1);
+		if (i != 0)
+			sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, -3890);
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, 7145);
+		if (i == 2)
+			sljit_emit_return_to(compiler, SLJIT_MEM1(SLJIT_SP), 2 * sizeof(sljit_sw));
+		else
+			sljit_emit_return_to(compiler, SLJIT_S(i), 0);
+
+		code.code = sljit_generate_code(compiler);
+		CHECK(compiler);
+
+		buf[0] = (sljit_sw)sljit_get_label_addr(label);
+		buf[1] = 0;
+		buf[2] = 0;
+
+		sljit_free_compiler(compiler);
+
+		code.func1((sljit_sw)buf);
+		FAILED(buf[0] != 7145, "test82 case 3 failed\n");
+		FAILED(buf[1] != 6032, "test82 case 4 failed\n");
+		if (i != 0)
+			FAILED(buf[2] != -3890, "test82 case 5 failed\n");
+
+		sljit_free_code(code.code, NULL);
+	}
+
+	/* Cases 6-7: target is read from [R1 - 0x1000] (R1 = buf + 0x1000, so buf[0]); callee local size is 0, 512 or 32768 depending on i. */
+
+	for (i = 0; i < 3; i++) {
+		compiler = sljit_create_compiler(NULL, NULL);
+		FAILED(!compiler, "cannot create compiler\n");
+
+		sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, P_R), 2, 1, 0, 0, 0);
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_R0, 0);
+		jump = sljit_emit_call(compiler, SLJIT_CALL, SLJIT_ARGS1(W, W));
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM0(), 0);
+		label = sljit_emit_label(compiler);
+
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)buf);
+		/* buf[0] */
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_RETURN_REG, 0);
+		sljit_emit_op0(compiler, SLJIT_SKIP_FRAMES_BEFORE_RETURN);
+		sljit_emit_return_void(compiler);
+
+		sljit_set_label(jump, sljit_emit_label(compiler));
+		sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, W_R), 2, 1, 0, 0, (i == 0) ? 0 : (i == 1) ? 512 : 32768);
+		/* buf[1] */
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), sizeof(sljit_sw), SLJIT_R0, 0);
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, -1);
+		sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, 0x1000);
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, -4502);
+		sljit_emit_return_to(compiler, SLJIT_MEM1(SLJIT_R1), -0x1000);
+
+		code.code = sljit_generate_code(compiler);
+		CHECK(compiler);
+
+		buf[0] = (sljit_sw)sljit_get_label_addr(label);
+		buf[1] = 0;
+
+		sljit_free_compiler(compiler);
+
+		code.func1((sljit_sw)buf);
+		FAILED(buf[0] != -4502, "test82 case 6 failed\n");
+		FAILED(buf[1] != (sljit_sw)buf, "test82 case 7 failed\n");
+
+		sljit_free_code(code.code, NULL);
+	}
+
+	/* Case 8: all saved registers requested; target is read from [RETURN_REG + (R1 << SLJIT_WORD_SHIFT)] with RETURN_REG = buf + 3 and R1 = -3, i.e. buf[0]. */
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+	i = SLJIT_S2;
+#else
+	i = SLJIT_S(SLJIT_NUMBER_OF_SAVED_REGISTERS - 1);
+#endif
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, P), 2, SLJIT_NUMBER_OF_SAVED_REGISTERS, 0, 0, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, i, 0, SLJIT_IMM, 2 * sizeof(sljit_sw));
+	jump = sljit_emit_call(compiler, SLJIT_CALL, SLJIT_ARGS0(W));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM0(), 0);
+	label = sljit_emit_label(compiler);
+	/* buf[2] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM2(SLJIT_S0, i), 0, SLJIT_RETURN_REG, 0);
+	sljit_emit_op0(compiler, SLJIT_SKIP_FRAMES_BEFORE_RETURN);
+	sljit_emit_return_void(compiler);
+
+	sljit_set_label(jump, sljit_emit_label(compiler));
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS0(VOID), 2, SLJIT_NUMBER_OF_SAVED_REGISTERS, 0, 0, 16);
+	for (i = 0; i < SLJIT_NUMBER_OF_SAVED_REGISTERS; i++)
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S(i), 0, SLJIT_IMM, -1);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, (sljit_sw)(buf + 3));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, -3);
+	sljit_emit_return_to(compiler, SLJIT_MEM2(SLJIT_RETURN_REG, SLJIT_R1), SLJIT_WORD_SHIFT);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+
+	buf[0] = (sljit_sw)sljit_get_label_addr(label);
+	buf[1] = 0;
+	buf[2] = 0;
+
+	sljit_free_compiler(compiler);
+
+	code.func1((sljit_sw)buf);
+	FAILED(buf[2] != (sljit_sw)(buf + 3), "test82 case 8 failed\n");
+
+	sljit_free_code(code.code, NULL);
+
+	/* Cases 9-11: separately generated callee (code2), reached via sljit_emit_icall, does return_to an immediate address (the label address of the first function). */
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(VOID, P_R, P), 2, SLJIT_NUMBER_OF_SAVED_REGISTERS, 0, 0, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S2, 0, SLJIT_S0, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, 586000);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S1, 0, SLJIT_IMM, 392);
+	sljit_emit_icall(compiler, SLJIT_CALL_REG_ARG, SLJIT_ARGS0(W), SLJIT_R0, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM0(), 0);
+	label = sljit_emit_label(compiler);
+	/* buf[0] */
+	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM1(SLJIT_S2), 0, SLJIT_S0, 0, SLJIT_S1, 0);
+	/* buf[1] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S2), sizeof(sljit_sw), SLJIT_RETURN_REG, 0);
+	sljit_emit_op0(compiler, SLJIT_SKIP_FRAMES_BEFORE_RETURN);
+	sljit_emit_return_void(compiler);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+
+	buf[0] = (sljit_sw)sljit_get_label_addr(label);
+
+	sljit_free_compiler(compiler);
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+	sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG | SLJIT_ENTER_KEEP(2), SLJIT_ARGS0(VOID), 2, SLJIT_NUMBER_OF_SAVED_REGISTERS, 0, 0, 16);
+	for (i = 2; i < SLJIT_NUMBER_OF_SAVED_REGISTERS; i++)
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S(i), 0, SLJIT_IMM, -1);
+	/* buf[2] */
+	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_MEM0(), (sljit_sw)(buf + 2), SLJIT_S0, 0, SLJIT_S1, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S0, 0, SLJIT_IMM, 416000);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S1, 0, SLJIT_IMM, 931);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, 2906);
+	sljit_emit_return_to(compiler, SLJIT_IMM, buf[0]);
+
+	code2.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	sljit_free_compiler(compiler);
+
+	buf[0] = 0;
+	buf[1] = 0;
+	buf[2] = 0;
+
+	code.func2(SLJIT_FUNC_ADDR(code2.func0), (sljit_sw)buf);
+	FAILED(buf[0] != 416931, "test82 case 9 failed\n");
+	FAILED(buf[1] != 2906, "test82 case 10 failed\n");
+	FAILED(buf[2] != 586392, "test82 case 11 failed\n");
+
+	sljit_free_code(code.code, NULL);
+	sljit_free_code(code2.code, NULL);
+
+	successful_tests++;
+}
+
+static void test83(void)
+{
+	/* Test rotate (SLJIT_ROTL / SLJIT_ROTR and their 32 bit forms) with register, immediate and memory operands. */
+	executable_code code;
+	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	sljit_sw buf[13];
+	sljit_s32 ibuf[8];
+	sljit_s32 i;
+#ifdef SLJIT_PREF_SHIFT_REG
+	sljit_s32 shift_reg = SLJIT_PREF_SHIFT_REG;
+#else
+	sljit_s32 shift_reg = SLJIT_R2;
+#endif
+
+	if (verbose)
+		printf("Run test83\n");
+
+	FAILED(!compiler, "cannot create compiler\n");
+
+	for (i = 0; i < 13; i++)
+		buf[i] = -1;
+	for (i = 0; i < 8; i++)
+		ibuf[i] = -1;
+
+	ibuf[0] = 8;
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(VOID, W, W), 5, 5, 0, 0, 2 * sizeof(sljit_sw));
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, WCONST(0x1234567812345678, 0x12345678));
+	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, 12);
+	sljit_emit_op2(compiler, SLJIT_ROTL, SLJIT_R0, 0, SLJIT_R0, 0, shift_reg, 0);
+	/* buf[0] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, WCONST(0xfedcba0987654321, 0x87654321));
+	sljit_emit_op2(compiler, SLJIT_ROTL, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, 1);
+	/* buf[1] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R1, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, WCONST(0xfedcba0987654321, 0x87654321));
+	sljit_emit_op2(compiler, SLJIT_ROTL, SLJIT_S2, 0, SLJIT_R4, 0, SLJIT_IMM, 0xffff00);
+	/* buf[2] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_S2, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, -1);
+	/* buf[3] */
+	sljit_emit_op2(compiler, SLJIT_ROTL, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_IMM, WCONST(0x9876543210abcdef, 0x87654321), shift_reg, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_IMM, WCONST(0x9876543210abcdc0, 0x876543e0));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 4);
+	sljit_emit_op2(compiler, SLJIT_ROTL, SLJIT_R0, 0, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), SLJIT_WORD_SHIFT, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw));
+	/* buf[4] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw), SLJIT_IMM, WCONST(0x1234567812345678, 0x12345678));
+	sljit_emit_op2(compiler, SLJIT_ROTR, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw), SLJIT_IMM, 4);
+	/* buf[5] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_MEM1(SLJIT_SP), 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw), SLJIT_IMM, WCONST(0x1234567812345678, 0x12345678));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 20);
+	sljit_emit_op2(compiler, SLJIT_ROTR, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw), SLJIT_R0, 0);
+	/* buf[6] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), SLJIT_MEM1(SLJIT_SP), 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, WCONST(0x1234567887654341, 0x17654321));
+	sljit_emit_op2(compiler, SLJIT_ROTR, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_R0, 0);
+	/* buf[7] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_sw), SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 8 * sizeof(sljit_sw));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, WCONST(0xfedcba0987654321, 0x87654321));
+	/* buf[8] */
+	sljit_emit_op2(compiler, SLJIT_ROTR, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), 0, SLJIT_R1, 0, SLJIT_IMM, 0xff00);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, 0xffc0);
+	sljit_emit_op2(compiler, SLJIT_ROTR, SLJIT_R1, 0, SLJIT_R1, 0, shift_reg, 0);
+	/* buf[9] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 9 * sizeof(sljit_sw), SLJIT_R1, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, -7834);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)0x87654321);
+	sljit_emit_op2(compiler, SLJIT_ROTL32, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S1), 0);
+	/* ibuf[0] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 0, SLJIT_R0, 0);
+	/* buf[10]: shift_reg is stored back; the check below expects it to still hold -7834 */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 10 * sizeof(sljit_sw), shift_reg, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R4, 0, SLJIT_IMM, (sljit_sw)0xabc89def);
+	sljit_emit_op2(compiler, SLJIT_ROTL32, SLJIT_S4, 0, SLJIT_R4, 0, SLJIT_IMM, 0xfffe1);
+	/* ibuf[1] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_s32), SLJIT_S4, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R4, 0, SLJIT_IMM, (sljit_sw)0xabc89def);
+	sljit_emit_op2(compiler, SLJIT_ROTL32, SLJIT_S4, 0, SLJIT_R4, 0, SLJIT_IMM, 0xfffe0);
+	/* ibuf[2] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 2 * sizeof(sljit_s32), SLJIT_S4, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, -6512);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R4, 0, SLJIT_IMM, 0xfffe0);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_s32), SLJIT_IMM, (sljit_sw)0xabc89def);
+	/* ibuf[3] */
+	sljit_emit_op2(compiler, SLJIT_ROTL32, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_s32), SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_s32), SLJIT_R4, 0);
+	/* buf[11]: shift_reg is stored back; the check below expects it to still hold -6512 */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 11 * sizeof(sljit_sw), shift_reg, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 30);
+	sljit_emit_op2(compiler, SLJIT_ROTR32, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)0x87654321, SLJIT_R0, 0);
+	/* ibuf[4] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 4 * sizeof(sljit_s32), SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), SLJIT_IMM, (sljit_sw)0xfedccdef);
+	sljit_emit_op2(compiler, SLJIT_ROTR32, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), SLJIT_IMM, 4);
+	/* ibuf[5] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 5 * sizeof(sljit_s32), SLJIT_MEM1(SLJIT_SP), 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 6 * sizeof(sljit_s32), SLJIT_IMM, (sljit_sw)0x89abcdef);
+	sljit_emit_op2(compiler, SLJIT_ROTR32, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S1), 6 * sizeof(sljit_s32), SLJIT_IMM, 0xfffe0);
+	/* ibuf[6] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 6 * sizeof(sljit_s32), SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, -2647);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)0x89abcde0);
+	/* ibuf[7] */
+	sljit_emit_op2(compiler, SLJIT_ROTR32, SLJIT_MEM1(SLJIT_S1), 7 * sizeof(sljit_s32), SLJIT_R1, 0, SLJIT_R1, 0);
+	/* buf[12]: shift_reg is stored back; the check below expects it to still hold -2647 */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 12 * sizeof(sljit_sw), shift_reg, 0);
+
+	sljit_emit_return_void(compiler);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	sljit_free_compiler(compiler);
+
+	code.func2((sljit_sw)buf, (sljit_sw)ibuf);
+
+	FAILED(buf[0] != WCONST(0x4567812345678123, 0x45678123), "test83 case 1 failed\n");
+	FAILED(buf[1] != WCONST(0xfdb974130eca8643, 0xeca8643), "test83 case 2 failed\n");
+	FAILED(buf[2] != WCONST(0xfedcba0987654321, 0x87654321), "test83 case 3 failed\n");
+	FAILED(buf[3] != WCONST(0xcc3b2a190855e6f7, 0xc3b2a190), "test83 case 4 failed\n");
+	FAILED(buf[4] != WCONST(0x9876543210abcdc0, 0x876543e0), "test83 case 5 failed\n");
+	FAILED(buf[5] != WCONST(0x8123456781234567, 0x81234567), "test83 case 6 failed\n");
+	FAILED(buf[6] != WCONST(0x4567812345678123, 0x45678123), "test83 case 7 failed\n");
+	FAILED(buf[7] != WCONST(0x891a2b3c43b2a1a0, 0x8bb2a190), "test83 case 8 failed\n");
+	FAILED(buf[8] != WCONST(0xfedcba0987654321, 0x87654321), "test83 case 9 failed\n");
+	FAILED(buf[9] != WCONST(0xfedcba0987654321, 0x87654321), "test83 case 10 failed\n");
+	FAILED(ibuf[0] != (sljit_s32)0x65432187, "test83 case 11 failed\n");
+	FAILED(buf[10] != -7834, "test83 case 12 failed\n");
+	FAILED(ibuf[1] != (sljit_s32)0x57913bdf, "test83 case 13 failed\n");
+	FAILED(ibuf[2] != (sljit_s32)0xabc89def, "test83 case 14 failed\n");
+	FAILED(ibuf[3] != (sljit_s32)0xabc89def, "test83 case 15 failed\n");
+	FAILED(buf[11] != -6512, "test83 case 16 failed\n");
+	FAILED(ibuf[4] != (sljit_s32)0x1d950c86, "test83 case 17 failed\n");
+	FAILED(ibuf[5] != (sljit_s32)0xffedccde, "test83 case 18 failed\n");
+	FAILED(ibuf[6] != (sljit_s32)0x89abcdef, "test83 case 19 failed\n");
+	FAILED(ibuf[7] != (sljit_s32)0x89abcde0, "test83 case 20 failed\n");
+	FAILED(buf[12] != -2647, "test83 case 21 failed\n");
+
+	sljit_free_code(code.code, NULL);
+	successful_tests++;
+}
+
+static void test84(void)
+{
+	/* Test "shift into". */
+	executable_code code;
+	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	sljit_sw buf[19];
+	sljit_s32 ibuf[10];
+	sljit_s32 i;
+#ifdef SLJIT_PREF_SHIFT_REG
+	sljit_s32 shift_reg = SLJIT_PREF_SHIFT_REG;
+#else
+	sljit_s32 shift_reg = SLJIT_R2;
+#endif
+
+	if (verbose)
+		printf("Run test84\n");
+
+	FAILED(!compiler, "cannot create compiler\n");
+
+	for (i = 0; i < 19; i++)
+		buf[i] = -1;
+	for (i = 0; i < 10; i++)
+		ibuf[i] = -1;
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(VOID, W, W), 5, 5, 0, 0, 2 * sizeof(sljit_sw));
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, WCONST(0x1234567812345678, 0x12345678));
+	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, 12);
+	sljit_emit_shift_into(compiler, SLJIT_SHL, SLJIT_R0, SLJIT_R1, SLJIT_R1, shift_reg, 0);
+	/* buf[0] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R3, 0, SLJIT_IMM, WCONST(0x1234567812345678, 0x12345678));
+	sljit_emit_shift_into(compiler, SLJIT_MLSHR, SLJIT_R4, SLJIT_R3, SLJIT_R3, SLJIT_IMM, 0xffd4 /* 20 */);
+	/* buf[1] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R4, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, shift_reg, 0, SLJIT_IMM, (sljit_s32)0x86421357);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_IMM, 0xffeb /* 11 */);
+	sljit_emit_shift_into(compiler, SLJIT_MSHL32, SLJIT_R0, shift_reg, shift_reg, SLJIT_MEM1(SLJIT_SP), 0);
+	/* ibuf[0] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 0, SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, -8762);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R4, 0, SLJIT_IMM, (sljit_s32)0x89abcdef);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_s32), SLJIT_IMM, 0xffff);
+	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R0, 0, SLJIT_S1, 0, SLJIT_IMM, 16 * sizeof(sljit_s32));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 17);
+	sljit_emit_shift_into(compiler, SLJIT_MLSHR32, SLJIT_S2, SLJIT_R4, SLJIT_R4, SLJIT_MEM2(SLJIT_R0, SLJIT_R1), 2);
+	/* ibuf[1] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_s32), SLJIT_S2, 0);
+	/* buf[2] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), shift_reg, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S4, 0, SLJIT_IMM, WCONST(0x1234567812345678, 0x12345678));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, WCONST(0xabcd000000000000, 0xabcd0000));
+	sljit_emit_shift_into(compiler, SLJIT_MSHL, SLJIT_S4, SLJIT_S4, SLJIT_R0, SLJIT_IMM, 12);
+	/* buf[3] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_S4, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, WCONST(0xaabbccddeeff8899, 0xabcdef89));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, 0xfedcba);
+	sljit_emit_shift_into(compiler, SLJIT_LSHR, SLJIT_R1, SLJIT_R0, SLJIT_R4, SLJIT_IMM, 19);
+	/* buf[4] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_R1, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, WCONST(0xfedcba0987654321, 0xfedcba09));
+	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, WCONST(0x7fffffffffffffff, 0x7fffffff));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_IMM, 1);
+	sljit_emit_shift_into(compiler, SLJIT_SHL | SLJIT_SHIFT_INTO_NON_ZERO, SLJIT_R4, SLJIT_R1, shift_reg, SLJIT_MEM1(SLJIT_SP), 0);
+	/* buf[5] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 5 * sizeof(sljit_sw), SLJIT_R4, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)0xdeadbeaf);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R4, 0, SLJIT_IMM, (sljit_sw)0xfedcba09);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, -5);
+	sljit_emit_shift_into(compiler, SLJIT_MLSHR32 | SLJIT_SHIFT_INTO_NON_ZERO, shift_reg, SLJIT_R1, SLJIT_R4, SLJIT_R0, 0);
+	/* ibuf[2] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 2 * sizeof(sljit_s32), shift_reg, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R3, 0, SLJIT_IMM, (sljit_sw)0xabcd6543);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R4, 0, SLJIT_IMM, (sljit_s32)0xc9000000);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 0xffe8);
+	sljit_emit_shift_into(compiler, SLJIT_MSHL32, shift_reg, SLJIT_R3, SLJIT_R4, SLJIT_R0, 0);
+	/* ibuf[3] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_s32), shift_reg, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, -6032);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R4, 0, SLJIT_IMM, 0x7cadcad7);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_S3, 0, SLJIT_IMM, (sljit_s32)0xfffffff5);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 3);
+	sljit_emit_shift_into(compiler, SLJIT_LSHR32, SLJIT_R4, SLJIT_R4, SLJIT_S3, SLJIT_R0, 0);
+	/* ibuf[4] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 4 * sizeof(sljit_s32), SLJIT_R4, 0);
+	/* buf[6] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 6 * sizeof(sljit_sw), shift_reg, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -9740);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, -5182);
+	sljit_emit_shift_into(compiler, SLJIT_SHL, SLJIT_R0, SLJIT_R0, SLJIT_R1, SLJIT_IMM, 0);
+	/* buf[7] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 7 * sizeof(sljit_sw), SLJIT_R0, 0);
+	/* buf[8] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 8 * sizeof(sljit_sw), SLJIT_R1, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, -4072);
+	sljit_emit_op1(compiler, SLJIT_MOV32, shift_reg, 0, SLJIT_IMM, -2813);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, 0);
+	sljit_emit_shift_into(compiler, SLJIT_LSHR32, SLJIT_R0, SLJIT_R0, shift_reg, SLJIT_R1, 0);
+	/* ibuf[5] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 5 * sizeof(sljit_s32), SLJIT_R0, 0);
+	/* ibuf[6] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 6 * sizeof(sljit_s32), shift_reg, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -3278);
+	sljit_emit_op1(compiler, SLJIT_MOV, shift_reg, 0, SLJIT_IMM, 0);
+	sljit_emit_shift_into(compiler, SLJIT_LSHR, SLJIT_R1, SLJIT_R0, SLJIT_R0, shift_reg, 0);
+	/* buf[9] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 9 * sizeof(sljit_sw), SLJIT_R1, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S3, 0, SLJIT_IMM, WCONST(0x1234567890abcdef, 0x12345678));
+	sljit_emit_shift_into(compiler, SLJIT_LSHR, SLJIT_R0, SLJIT_S3, SLJIT_S3, SLJIT_IMM, 0xfff8 /* 24/56 */);
+	/* buf[10] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 10 * sizeof(sljit_sw), SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S3, 0, SLJIT_IMM, WCONST(0x1234567890abcdef, 0x12345678));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S4, 0, SLJIT_IMM, WCONST(0xba9876fedcba9800, 0xfedcba00));
+	sljit_emit_shift_into(compiler, SLJIT_SHL, SLJIT_S3, SLJIT_S3, SLJIT_S4, SLJIT_IMM, 0xfff8 /* 24/56 */);
+	/* buf[11] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 11 * sizeof(sljit_sw), SLJIT_S3, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, WCONST(0x1234567890abcdef, 0x12345678));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, -4986);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, 0);
+	sljit_emit_shift_into(compiler, SLJIT_SHL, SLJIT_R0, SLJIT_R0, SLJIT_R1, SLJIT_R4, 0);
+	/* buf[12] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 12 * sizeof(sljit_sw), SLJIT_R0, 0);
+	/* buf[13] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 13 * sizeof(sljit_sw), SLJIT_R1, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -1);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, WCONST(0x12345678fedcba09, 0x12348765));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 14 * sizeof(sljit_sw), SLJIT_IMM, -1);
+	sljit_emit_shift_into(compiler, SLJIT_MLSHR, shift_reg, SLJIT_R0, SLJIT_R1, SLJIT_MEM1(SLJIT_S0), 14 * sizeof(sljit_sw));
+	/* buf[14] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 14 * sizeof(sljit_sw), shift_reg, 0);
+	/* buf[15] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 15 * sizeof(sljit_sw), SLJIT_R1, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 1);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, WCONST(0x8000000000000005, 0x80000005));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 16 * sizeof(sljit_sw), SLJIT_R1, 0);
+	sljit_emit_shift_into(compiler, SLJIT_MSHL, SLJIT_R0, SLJIT_R0, SLJIT_R1, SLJIT_MEM0(), (sljit_sw)(buf + 16));
+	/* buf[16] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 16 * sizeof(sljit_sw), SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 2);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, WCONST(0x2345678923456789, 0x23456789));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S2, 0, SLJIT_IMM, -1);
+	sljit_emit_shift_into(compiler, SLJIT_SHL, SLJIT_R0, SLJIT_R1, SLJIT_S2, SLJIT_R0, 0);
+	/* buf[17] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 17 * sizeof(sljit_sw), SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)0xabc23456);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)0xef000000);
+	sljit_emit_shift_into(compiler, SLJIT_SHL32, SLJIT_R0, SLJIT_R0, SLJIT_R1, SLJIT_IMM, 4);
+	/* ibuf[7] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 7 * sizeof(sljit_s32), SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, (sljit_sw)0xabc23456);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)0xfe);
+	sljit_emit_shift_into(compiler, SLJIT_LSHR32, SLJIT_S2, SLJIT_R0, SLJIT_R1, SLJIT_IMM, 4);
+	/* ibuf[8] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 8 * sizeof(sljit_s32), SLJIT_S2, 0);
+
+#if (defined SLJIT_MASKED_SHIFT && SLJIT_MASKED_SHIFT)
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 12344321);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, -1);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, (8 * sizeof(sljit_sw)) + 1);
+	sljit_emit_shift_into(compiler, SLJIT_SHL, SLJIT_R0, SLJIT_R0, SLJIT_R1, SLJIT_R2, 0);
+	/* buf[18] */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 18 * sizeof(sljit_sw), SLJIT_R0, 0);
+#endif /* SLJIT_MASKED_SHIFT */
+#if (defined SLJIT_MASKED_SHIFT32 && SLJIT_MASKED_SHIFT32)
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 24688642);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, 1);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R2, 0, SLJIT_IMM, (8 * sizeof(sljit_s32)) + 1);
+	sljit_emit_shift_into(compiler, SLJIT_LSHR32, SLJIT_R0, SLJIT_R0, SLJIT_R1, SLJIT_R2, 0);
+	/* ibuf[9] */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 9 * sizeof(sljit_s32), SLJIT_R0, 0);
+#endif /* SLJIT_MASKED_SHIFT32 */
+
+	sljit_emit_return_void(compiler);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	sljit_free_compiler(compiler);
+
+	code.func2((sljit_sw)buf, (sljit_sw)ibuf);
+
+	FAILED(buf[0] != WCONST(0x4567812345678123, 0x45678123), "test84 case 1 failed\n");
+	FAILED(buf[1] != WCONST(0x4567812345678123, 0x45678123), "test84 case 2 failed\n");
+	FAILED(ibuf[0] != 0x109abc32, "test84 case 3 failed\n");
+	FAILED(ibuf[1] != 0x13579bdf, "test84 case 4 failed\n");
+	FAILED(buf[2] != -8762, "test84 case 5 failed\n");
+	FAILED(buf[3] != WCONST(0x4567812345678abc, 0x45678abc), "test84 case 6 failed\n");
+	FAILED(buf[4] != WCONST(0xdb975557799bbddf, 0xdb975579), "test84 case 7 failed\n");
+	FAILED(buf[5] != WCONST(0xfdb974130eca8642, 0xfdb97412), "test84 case 8 failed\n");
+	FAILED(ibuf[2] != (sljit_s32)0xdb97413b, "test84 case 9 failed\n");
+	FAILED(ibuf[3] != (sljit_s32)0xcd6543c9, "test84 case 10 failed\n");
+	FAILED(ibuf[4] != (sljit_s32)0xaf95b95a, "test84 case 11 failed\n");
+	FAILED(buf[6] != -6032, "test84 case 12 failed\n");
+	FAILED(buf[7] != -9740, "test84 case 13 failed\n");
+	FAILED(buf[8] != -5182, "test84 case 14 failed\n");
+	FAILED(ibuf[5] != -4072, "test84 case 15 failed\n");
+	FAILED(ibuf[6] != -2813, "test84 case 16 failed\n");
+	FAILED(buf[9] != -3278, "test84 case 17 failed\n");
+	FAILED(buf[10] != WCONST(0x34567890abcdef12, 0x34567812), "test84 case 18 failed\n");
+	FAILED(buf[11] != WCONST(0xefba9876fedcba98, 0x78fedcba), "test84 case 19 failed\n");
+	FAILED(buf[12] != WCONST(0x1234567890abcdef, 0x12345678), "test84 case 20 failed\n");
+	FAILED(buf[13] != -4986, "test84 case 21 failed\n");
+	FAILED(buf[14] != WCONST(0x2468acf1fdb97413, 0x24690ecb), "test84 case 22 failed\n");
+	FAILED(buf[15] != WCONST(0x12345678fedcba09, 0x12348765), "test84 case 23 failed\n");
+	FAILED(buf[16] != 0x30, "test84 case 24 failed\n");
+	FAILED(buf[17] != WCONST(0x8d159e248d159e27, 0x8d159e27), "test84 case 25 failed\n");
+	FAILED(ibuf[7] != (sljit_s32)0xbc23456e, "test84 case 26 failed\n");
+	FAILED(ibuf[8] != (sljit_s32)0xeabc2345, "test84 case 27 failed\n");
+#if (defined SLJIT_MASKED_SHIFT && SLJIT_MASKED_SHIFT)
+	FAILED(buf[18] != 24688643, "test84 case 28 failed\n");
+#endif /* SLJIT_MASKED_SHIFT */
+#if (defined SLJIT_MASKED_SHIFT32 && SLJIT_MASKED_SHIFT32)
+	FAILED(ibuf[9] != (sljit_s32)-2135139327, "test84 case 29 failed\n");
+#endif /* SLJIT_MASKED_SHIFT32 */
+
+	sljit_free_code(code.code, NULL);
+	successful_tests++;
+}
+
+static void test85(void)
+{
+	/* Test count trailing zeroes: SLJIT_CTZ on word-sized values (results stored in buf) and SLJIT_CTZ32 on 32-bit values (results stored in ibuf). */
+	executable_code code;
+	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	sljit_sw buf[5];
+	sljit_s32 ibuf[7];
+	sljit_s32 i;
+
+	if (verbose)
+		printf("Run test85\n");
+
+	FAILED(!compiler, "cannot create compiler\n");
+
+	for (i = 0; i < 5; i++)
+		buf[i] = -1;
+	for (i = 0; i < 7; i++)
+		ibuf[i] = -1;
+
+	buf[2] = 0;
+	ibuf[3] = 1;
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(VOID, W, W), 5, 5, 0, 0, 2 * sizeof(sljit_sw));
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 0x80);
+	/* buf[0]: ctz(0x80), register source and memory destination; expected 7. */
+	sljit_emit_op1(compiler, SLJIT_CTZ, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_IMM, 0x654321);
+	sljit_emit_op1(compiler, SLJIT_CTZ, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_SP), 0);
+	/* buf[1]: ctz(0x654321), source read from SLJIT_SP-relative memory; expected 0. */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R1, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S2, 0, SLJIT_IMM, 2);
+	sljit_emit_op1(compiler, SLJIT_CTZ, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw), SLJIT_MEM2(SLJIT_S0, SLJIT_S2), SLJIT_WORD_SHIFT);
+	/* buf[2]: source is buf[2] (preset to 0) via indexed addressing, result staged in SLJIT_SP-relative memory; expected WCONST(64, 32). */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw));
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, (sljit_sw)1 << (8 * sizeof(sljit_sw) - 3));
+	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_R1, 0, SLJIT_S0, 0, SLJIT_IMM, 0x100000);
+	/* buf[3]: only bit (word bits - 3) is set; destination is addressed with a 0x100000 displacement from a rebased pointer; expected WCONST(61, 29). */
+	sljit_emit_op1(compiler, SLJIT_CTZ, SLJIT_MEM1(SLJIT_R1), 0x100000 + 3 * sizeof(sljit_sw), SLJIT_R4, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_IMM, WCONST(0xabcdef800, 0xcdef800));
+	sljit_emit_op1(compiler, SLJIT_CTZ, SLJIT_S4, 0, SLJIT_MEM0(), (sljit_sw)(buf + 4));
+	/* buf[4]: source loaded from the absolute address of buf[4]; expected 11. */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 4 * sizeof(sljit_sw), SLJIT_S4, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -1);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, 0xa400);
+	sljit_emit_op2(compiler, SLJIT_ASHR32, SLJIT_R0, 0, SLJIT_R1, 0, SLJIT_IMM, 4);
+	sljit_emit_op1(compiler, SLJIT_CTZ32, SLJIT_R1, 0, SLJIT_R0, 0);
+	/* ibuf[0]: ctz32(0xa400 >> 4), computed after R0 was first filled with -1; expected 6. */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 0, SLJIT_R1, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, -1);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_R0, 0);
+	/* ibuf[1]: ctz32(0), register source and memory destination; expected 32. */
+	sljit_emit_op1(compiler, SLJIT_CTZ32, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_s32), SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), SLJIT_IMM, 0xbcdefe0);
+	sljit_emit_op1(compiler, SLJIT_CTZ32, SLJIT_S4, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32));
+	/* ibuf[2]: ctz32(0xbcdefe0), source read from SLJIT_SP-relative memory; expected 5. */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 2 * sizeof(sljit_s32), SLJIT_S4, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 3);
+	sljit_emit_op1(compiler, SLJIT_CTZ32, SLJIT_R0, 0, SLJIT_MEM2(SLJIT_S1, SLJIT_R0), 2);
+	/* ibuf[3]: source is ibuf[3] (preset to 1) via indexed addressing; the index register R0 is also the destination; expected 0. */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_s32), SLJIT_R0, 0);
+
+	sljit_emit_return_void(compiler);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	sljit_free_compiler(compiler);
+
+	code.func2((sljit_sw)buf, (sljit_sw)ibuf);
+
+	FAILED(buf[0] != 7, "test85 case 1 failed\n");
+	FAILED(buf[1] != 0, "test85 case 2 failed\n");
+	FAILED(buf[2] != WCONST(64, 32), "test85 case 3 failed\n");
+	FAILED(buf[3] != WCONST(61, 29), "test85 case 4 failed\n");
+	FAILED(buf[4] != 11, "test85 case 5 failed\n");
+	FAILED(ibuf[0] != 6, "test85 case 6 failed\n");
+	FAILED(ibuf[1] != 32, "test85 case 7 failed\n");
+	FAILED(ibuf[2] != 5, "test85 case 8 failed\n");
+	FAILED(ibuf[3] != 0, "test85 case 9 failed\n");
+
+	sljit_free_code(code.code, NULL);
+	successful_tests++;
+}
+
+static void test86(void)
+{
+	/* Test get return address: each case calls a nested function that reads SLJIT_GET_RETURN_ADDRESS, and the stored value is compared with the address of the label emitted right after the call. */
+	executable_code code;
+	struct sljit_compiler* compiler;
+	struct sljit_jump *jump;
+	struct sljit_label *label;
+	sljit_uw return_addr = 0;
+	sljit_uw buf[1];
+
+	if (verbose)
+		printf("Run test86\n");
+
+	/* Case 1: the callee returns the address in SLJIT_RETURN_REG and the caller stores it into buf[0]. */
+
+	buf[0] = 0;
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, W), 1, 1, 0, 0, 0);
+	jump = sljit_emit_call(compiler, SLJIT_CALL, SLJIT_ARGS0(W));
+	label = sljit_emit_label(compiler);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_RETURN_REG, 0);
+	sljit_emit_return_void(compiler);
+
+	sljit_set_label(jump, sljit_emit_label(compiler));
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS0(W), 1, 0, 0, 0, 0);
+	sljit_emit_op_dst(compiler, SLJIT_GET_RETURN_ADDRESS, SLJIT_RETURN_REG, 0);
+	sljit_emit_return(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	return_addr = sljit_get_label_addr(label);
+	sljit_free_compiler(compiler);
+
+	code.func1((sljit_sw)buf);
+
+	FAILED(buf[0] != return_addr, "test86 case 1 failed\n");
+	sljit_free_code(code.code, NULL);
+
+	/* Case 2: the callee is entered with SLJIT_MAX_LOCAL_SIZE locals and writes the address straight to buf through an absolute address. */
+
+	buf[0] = 0;
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS0(VOID), 2, 0, 0, 0, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, -1);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, -1);
+	jump = sljit_emit_call(compiler, SLJIT_CALL, SLJIT_ARGS2(VOID, W, W));
+	label = sljit_emit_label(compiler);
+	sljit_emit_return_void(compiler);
+
+	sljit_set_label(jump, sljit_emit_label(compiler));
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(VOID, W, W), 1, SLJIT_NUMBER_OF_SAVED_REGISTERS - 2, 0, 0, SLJIT_MAX_LOCAL_SIZE);
+	sljit_emit_op_dst(compiler, SLJIT_GET_RETURN_ADDRESS, SLJIT_MEM0(), (sljit_sw)buf);
+	sljit_emit_return_void(compiler);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	return_addr = sljit_get_label_addr(label);
+	sljit_free_compiler(compiler);
+
+	code.func0();
+
+	FAILED(buf[0] != return_addr, "test86 case 2 failed\n");
+	sljit_free_code(code.code, NULL);
+
+	/* Case 3: register-argument call; the callee stores through S2 + (R0 << 1), i.e. (buf - 16) + (8 << 1). S2 is set by the caller and presumably carried into the callee by SLJIT_ENTER_KEEP(3). */
+
+	buf[0] = 0;
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, W), 1, 3, 0, 0, 0);
+	sljit_emit_op2(compiler, SLJIT_SUB, SLJIT_S2, 0, SLJIT_S0, 0, SLJIT_IMM, 16);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 8);
+	jump = sljit_emit_call(compiler, SLJIT_CALL_REG_ARG, SLJIT_ARGS1(VOID, W));
+	label = sljit_emit_label(compiler);
+	sljit_emit_return_void(compiler);
+
+	sljit_set_label(jump, sljit_emit_label(compiler));
+	sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG | SLJIT_ENTER_KEEP(3), SLJIT_ARGS1(VOID, W_R), 1, SLJIT_NUMBER_OF_SAVED_REGISTERS, 0, 0, SLJIT_MAX_LOCAL_SIZE >> 1);
+	sljit_emit_op_dst(compiler, SLJIT_GET_RETURN_ADDRESS, SLJIT_MEM2(SLJIT_S2, SLJIT_R0), 1);
+	sljit_emit_return_void(compiler);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	return_addr = sljit_get_label_addr(label);
+	sljit_free_compiler(compiler);
+
+	code.func1((sljit_sw)buf);
+
+	FAILED(buf[0] != return_addr, "test86 case 3 failed\n");
+	sljit_free_code(code.code, NULL);
+
+	/* Case 4: the callee writes the address to SLJIT_SP-relative memory first, then copies it to the buffer whose pointer is in R0. */
+
+	buf[0] = 0;
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, W_R), 1, 0, 0, 0, 0);
+	jump = sljit_emit_call(compiler, SLJIT_CALL_REG_ARG, SLJIT_ARGS1(VOID, W));
+	label = sljit_emit_label(compiler);
+	sljit_emit_return_void(compiler);
+
+	sljit_set_label(jump, sljit_emit_label(compiler));
+	sljit_emit_enter(compiler, SLJIT_ENTER_REG_ARG, SLJIT_ARGS1(VOID, W_R), 1, SLJIT_NUMBER_OF_SAVED_REGISTERS >> 1, 0, 0, 64);
+	sljit_emit_op_dst(compiler, SLJIT_GET_RETURN_ADDRESS, SLJIT_MEM1(SLJIT_SP), 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), 0, SLJIT_MEM1(SLJIT_SP), 0);
+	sljit_emit_return_void(compiler);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	return_addr = sljit_get_label_addr(label);
+	sljit_free_compiler(compiler);
+
+	code.func1((sljit_sw)buf);
+
+	FAILED(buf[0] != return_addr, "test86 case 4 failed\n");
+	sljit_free_code(code.code, NULL);
+
+	if (sljit_has_cpu_feature(SLJIT_HAS_FPU) && SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS > 0) {
+		/* Case 5 (only with an FPU and saved float registers): same flow as case 1, but the callee is entered with SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS saved float registers and 64 bytes of locals. */
+
+		buf[0] = 0;
+
+		compiler = sljit_create_compiler(NULL, NULL);
+		FAILED(!compiler, "cannot create compiler\n");
+
+		sljit_emit_enter(compiler, 0, SLJIT_ARGS1(VOID, W), 1, 1, 0, 0, 0);
+		jump = sljit_emit_call(compiler, SLJIT_CALL, SLJIT_ARGS0(W));
+		label = sljit_emit_label(compiler);
+		sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_RETURN_REG, 0);
+		sljit_emit_return_void(compiler);
+
+		sljit_set_label(jump, sljit_emit_label(compiler));
+		sljit_emit_enter(compiler, 0, SLJIT_ARGS0(W), 1, 3, 0, SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS, 64);
+		sljit_emit_op_dst(compiler, SLJIT_GET_RETURN_ADDRESS, SLJIT_RETURN_REG, 0);
+		sljit_emit_return(compiler, SLJIT_MOV, SLJIT_RETURN_REG, 0);
+
+		code.code = sljit_generate_code(compiler);
+		CHECK(compiler);
+		return_addr = sljit_get_label_addr(label);
+		sljit_free_compiler(compiler);
+
+		code.func1((sljit_sw)buf);
+
+		FAILED(buf[0] != return_addr, "test86 case 5 failed\n");
+		sljit_free_code(code.code, NULL);
+	}
+
+	successful_tests++;
+}
+
+static void test87(void)
+{
+	/* Test reverse bytes: SLJIT_REV on word-sized values (results stored in buf) and SLJIT_REV32 on 32-bit values (results stored in ibuf). */
+	executable_code code;
+	struct sljit_compiler* compiler = sljit_create_compiler(NULL, NULL);
+	sljit_sw buf[5];
+	sljit_s32 ibuf[5];
+	sljit_s32 i;
+
+	if (verbose)
+		printf("Run test87\n");
+
+	FAILED(!compiler, "cannot create compiler\n");
+
+	for (i = 0; i < 5; i++)
+		buf[i] = -1;
+	for (i = 0; i < 5; i++)
+		ibuf[i] = -1;
+
+	buf[3] = WCONST(0x8070605040302010, 0x40302010);
+	ibuf[1] = (sljit_s32)0xffeeddcc;
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(VOID, W, W), 5, 5, 0, 0, 2 * sizeof(sljit_sw));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, WCONST(0xf1e2d3c4b5a69788, 0xf1e2d3c4));
+	sljit_emit_op1(compiler, SLJIT_REV, SLJIT_R0, 0, SLJIT_R0, 0);
+	/* buf[0]: in-place register reverse (source and destination are both R0). */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, WCONST(0xffeeddccbbaa9988, 0xffeeddcc));
+	sljit_emit_op1(compiler, SLJIT_REV, SLJIT_R2, 0, SLJIT_R4, 0);
+	/* buf[1]: reverse from one register (R4) into a different register (R2). */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw), SLJIT_R2, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S2, 0, SLJIT_IMM, WCONST(0x0102030405060708, 0x01020304));
+	/* buf[2]: register source, memory destination. */
+	sljit_emit_op1(compiler, SLJIT_REV, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_sw), SLJIT_S2, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 3);
+	sljit_emit_op1(compiler, SLJIT_REV, SLJIT_R4, 0, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), SLJIT_WORD_SHIFT);
+	/* buf[3]: the preset value is reversed twice (indexed load above, store below), so the check expects it unchanged. */
+	sljit_emit_op1(compiler, SLJIT_REV, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_sw), SLJIT_R4, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_S2, 0, SLJIT_IMM, WCONST(0x1122334455667788, 0x11223344));
+	/* buf[4]: destination is the absolute address of buf[4]. */
+	sljit_emit_op1(compiler, SLJIT_REV, SLJIT_MEM0(), (sljit_sw)&buf[4], SLJIT_S2, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, (sljit_s32)0xf1e2d3c4);
+	sljit_emit_op1(compiler, SLJIT_REV32, SLJIT_R1, 0, SLJIT_R0, 0);
+	/* ibuf[0]: 32-bit reverse from R0 into R1. */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 0, SLJIT_R1, 0);
+
+	sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R2, 0, SLJIT_S1, 0, SLJIT_IMM, 0x12340 + sizeof(sljit_s32));
+	sljit_emit_op1(compiler, SLJIT_REV32, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_R2), -0x12340);
+	/* ibuf[1]: source is the preset ibuf[1], addressed with a -0x12340 displacement from a rebased pointer; the base register R2 is also the destination. */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_s32), SLJIT_R2, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 2);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, (sljit_s32)0x01020304);
+	/* ibuf[2]: destination uses indexed addressing, S1 + (R0 << 2) with R0 = 2. */
+	sljit_emit_op1(compiler, SLJIT_REV32, SLJIT_MEM2(SLJIT_S1, SLJIT_R0), 2, SLJIT_R4, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R4, 0, SLJIT_IMM, (sljit_s32)0x11223344);
+	sljit_emit_op1(compiler, SLJIT_REV32, SLJIT_R4, 0, SLJIT_R4, 0);
+	/* ibuf[3]: in-place 32-bit register reverse (source and destination are both R4). */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_S1), 3 * sizeof(sljit_s32), SLJIT_R4, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_IMM, (sljit_s32)0xfeeddccb);
+	/* ibuf[4]: memory to memory, with the source in SLJIT_SP-relative memory. */
+	sljit_emit_op1(compiler, SLJIT_REV32, SLJIT_MEM1(SLJIT_S1), 4 * sizeof(sljit_s32), SLJIT_MEM1(SLJIT_SP), 0);
+
+	sljit_emit_return_void(compiler);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	sljit_free_compiler(compiler);
+
+	code.func2((sljit_sw)buf, (sljit_sw)ibuf);
+
+	FAILED(buf[0] != WCONST(0x8897a6b5c4d3e2f1, 0xc4d3e2f1), "test87 case 1 failed\n");
+	FAILED(buf[1] != WCONST(0x8899aabbccddeeff, 0xccddeeff), "test87 case 2 failed\n");
+	FAILED(buf[2] != WCONST(0x0807060504030201, 0x04030201), "test87 case 3 failed\n");
+	FAILED(buf[3] != WCONST(0x8070605040302010, 0x40302010), "test87 case 4 failed\n");
+	FAILED(buf[4] != WCONST(0x8877665544332211, 0x44332211), "test87 case 5 failed\n");
+	FAILED(ibuf[0] != (sljit_s32)0xc4d3e2f1, "test87 case 6 failed\n");
+	FAILED(ibuf[1] != (sljit_s32)0xccddeeff, "test87 case 7 failed\n");
+	FAILED(ibuf[2] != (sljit_s32)0x04030201, "test87 case 8 failed\n");
+	FAILED(ibuf[3] != (sljit_s32)0x44332211, "test87 case 9 failed\n");
+	FAILED(ibuf[4] != (sljit_s32)0xcbdcedfe, "test87 case 10 failed\n");
+
+	sljit_free_code(code.code, NULL);
+	successful_tests++;
+}
+
+static void test88(void)
+{
+	/* Test sljit_emit_fcopy: bit-pattern copies between float registers and integer registers (a register pair for 64-bit values on 32-bit targets); skipped when no FPU is available. */
+	executable_code code;
+	struct sljit_compiler* compiler;
+	sljit_f64 dbuf[4];
+	sljit_f32 sbuf[2];
+#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
+	sljit_sw wbuf[2];
+	sljit_s32 ibuf[2];
+#else /* !SLJIT_64BIT_ARCHITECTURE */
+	sljit_s32 ibuf[7];
+#endif /* SLJIT_64BIT_ARCHITECTURE */
+
+	if (verbose)
+		printf("Run test88\n");
+
+	if (!sljit_has_cpu_feature(SLJIT_HAS_FPU)) {
+		if (verbose)
+			printf("no fpu available, test88 skipped\n");
+		successful_tests++;
+		return;
+	}
+
+	compiler = sljit_create_compiler(NULL, NULL);
+	FAILED(!compiler, "cannot create compiler\n");
+
+	sbuf[0] = 12345.0;
+	sbuf[1] = -1.0;
+	ibuf[0] = -1;
+	ibuf[1] = (sljit_s32)0xc7543100;
+	dbuf[0] = 123456789012345.0;
+	dbuf[1] = -1.0;
+#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
+	wbuf[0] = -1;
+	wbuf[1] = (sljit_sw)0xc2fee0c29f50cb10;
+#else /* !SLJIT_64BIT_ARCHITECTURE */
+	ibuf[2] = -1;
+	ibuf[3] = -1;
+	ibuf[4] = -1;
+	ibuf[5] = (sljit_sw)0x9f50cb10;
+	ibuf[6] = (sljit_sw)0xc2fee0c2;
+#endif /* SLJIT_64BIT_ARCHITECTURE */
+
+	sljit_emit_enter(compiler, 0, SLJIT_ARGS2(VOID, W, W), 5, 5, 5, 0, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)ibuf);
+	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR2, 0, SLJIT_MEM1(SLJIT_S1), 0);
+	sljit_emit_fcopy(compiler, SLJIT_COPY32_FROM_F32, SLJIT_FR2, SLJIT_R0);
+	/* ibuf[0]: bit pattern of sbuf[0] (12345.0); expected 0x4640e400. */
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_MEM1(SLJIT_R1), 0, SLJIT_R0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R3, 0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_s32));
+	sljit_emit_fcopy(compiler, SLJIT_COPY32_TO_F32, SLJIT_FR4, SLJIT_R3);
+	/* sbuf[1]: float built from the bits preset in ibuf[1] (0xc7543100); expected -54321.0. */
+	sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_MEM1(SLJIT_S1), sizeof(sljit_f32), SLJIT_FR4, 0);
+
+#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, (sljit_sw)wbuf);
+	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_S0), 0);
+	sljit_emit_fcopy(compiler, SLJIT_COPY_FROM_F64, SLJIT_FR1, SLJIT_S2);
+	/* wbuf[0]: bit pattern of dbuf[0] (123456789012345.0); expected 0x42dc12218377de40. */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 0, SLJIT_S2, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R3, 0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw));
+	sljit_emit_fcopy(compiler, SLJIT_COPY_TO_F64, SLJIT_FR0, SLJIT_R3);
+	/* dbuf[1]: double built from the bits preset in wbuf[1]; expected -543210987654321.0. */
+	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64), SLJIT_FR0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 0);
+	sljit_emit_fcopy(compiler, SLJIT_COPY_TO_F64, SLJIT_FR3, SLJIT_R2);
+	/* dbuf[2]: double built from all-zero bits; expected 0.0. */
+	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_f64), SLJIT_FR3, 0);
+#else /* !SLJIT_64BIT_ARCHITECTURE */
+	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR1, 0, SLJIT_MEM1(SLJIT_S0), 0);
+	sljit_emit_fcopy(compiler, SLJIT_COPY_FROM_F64, SLJIT_FR1, SLJIT_REG_PAIR(SLJIT_S3, SLJIT_S2));
+	/* ibuf[2-3]: both halves of dbuf[0] via a register pair; S2 is checked against the low word (0x8377de40), S3 against the high word (0x42dc1221). */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 2 * sizeof(sljit_sw), SLJIT_S2, 0);
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 3 * sizeof(sljit_sw), SLJIT_S3, 0);
+
+	sljit_emit_fcopy(compiler, SLJIT_COPY_FROM_F64, SLJIT_FR1, SLJIT_R2);
+	/* ibuf[4]: single (non-pair) destination register; the check expects the high word 0x42dc1221. */
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 4 * sizeof(sljit_sw), SLJIT_R2, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R3, 0, SLJIT_MEM1(SLJIT_R1), 5 * sizeof(sljit_sw));
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_R1), 6 * sizeof(sljit_sw));
+	sljit_emit_fcopy(compiler, SLJIT_COPY_TO_F64, SLJIT_FR0, SLJIT_REG_PAIR(SLJIT_R0, SLJIT_R3));
+	/* dbuf[1]: double assembled from the preset ibuf[5] (in R3) and ibuf[6] (in R0); expected -543210987654321.0. */
+	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_f64), SLJIT_FR0, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, 0);
+	sljit_emit_fcopy(compiler, SLJIT_COPY_TO_F64, SLJIT_FR3, SLJIT_REG_PAIR(SLJIT_R2, SLJIT_R2));
+	/* dbuf[2]: the same zero-valued register supplies both halves of the pair; expected 0.0. */
+	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), 2 * sizeof(sljit_f64), SLJIT_FR3, 0);
+
+	sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, (sljit_sw)0xc00c0000);
+	sljit_emit_fcopy(compiler, SLJIT_COPY_TO_F64, SLJIT_FR3, SLJIT_R2);
+	sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_MEM1(SLJIT_S0), 3 * sizeof(sljit_f64), SLJIT_FR3, 0); /* dbuf[3]: single source register 0xc00c0000 into FR3 (which held 0.0 from the previous step); expected -3.5. */
+#endif /* SLJIT_64BIT_ARCHITECTURE */
+
+	sljit_emit_return_void(compiler);
+
+	code.code = sljit_generate_code(compiler);
+	CHECK(compiler);
+	sljit_free_compiler(compiler);
+
+	code.func2((sljit_sw)dbuf, (sljit_sw)sbuf);
+
+	FAILED(ibuf[0] != (sljit_s32)0x4640e400, "test88 case 1 failed\n");
+	FAILED(sbuf[1] != -54321.0, "test88 case 2 failed\n");
+#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
+	FAILED(wbuf[0] != (sljit_sw)0x42dc12218377de40, "test88 case 3 failed\n");
+	FAILED(dbuf[1] != -543210987654321.0, "test88 case 4 failed\n");
+	FAILED(dbuf[2] != 0.0, "test88 case 5 failed\n");
+#else /* !SLJIT_64BIT_ARCHITECTURE */
+	FAILED(ibuf[2] != (sljit_sw)0x8377de40, "test88 case 3 failed\n");
+	FAILED(ibuf[3] != (sljit_sw)0x42dc1221, "test88 case 4 failed\n");
+	FAILED(ibuf[4] != (sljit_sw)0x42dc1221, "test88 case 5 failed\n");
+	FAILED(dbuf[1] != -543210987654321.0, "test88 case 6 failed\n");
+	FAILED(dbuf[2] != 0.0, "test88 case 7 failed\n");
+	FAILED(dbuf[3] != -3.5, "test88 case 8 failed\n");
+#endif /* SLJIT_64BIT_ARCHITECTURE */
+
+	sljit_free_code(code.code, NULL);
 	successful_tests++;
 }
 
@@ -9279,12 +11190,21 @@ int sljit_test(int argc, char* argv[])
 	test77();
 	test78();
 	test79();
+	test80();
+	test81();
+	test82();
+	test83();
+	test84();
+	test85();
+	test86();
+	test87();
+	test88();
 
 #if (defined SLJIT_EXECUTABLE_ALLOCATOR && SLJIT_EXECUTABLE_ALLOCATOR)
 	sljit_free_unused_memory_exec();
 #endif
 
-#	define TEST_COUNT 79
+#	define TEST_COUNT 88
 
 	printf("SLJIT tests: ");
 	if (successful_tests == TEST_COUNT)